Merge tag 'v2.0.1'

libaom 2.0.1

2020-11-25 v2.0.1
  This release includes two bug fixes.

  - Bug fixes:
    Issue 2723: Fix crash in chroma_check() when generating a monochrome
    encoded stream in real-time mode.

    Issue 2833: Fix crash on some input when reduced still picture header is
    used in real-time mode and speed >=7.

BUG=aomedia:2881
Change-Id: I1c3648718a645401cd30445b5c5609e1865ac37d
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ef0863..af89576 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,18 @@
 # can obtain it at www.aomedia.org/license/patent.
 #
 cmake_minimum_required(VERSION 3.5)
+
+set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
+set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
+  message(
+    FATAL_ERROR "Building from within the aom source tree is not supported.\n"
+                "Hint: Run these commands\n"
+                "$ rm -rf CMakeCache.txt CMakeFiles\n"
+                "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n"
+                "And re-run CMake from the aom_build directory.")
+endif()
+
 project(AOM C CXX)
 
 if(NOT EMSCRIPTEN)
@@ -20,18 +32,6 @@
   endif()
 endif()
 
-set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
-set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-
-if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
-  message(
-    FATAL_ERROR "Building from within the aom source tree is not supported.\n"
-                "Hint: Run these commands\n"
-                "$ rm -rf CMakeCache.txt CMakeFiles\n"
-                "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n"
-                "And re-run CMake from the aom_build directory.")
-endif()
-
 # Updating version info.
 # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
 set(SO_VERSION 2)
@@ -432,17 +432,47 @@
   list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS}
               ${AOM_ENCODER_TOOL_TARGETS})
 
+  if(CONFIG_USE_VMAF_RC AND NOT CONFIG_TUNE_VMAF)
+    message(FATAL_ERROR "Turn on CONFIG_TUNE_VMAF to use CONFIG_USE_VMAF_RC.")
+  endif()
+
   if(CONFIG_TUNE_VMAF)
-    find_library(VMAF libvmaf.a vmaf)
-    if(NOT VMAF)
-      message(FATAL_ERROR "VMAF library not found.")
+    find_package(PkgConfig)
+    if(CONFIG_USE_VMAF_RC)
+      if(PKG_CONFIG_FOUND)
+        pkg_check_modules(VMAF_RC REQUIRED libvmaf_rc)
+        target_link_libraries(aom
+                              PRIVATE ${VMAF_RC_LDFLAGS} ${VMAF_RC_LIBRARIES})
+        target_include_directories(aom PRIVATE ${VMAF_RC_INCLUDE_DIRS})
+        if(VMAF_RC_CFLAGS)
+          append_compiler_flag("${VMAF_RC_CFLAGS}")
+        endif()
+      else()
+        message(FATAL_ERROR "CONFIG_USE_VMAF_RC error: pkg-config not found.")
+      endif()
+    else()
+      if(PKG_CONFIG_FOUND)
+        pkg_check_modules(VMAF REQUIRED libvmaf)
+      else()
+        find_library(VMAF_LIBRARIES vmaf)
+        find_path(VMAF_INCLUDE_DIRS libvmaf.h PATH_SUFFIXES libvmaf)
+        if(VMAF_LIBRARIES AND VMAF_INCLUDE_DIRS)
+          message(STATUS "Found VMAF library: ${VMAF_LIBRARIES}")
+          message(STATUS "Found VMAF include: ${VMAF_INCLUDE_DIRS}")
+        else()
+          message(FATAL_ERROR "VMAF library not found.")
+        endif()
+      endif()
+      target_link_libraries(aom PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES})
+      target_include_directories(aom PRIVATE ${VMAF_INCLUDE_DIRS})
+      if(VMAF_CFLAGS)
+        append_compiler_flag("${VMAF_CFLAGS}")
+      endif()
     endif()
-    message("-- Found VMAF library: " ${VMAF})
     set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX)
     if(BUILD_SHARED_LIBS)
       set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX)
     endif()
-    target_link_libraries(aom PRIVATE ${VMAF})
   endif()
 endif()
 
@@ -630,13 +660,38 @@
 set_user_flags()
 
 # Aomedia documentation rule.
+set(DOXYGEN_VERSION_VALUE 0)
 if(ENABLE_DOCS)
   include(FindDoxygen)
   if(DOXYGEN_FOUND)
+    # Check if Doxygen version is >= minimum required version(i.e. 1.8.10).
+    set(MINIMUM_DOXYGEN_VERSION 1008010)
+
+    if(DOXYGEN_VERSION)
+      string(REGEX REPLACE "\\." ";" DOXYGEN_VERSION_LIST ${DOXYGEN_VERSION})
+      list(GET DOXYGEN_VERSION_LIST 0 DOXYGEN_MAJOR)
+      list(GET DOXYGEN_VERSION_LIST 1 DOXYGEN_MINOR)
+      list(GET DOXYGEN_VERSION_LIST 2 DOXYGEN_PATCH)
+    endif()
+
+    # Construct a version value for comparison.
+    math(EXPR DOXYGEN_MAJOR "${DOXYGEN_MAJOR}*1000000")
+    math(EXPR DOXYGEN_MINOR "${DOXYGEN_MINOR}*1000")
+    math(EXPR DOXYGEN_VERSION_VALUE
+         "${DOXYGEN_MAJOR} + ${DOXYGEN_MINOR} + ${DOXYGEN_PATCH}")
+
+    if(${DOXYGEN_VERSION_VALUE} LESS ${MINIMUM_DOXYGEN_VERSION})
+      set(DOXYGEN_FOUND NO)
+    endif()
+  endif()
+
+  if(DOXYGEN_FOUND)
     include("${AOM_ROOT}/docs.cmake")
     setup_documentation_targets()
   else()
-    message("--- Cannot find doxygen, ENABLE_DOCS turned off.")
+    message(
+      "--- Cannot find doxygen(version 1.8.10 or newer), ENABLE_DOCS turned off."
+      )
     set(ENABLE_DOCS OFF)
   endif()
 endif()
diff --git a/README.md b/README.md
index cf057ae..a458f52 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+README.md                {#LREADME}
+=========
 # AV1 Codec Library
 
 ## Contents
@@ -40,9 +42,9 @@
 5. [Support](#support)
 6. [Bug reports](#bug-reports)
 
-## Building the library and applications
+## Building the library and applications {#building-the-library-and-applications}
 
-### Prerequisites
+### Prerequisites {#prerequisites}
 
  1. [CMake](https://cmake.org) version 3.5 or higher.
  2. [Git](https://git-scm.com/).
@@ -51,12 +53,13 @@
     recent version of [nasm](http://www.nasm.us/). If you download yasm with
     the intention to work with Visual Studio, please download win32.exe or
     win64.exe and rename it into yasm.exe. DO NOT download or use vsyasm.exe.
- 5. Building the documentation requires [doxygen](http://doxygen.org).
+ 5. Building the documentation requires
+   [doxygen version 1.8.10 or newer](http://doxygen.org).
  6. Building the unit tests requires [Python](https://www.python.org/).
  7. Emscripten builds require the portable
    [EMSDK](https://kripken.github.io/emscripten-site/index.html).
 
-### Get the code
+### Get the code {#get-the-code}
 
 The AV1 library source code is stored in the Alliance for Open Media Git
 repository:
@@ -67,7 +70,7 @@
     $ cd aom
 ~~~
 
-### Basic build
+### Basic build {#basic-build}
 
 CMake replaces the configure step typical of many projects. Running CMake will
 produce configuration and build files for the currently selected CMake
@@ -85,7 +88,7 @@
 applies: On systems where cc and c++ are present in $PATH at the time CMake is
 run the generated build will use cc and c++ by default.
 
-### Configuration options
+### Configuration options {#configuration-options}
 
 The AV1 codec library has a great many configuration options. These come in two
 varieties:
@@ -106,7 +109,7 @@
 in the root of the AV1 repository, and AV1 codec configuration options can
 currently be found in the file `build/cmake/aom_config_defaults.cmake`.
 
-### Dylib builds
+### Dylib builds {#dylib-builds}
 
 A dylib (shared object) build of the AV1 codec library can be enabled via the
 CMake built in variable `BUILD_SHARED_LIBS`:
@@ -118,7 +121,7 @@
 
 This is currently only supported on non-Windows targets.
 
-### Debugging
+### Debugging {#debugging}
 
 Depending on the generator used there are multiple ways of going about
 debugging AV1 components. For single configuration generators like the Unix
@@ -147,7 +150,7 @@
     $ cmake path/to/aom -DAOM_TARGET_CPU=generic
 ~~~
 
-### Cross compiling
+### Cross compiling {#cross-compiling}
 
 For the purposes of building the AV1 codec and applications and relative to the
 scope of this guide, all builds for architectures differing from the native host
@@ -197,7 +200,7 @@
 suffixed with gcc behave differently than the others. These toolchain files
 attempt to obey the $CROSS environment variable.
 
-### Sanitizers
+### Sanitizers {#sanitizers}
 
 Sanitizer integration is built-in to the CMake build system. To enable a
 sanitizer, add `-DSANITIZE=<type>` to the CMake command line. For example, to
@@ -211,7 +214,7 @@
 Sanitizers available vary by platform, target, and compiler. Consult your
 compiler documentation to determine which, if any, are available.
 
-### Microsoft Visual Studio builds
+### Microsoft Visual Studio builds {#microsoft-visual-studio-builds}
 
 Building the AV1 codec library in Microsoft Visual Studio is supported. Visual
 Studio 2017 (15.0) or later is required. The following example demonstrates
@@ -241,7 +244,7 @@
 NOTE: The build system targets Windows 7 or later by compiling files with
 `-D_WIN32_WINNT=0x0601`.
 
-### Xcode builds
+### Xcode builds {#xcode-builds}
 
 Building the AV1 codec library in Xcode is supported. The following example
 demonstrates generating an Xcode project:
@@ -250,7 +253,7 @@
     $ cmake path/to/aom -G Xcode
 ~~~
 
-### Emscripten builds
+### Emscripten builds {#emscripten-builds}
 
 Building the AV1 codec library with Emscripten is supported. Typically this is
 used to hook into the AOMAnalyzer GUI application. These instructions focus on
@@ -293,7 +296,7 @@
     $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file
 ~~~
 
-### Extra build flags
+### Extra build flags {#extra-build-flags}
 
 Three variables allow for passing of additional flags to the build system.
 
@@ -312,10 +315,10 @@
         -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG
 ~~~
 
-### Build with VMAF support
+### Build with VMAF support {#build-with-vmaf}
 
 After installing
-[libvmaf.a](https://github.com/Netflix/vmaf/blob/master/resource/doc/libvmaf.md),
+[libvmaf.a](https://github.com/Netflix/vmaf/tree/master/libvmaf),
 you can use it with the encoder:
 
 ~~~
@@ -330,15 +333,15 @@
     # --vmaf-model-path=path/to/model
 ~~~
 
-## Testing the AV1 codec
+## Testing the AV1 codec {#testing-the-av1-codec}
 
-### Testing basics
+### Testing basics {#testing-basics}
 
 There are several methods of testing the AV1 codec. All of these methods require
 the presence of the AV1 source code and a working build of the AV1 library and
 applications.
 
-#### 1. Unit tests:
+#### 1. Unit tests: {#1_unit-tests}
 
 The unit tests can be run at build time:
 
@@ -352,7 +355,7 @@
     $ make runtests
 ~~~
 
-#### 2. Example tests:
+#### 2. Example tests: {#2_example-tests}
 
 The example tests require a bash shell and can be run in the following manner:
 
@@ -367,7 +370,7 @@
     $ path/to/aom/test/examples.sh --bin-path examples
 ~~~
 
-#### 3. Encoder tests:
+#### 3. Encoder tests: {#3_encoder-tests}
 
 When making a change to the encoder run encoder tests to confirm that your
 change has a positive or negligible impact on encode quality. When running these
@@ -418,7 +421,7 @@
 You can view the report by opening mytweak.html in a web browser.
 
 
-### IDE hosted tests
+### IDE hosted tests {#ide-hosted-tests}
 
 By default the generated projects files created by CMake will not include the
 runtests and testdata rules when generating for IDEs like Microsoft Visual
@@ -434,7 +437,7 @@
     $ cmake path/to/aom -DENABLE_IDE_TEST_HOSTING=1 -G Xcode
 ~~~
 
-### Downloading the test data
+### Downloading the test data {#downloading-the-test-data}
 
 The fastest and easiest way to obtain the test data is to use CMake to generate
 a build using the Unix Makefiles generator, and then to build only the testdata
@@ -448,7 +451,7 @@
 
 The above make command will only download and verify the test data.
 
-### Adding a new test data file
+### Adding a new test data file {#adding-a-new-test-data-file}
 
 First, add the new test data file to the `aom-test-data` bucket of the
 `aomedia-testing` project on Google Cloud Platform. You may need to ask someone
@@ -470,19 +473,19 @@
 checksum of a file can be calculated by running the `sha1sum` command on the
 file.)
 
-### Additional test data
+### Additional test data {#additional-test-data}
 
 The test data mentioned above is strictly intended for unit testing.
 
 Additional input data for testing the encoder can be obtained from:
 https://media.xiph.org/video/derf/
 
-### Sharded testing
+### Sharded testing {#sharded-testing}
 
 The AV1 codec library unit tests are built upon gtest which supports sharding of
 test jobs. Sharded test runs can be achieved in a couple of ways.
 
-#### 1. Running test\_libaom directly:
+#### 1. Running test\_libaom directly: {#1_running-test_libaom-directly}
 
 ~~~
    # Set the environment variable GTEST_TOTAL_SHARDS to control the number of
@@ -496,7 +499,7 @@
 To create a test shard for each CPU core available on the current system set
 `GTEST_TOTAL_SHARDS` to the number of CPU cores on your system minus one.
 
-#### 2. Running the tests via the CMake build:
+#### 2. Running the tests via the CMake build: {#2_running-the-tests-via-the-cmake-build}
 
 ~~~
     # For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See
@@ -515,7 +518,7 @@
 the `-j` parameter. When CMake is unable to detect the number of cores 10 shards
 is the default maximum value.
 
-## Coding style
+## Coding style {#coding-style}
 
 We are using the Google C Coding Style defined by the
 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
@@ -556,27 +559,27 @@
     $ git clang-format -f -p
 ~~~
 
-## Submitting patches
+## Submitting patches {#submitting-patches}
 
 We manage the submission of patches using the
 [Gerrit](https://www.gerritcodereview.com/) code review tool. This tool
 implements a workflow on top of the Git version control system to ensure that
 all changes get peer reviewed and tested prior to their distribution.
 
-### Login cookie
+### Login cookie {#login-cookie}
 
 Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and login with
 your account (Gmail credentials, for example). Next, follow the
 `Generate Password` Password link at the top of the page. You’ll be given
 instructions for creating a cookie to use with our Git repos.
 
-### Contributor agreement
+### Contributor agreement {#contributor-agreement}
 
 You will be required to execute a
 [contributor agreement](http://aomedia.org/license) to ensure that the AOMedia
 Project has the right to distribute your changes.
 
-### Testing your code
+### Testing your code {#testing-your-code}
 
 The testing basics are covered in the [testing section](#testing-the-av1-codec)
 above.
@@ -584,7 +587,7 @@
 In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run
 through Jenkins instances upon upload to gerrit.
 
-### Commit message hook
+### Commit message hook {#commit-message-hook}
 
 Gerrit requires that each submission include a unique Change-Id. You can assign
 one manually using git commit --amend, but it’s easier to automate it with the
@@ -604,7 +607,7 @@
 [documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html)
 for more information.
 
-### Upload your change
+### Upload your change {#upload-your-change}
 
 The command line to upload your patch looks like this:
 
@@ -612,7 +615,7 @@
     $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/master
 ~~~
 
-### Incorporating reviewer comments
+### Incorporating reviewer comments {#incorporating-reviewer-comments}
 
 If you previously uploaded a change to Gerrit and the Approver has asked for
 changes, follow these steps:
@@ -631,7 +634,7 @@
 review. Doing so can make it harder to follow the evolution of your change in
 the diff view.
 
-### Submitting your change
+### Submitting your change {#submitting-your-change}
 
 Once your change has been Approved and Verified, you can “submit” it through the
 Gerrit UI. This will usually automatically rebase your change onto the branch
@@ -648,18 +651,18 @@
 If there are any conflicts, resolve them as you normally would with Git. When
 you’re done, reupload your change.
 
-### Viewing the status of uploaded changes
+### Viewing the status of uploaded changes {#viewing-the-status-of-uploaded-changes}
 
 To check the status of a change that you uploaded, open
 [Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My >
 Changes.
 
-## Support
+## Support {#support}
 
 This library is an open source project supported by its community. Please
 please email aomediacodec@jointdevelopment.kavi.com for help.
 
-## Bug reports
+## Bug reports {#bug-reports}
 
 Bug reports can be filed in the Alliance for Open Media
 [issue tracker](https://bugs.chromium.org/p/aomedia/issues/list).
diff --git a/aom/aom_codec.h b/aom/aom_codec.h
index 75f6a1a..6e4208d 100644
--- a/aom/aom_codec.h
+++ b/aom/aom_codec.h
@@ -9,6 +9,57 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+///////////////////////////////////////////////////////////////////////////////
+// Internal implementation details
+///////////////////////////////////////////////////////////////////////////////
+//
+// There are two levels of interfaces used to access the AOM codec: the
+// the aom_codec_iface and the aom_codec_ctx.
+//
+// 1. aom_codec_iface_t
+//    (Related files: aom/aom_codec.h, aom/src/aom_codec.c,
+//    aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c,
+//    av1/av1_dx_iface.c)
+//
+// Used to initialize the codec context, which contains the configuration for
+// for modifying the encoder/decoder during run-time. See the other
+// documentation in this header file for more details. For the most part,
+// users will call helper functions, such as aom_codec_iface_name,
+// aom_codec_get_caps, etc., to interact with it.
+//
+// The main purpose of the aom_codec_iface_t is to provide a way to generate
+// a default codec config, find out what capabilities the implementation has,
+// and create an aom_codec_ctx_t (which is actually used to interact with the
+// codec).
+//
+// Note that the implementations for the AV1 algorithm are located in
+// av1/av1_cx_iface.c and av1/av1_dx_iface.c
+//
+//
+// 2. aom_codec_ctx_t
+//  (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c,
+//   aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c)
+//
+// The actual interface between user code and the codec. It stores the name
+// of the codec, a pointer back to the aom_codec_iface_t that initialized it,
+// initialization flags, a config for either encoder or the decoder, and a
+// pointer to internal data.
+//
+// The codec is configured / queried through calls to aom_codec_control,
+// which takes a control ID (listed in aomcx.h and aomdx.h) and a parameter.
+// In the case of "getter" control IDs, the parameter is modified to have
+// the requested value; in the case of "setter" control IDs, the codec's
+// configuration is changed based on the parameter. Note that a aom_codec_err_t
+// is returned, which indicates if the operation was successful or not.
+//
+// Note that for the encoder, the aom_codec_alg_priv_t points to the
+// the aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder,
+// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored
+// here and also used in the core algorithm.
+//
+// At the end, aom_codec_destroy should be called for each initialized
+// aom_codec_ctx_t.
+
 /*!\defgroup codec Common Algorithm Interface
  * This abstraction allows applications to easily support multiple video
  * formats with minimal code duplication. This section describes the interface
@@ -23,13 +74,16 @@
  * video codec algorithm.
  *
  * An application instantiates a specific codec instance by using
- * aom_codec_init() and a pointer to the algorithm's interface structure:
+ * aom_codec_dec_init() or aom_codec_enc_init() and a pointer to the
+ * algorithm's interface structure:
  *     <pre>
  *     my_app.c:
  *       extern aom_codec_iface_t my_codec;
  *       {
  *           aom_codec_ctx_t algo;
- *           res = aom_codec_init(&algo, &my_codec);
+ *           int threads = 4;
+ *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     </pre>
  *
@@ -185,13 +239,17 @@
  * Contains function pointers and other data private to the codec
  * implementation. This structure is opaque to the application. Common
  * functions used with this structure:
- *   - aom_codec_iface_name: get the name of the codec
- *   - aom_codec_get_caps: returns the capabilities of the codec (see
- *     aom_encoder.h for more details)
- *   - aom_codec_enc_config_default: generate the default config to use
- *     when initializing the encoder
+ *   - aom_codec_iface_name(aom_codec_iface_t *iface): get the
+ *     name of the codec
+ *   - aom_codec_get_caps(aom_codec_iface_t *iface): returns
+ *     the capabilities of the codec
+ *   - aom_codec_enc_config_default: generate the default config for
+ *     initializing the encoder (see documention in aom_encoder.h)
  *   - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context
- *     structure (see documentation on aom_codec_ctx for more information).
+ *     structure (see documentation on aom_codec_ctx).
+ *
+ * To get access to the AV1 encoder and decoder, use aom_codec_av1_cx() and
+ *  aom_codec_av1_dx().
  */
 typedef const struct aom_codec_iface aom_codec_iface_t;
 
@@ -202,6 +260,27 @@
  */
 typedef struct aom_codec_priv aom_codec_priv_t;
 
+/*!\brief Compressed Frame Flags
+ *
+ * This type represents a bitfield containing information about a compressed
+ * frame that may be useful to an application. The most significant 16 bits
+ * can be used by an algorithm to provide additional detail, for example to
+ * support frame types that are codec specific (MPEG-1 D-frames for example)
+ */
+typedef uint32_t aom_codec_frame_flags_t;
+#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+/*!\brief frame can be dropped without affecting the stream (no future frame
+ * depends on this one) */
+#define AOM_FRAME_IS_DROPPABLE 0x2
+/*!\brief this is an INTRA_ONLY frame */
+#define AOM_FRAME_IS_INTRAONLY 0x10
+/*!\brief this is an S-frame */
+#define AOM_FRAME_IS_SWITCH 0x20
+/*!\brief this is an error-resilient frame */
+#define AOM_FRAME_IS_ERROR_RESILIENT 0x40
+/*!\brief this is a key-frame dependent recovery-point frame */
+#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80
+
 /*!\brief Iterator
  *
  * Opaque storage used for iterating over lists.
@@ -266,31 +345,27 @@
 /*!\brief Return the version information (as an integer)
  *
  * Returns a packed encoding of the library version number. This will only
- * include
- * the major.minor.patch component of the version number. Note that this encoded
- * value should be accessed through the macros provided, as the encoding may
- * change
- * in the future.
+ * include the major.minor.patch component of the version number. Note that this
+ * encoded value should be accessed through the macros provided, as the encoding
+ * may change in the future.
  *
  */
 int aom_codec_version(void);
 
-/*!\brief Return the version major number */
+/*!\brief Return the major version number */
 #define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff)
 
-/*!\brief Return the version minor number */
+/*!\brief Return the minor version number */
 #define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff)
 
-/*!\brief Return the version patch number */
+/*!\brief Return the patch version number */
 #define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff)
 
 /*!\brief Return the version information (as a string)
  *
  * Returns a printable string containing the full library version number. This
- * may
- * contain additional text following the three digit version number, as to
- * indicate
- * release candidates, prerelease versions, etc.
+ * may contain additional text following the three digit version number, as to
+ * indicate release candidates, prerelease versions, etc.
  *
  */
 const char *aom_codec_version_str(void);
@@ -298,8 +373,7 @@
 /*!\brief Return the version information (as a string)
  *
  * Returns a printable "extra string". This is the component of the string
- * returned
- * by aom_codec_version_str() following the three digit version number.
+ * returned by aom_codec_version_str() following the three digit version number.
  *
  */
 const char *aom_codec_version_extra_str(void);
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index a494c17..ab19f4e 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -78,27 +78,6 @@
   size_t sz;       /**< Length of the buffer, in chars */
 } aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */
 
-/*!\brief Compressed Frame Flags
- *
- * This type represents a bitfield containing information about a compressed
- * frame that may be useful to an application. The most significant 16 bits
- * can be used by an algorithm to provide additional detail, for example to
- * support frame types that are codec specific (MPEG-1 D-frames for example)
- */
-typedef uint32_t aom_codec_frame_flags_t;
-#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
-/*!\brief frame can be dropped without affecting the stream (no future frame
- * depends on this one) */
-#define AOM_FRAME_IS_DROPPABLE 0x2
-/*!\brief this is an INTRA_ONLY frame */
-#define AOM_FRAME_IS_INTRAONLY 0x10
-/*!\brief this is an S-frame */
-#define AOM_FRAME_IS_SWITCH 0x20
-/*!\brief this is an error-resilient frame */
-#define AOM_FRAME_IS_ERROR_RESILIENT 0x40
-/*!\brief this is a key-frame dependent recovery-point frame */
-#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80
-
 /*!\brief Error Resilient flags
  *
  * These flags define which error resilient features to enable in the
@@ -152,8 +131,17 @@
       unsigned int samples[4]; /**< Number of samples, total/y/u/v */
       uint64_t sse[4];         /**< sum squared error, total/y/u/v */
       double psnr[4];          /**< PSNR, total/y/u/v */
-    } psnr;                    /**< data for PSNR packet */
-    aom_fixed_buf_t raw;       /**< data for arbitrary packets */
+      /*!\brief Number of samples, total/y/u/v when
+       * input bit-depth < stream bit-depth.*/
+      unsigned int samples_hbd[4];
+      /*!\brief sum squared error, total/y/u/v when
+       * input bit-depth < stream bit-depth.*/
+      uint64_t sse_hbd[4];
+      /*!\brief PSNR, total/y/u/v when
+       * input bit-depth < stream bit-depth.*/
+      double psnr_hbd[4];
+    } psnr;              /**< data for PSNR packet */
+    aom_fixed_buf_t raw; /**< data for arbitrary packets */
 
     /* This packet size is fixed to allow codecs to extend this
      * interface without having to manage storage for raw packets,
@@ -202,6 +190,22 @@
   AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
 };
 
+/*!\brief Frame super-resolution mode. */
+typedef enum {
+  /**< Frame super-resolution is disabled for all frames. */
+  AOM_SUPERRES_NONE,
+  /**< All frames are coded at the specified scale and super-resolved. */
+  AOM_SUPERRES_FIXED,
+  /**< All frames are coded at a random scale and super-resolved. */
+  AOM_SUPERRES_RANDOM,
+  /**< Super-resolution scale for each frame is determined based on the q index
+     of that frame. */
+  AOM_SUPERRES_QTHRESH,
+  /**< Full-resolution or super-resolution and the scale (in case of
+     super-resolution) are automatically selected for each frame. */
+  AOM_SUPERRES_AUTO,
+} aom_superres_mode;
+
 /*!\brief Encoder Config Options
  *
  * This type allows to enumerate and control flags defined for encoder control
@@ -358,7 +362,8 @@
  * /algo/_eflag_*. The lower order 16 bits are reserved for common use.
  */
 typedef long aom_enc_frame_flags_t;
-#define AOM_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */
+/*!\brief Force this frame to be a keyframe */
+#define AOM_EFLAG_FORCE_KF (1 << 0)
 
 /*!\brief Encoder configuration structure
  *
@@ -546,10 +551,8 @@
    * Similar to spatial resampling, frame super-resolution integrates
    * upscaling after the encode/decode process. Taking control of upscaling and
    * using restoration filters should allow it to outperform normal resizing.
-   *
-   * Valid values are 0 to 4 as defined in enum SUPERRES_MODE.
    */
-  unsigned int rc_superres_mode;
+  aom_superres_mode rc_superres_mode;
 
   /*!\brief Frame super-resolution denominator.
    *
@@ -559,7 +562,7 @@
    *
    * Valid denominators are 8 to 16.
    *
-   * Used only by SUPERRES_FIXED.
+   * Used only by AOM_SUPERRES_FIXED.
    */
   unsigned int rc_superres_denominator;
 
@@ -578,7 +581,7 @@
    * The q level threshold after which superres is used.
    * Valid values are 1 to 63.
    *
-   * Used only by SUPERRES_QTHRESH
+   * Used only by AOM_SUPERRES_QTHRESH
    */
   unsigned int rc_superres_qthresh;
 
@@ -587,7 +590,7 @@
    * The q level threshold after which superres is used for key frames.
    * Valid values are 1 to 63.
    *
-   * Used only by SUPERRES_QTHRESH
+   * Used only by AOM_SUPERRES_QTHRESH
    */
   unsigned int rc_superres_kf_qthresh;
 
@@ -651,23 +654,17 @@
 
   /*!\brief Rate control adaptation undershoot control
    *
-   * This value, expressed as a percentage of the target bitrate,
-   * controls the maximum allowed adaptation speed of the codec.
-   * This factor controls the maximum amount of bits that can
-   * be subtracted from the target bitrate in order to compensate
-   * for prior overshoot.
+   * This value, controls the tolerance of the VBR algorithm to undershoot
+   * and is used as a trigger threshold for more agressive adaptation of Q.
    *
-   * Valid values in the range 0-1000.
+   * Valid values in the range 0-100.
    */
   unsigned int rc_undershoot_pct;
 
   /*!\brief Rate control adaptation overshoot control
    *
-   * This value, expressed as a percentage of the target bitrate,
-   * controls the maximum allowed adaptation speed of the codec.
-   * This factor controls the maximum amount of bits that can
-   * be added to the target bitrate in order to compensate for
-   * prior undershoot.
+   * This value, controls the tolerance of the VBR algorithm to overshoot
+   * and is used as a trigger threshold for more agressive adaptation of Q.
    *
    * Valid values in the range 0-1000.
    */
@@ -1019,15 +1016,18 @@
  * time stamp (PTS) \ref MUST be strictly increasing.
  *
  * When the last frame has been passed to the encoder, this function should
- * continue to be called, with the img parameter set to NULL. This will
- * signal the end-of-stream condition to the encoder and allow it to encode
- * any held buffers. Encoding is complete when aom_codec_encode() is called
- * and aom_codec_get_cx_data() returns no data.
+ * continue to be called in a loop, with the img parameter set to NULL. This
+ * will signal the end-of-stream condition to the encoder and allow it to
+ * encode any held buffers. Encoding is complete when aom_codec_encode() is
+ * called with img set to NULL and aom_codec_get_cx_data() returns no data.
  *
  * \param[in]    ctx       Pointer to this instance's context
  * \param[in]    img       Image data to encode, NULL to flush.
- * \param[in]    pts       Presentation time stamp, in timebase units.
- * \param[in]    duration  Duration to show frame, in timebase units.
+ * \param[in]    pts       Presentation time stamp, in timebase units. If img
+ *                         is NULL, pts is ignored.
+ * \param[in]    duration  Duration to show frame, in timebase units. If img
+ *                         is not NULL, duration must be nonzero. If img is
+ *                         NULL, duration is ignored.
  * \param[in]    flags     Flags to use for encoding this frame.
  *
  * \retval #AOM_CODEC_OK
diff --git a/aom/aom_image.h b/aom/aom_image.h
index bb6973f..03bc73e 100644
--- a/aom/aom_image.h
+++ b/aom/aom_image.h
@@ -360,6 +360,9 @@
  * \param[in]    data         Metadata contents
  * \param[in]    sz           Metadata contents size
  * \param[in]    insert_flag  Metadata insert flag
+ *
+ * \return Returns 0 on success. If img or data is NULL, sz is 0, or memory
+ * allocation fails, it returns -1.
  */
 int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
                          size_t sz, aom_metadata_insert_flags_t insert_flag);
@@ -410,6 +413,9 @@
  * \param[in]    data         Metadata data pointer
  * \param[in]    sz           Metadata size
  * \param[in]    insert_flag  Metadata insert flag
+ *
+ * \return Returns the newly allocated aom_metadata struct. If data is NULL,
+ * sz is 0, or memory allocation fails, it returns NULL.
  */
 aom_metadata_t *aom_img_metadata_alloc(uint32_t type, const uint8_t *data,
                                        size_t sz,
diff --git a/aom/aom_integer.h b/aom/aom_integer.h
index 113671e..d9bba09 100644
--- a/aom/aom_integer.h
+++ b/aom/aom_integer.h
@@ -22,22 +22,7 @@
 #define AOM_INLINE inline
 #endif
 
-#if defined(AOM_EMULATE_INTTYPES)
-typedef signed char int8_t;
-typedef signed short int16_t;
-typedef signed int int32_t;
-
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
-#ifndef _UINTPTR_T_DEFINED
-typedef size_t uintptr_t;
-#endif
-
-#else
-
-/* Most platforms have the C99 standard integer types. */
+/* Assume platforms have the C99 standard integer types. */
 
 #if defined(__cplusplus)
 #if !defined(__STDC_FORMAT_MACROS)
@@ -49,27 +34,7 @@
 #endif  // __cplusplus
 
 #include <stdint.h>
-
-#endif
-
-/* VS2010 defines stdint.h, but not inttypes.h */
-#if defined(_MSC_VER) && _MSC_VER < 1800
-#define PRId64 "I64d"
-#else
 #include <inttypes.h>
-#endif
-
-#if !defined(INT8_MAX)
-#define INT8_MAX 127
-#endif
-
-#if !defined(INT32_MAX)
-#define INT32_MAX 2147483647
-#endif
-
-#if !defined(INT32_MIN)
-#define INT32_MIN (-2147483647 - 1)
-#endif
 
 #if defined(__cplusplus)
 extern "C" {
diff --git a/aom/aomcx.h b/aom/aomcx.h
index 051d33e..33f4636 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -31,11 +31,19 @@
 /*!\name Algorithm interface for AV1
  *
  * This interface provides the capability to encode raw AV1 streams.
- * @{
+ *@{
+ */
+
+/*!\brief A single instance of the AV1 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer aom_codec_av1_cx().
  */
 extern aom_codec_iface_t aom_codec_av1_cx_algo;
+
+/*!\brief The interface to the AV1 encoder.
+ */
 extern aom_codec_iface_t *aom_codec_av1_cx(void);
-/*!@} - end algorithm interface member group*/
+/*!@} - end algorithm interface member group */
 
 /*
  * Algorithm Flags
@@ -201,7 +209,10 @@
 
   /* NOTE: enum 15 unused */
 
-  /*!\brief Codec control function to set sharpness, unsigned int parameter.
+  /*!\brief Codec control function to set loop filter sharpness,
+   * unsigned int parameter.
+   *
+   * Valid range: 0..7. The default is 0.
    */
   AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2,  // 16
 
@@ -382,7 +393,7 @@
    * AV1 has a bitstream feature to reduce decoding dependency between frames
    * by turning off backward update of probability context used in encoding
    * and decoding. This allows staged parallel processing of more than one
-   * video frames in the decoder. This control function provides a mean to
+   * video frames in the decoder. This control function provides a means to
    * turn this feature on or off for bitstreams produced by encoder.
    *
    * - 0 = disable (default)
@@ -418,10 +429,12 @@
    * AV1 has a segment based feature that allows encoder to adaptively change
    * quantization parameter for each segment within a frame to improve the
    * subjective quality. This control makes encoder operate in one of the
-   * several AQ_modes supported.
+   * several AQ modes supported.
    *
    * - 0 = disable (default)
-   * - 1 = enable
+   * - 1 = variance
+   * - 2 = complexity
+   * - 3 = cyclic refresh
    */
   AV1E_SET_AQ_MODE = 40,
 
@@ -429,7 +442,7 @@
    * int parameter
    *
    * One AV1 encoder speed feature is to enable quality boost by lowering
-   * frame level Q periodically. This control function provides a mean to
+   * frame level Q periodically. This control function provides a means to
    * turn on/off this feature.
    *
    * - 0 = disable (default)
@@ -847,7 +860,17 @@
    */
   AV1E_SET_ENABLE_FLIP_IDTX = 81,
 
-  /* Note: enum value 82 unused */
+  /*!\brief Codec control function to turn on / off rectangular transforms, int
+   * parameter
+   *
+   * This will enable or disable usage of rectangular transforms. NOTE:
+   * Rectangular transforms only enabled when corresponding rectangular
+   * partitions are.
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
+   */
+  AV1E_SET_ENABLE_RECT_TX = 82,
 
   /*!\brief Codec control function to turn on / off dist-wtd compound mode
    * at sequence level, int parameter
@@ -892,7 +915,7 @@
   AV1E_SET_ENABLE_DUAL_FILTER = 86,
 
   /*!\brief Codec control function to turn on / off delta quantization in chroma
-   * planes usage for a sequence, int parameter
+   * planes for a sequence, int parameter
    *
    * - 0 = disable (default)
    * - 1 = enable
@@ -985,9 +1008,6 @@
   /*!\brief Codec control function to turn on / off filter intra usage at
    * sequence level, int parameter
    *
-   * \attention If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is
-   * forced to 0.
-   *
    * - 0 = disable
    * - 1 = enable (default)
    */
@@ -1025,8 +1045,6 @@
   /*!\brief Codec control function to turn on / off frame superresolution, int
    * parameter
    *
-   * \attention If AV1E_SET_ENABLE_SUPERRES is 0, then this flag is forced to 0.
-   *
    * - 0 = disable
    * - 1 = enable (default)
    */
@@ -1256,6 +1274,15 @@
    * Valid values: 0..4
    */
   AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 156,
+
+  /*!\brief Control to set average complexity of the corpus in the case of
+   * single pass vbr based on LAP, unsigned int parameter
+   */
+  AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 157,
+
+  /*!\brief Control to get baseline gf interval
+   */
+  AV1E_GET_BASELINE_GF_INTERVAL = 158,
 };
 
 /*!\brief aom 1-D scaling mode
@@ -1266,7 +1293,10 @@
   AOME_NORMAL = 0,
   AOME_FOURFIVE = 1,
   AOME_THREEFIVE = 2,
-  AOME_ONETWO = 3
+  AOME_THREEFOUR = 3,
+  AOME_ONEFOUR = 4,
+  AOME_ONEEIGHT = 5,
+  AOME_ONETWO = 6
 } AOM_SCALING_MODE;
 
 /*!\brief Max number of segments
@@ -1344,7 +1374,8 @@
   /* NOTE: enums 2 and 3 unused */
   AOM_TUNE_VMAF_WITH_PREPROCESSING = 4,
   AOM_TUNE_VMAF_WITHOUT_PREPROCESSING = 5,
-  AOM_TUNE_VMAF_MAX_GAIN = 6
+  AOM_TUNE_VMAF_MAX_GAIN = 6,
+  AOM_TUNE_VMAF_NEG_MAX_GAIN = 7,
 } aom_tune_metric;
 
 #define AOM_MAX_LAYERS 32   /**< Max number of layers */
@@ -1543,6 +1574,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_TX, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RECT_TX
+
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_WTD_COMP, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_DIST_WTD_COMP
 
@@ -1675,6 +1709,9 @@
 AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *)
 #define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX
 
+AOM_CTRL_USE_TYPE(AV1E_GET_BASELINE_GF_INTERVAL, int *)
+#define AOM_CTRL_AV1E_GET_BASELINE_GF_INTERVAL
+
 AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
 #define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING
 
@@ -1765,6 +1802,9 @@
 AOM_CTRL_USE_TYPE(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, unsigned int)
 #define AOM_CTRL_AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST
 
+AOM_CTRL_USE_TYPE(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, unsigned int)
+#define AOM_CTRL_AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
diff --git a/aom/aomdx.h b/aom/aomdx.h
index 8cd5de3..aa4f435 100644
--- a/aom/aomdx.h
+++ b/aom/aomdx.h
@@ -33,9 +33,17 @@
  * This interface provides the capability to decode AV1 streams.
  * @{
  */
+
+/*!\brief A single instance of the AV1 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer aom_codec_av1_dx().
+ */
 extern aom_codec_iface_t aom_codec_av1_dx_algo;
+/*!\brief The interface to the AV1 decoder.
+ */
 extern aom_codec_iface_t *aom_codec_av1_dx(void);
-/*!@} - end algorithm interface member group*/
+
+/*!@} - end algorithm interface member group */
 
 /** Data structure that stores bit accounting for debug
  */
@@ -89,6 +97,81 @@
   size_t extra_size;
 } aom_tile_data;
 
+/*!\brief Max number of tile columns
+ *
+ * This is the limit of number of tile columns allowed within a frame.
+ *
+ * Currently same as "MAX_TILE_COLS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_TILE_COLS 64
+/*!\brief Max number of tile rows
+ *
+ * This is the limit of number of tile rows allowed within a frame.
+ *
+ * Currently same as "MAX_TILE_ROWS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_TILE_ROWS 64
+
+/*!\brief Structure to hold information about tiles in a frame.
+ *
+ * Defines a structure to hold a frame's tile information, namely
+ * number of tile columns, number of tile_rows, and the width and
+ * height of each tile.
+ */
+typedef struct aom_tile_info {
+  /*! Indicates the number of tile columns. */
+  int tile_columns;
+  /*! Indicates the number of tile rows. */
+  int tile_rows;
+  /*! Indicates the tile widths in units of SB. */
+  int tile_widths[AOM_MAX_TILE_COLS];
+  /*! Indicates the tile heights in units of SB. */
+  int tile_heights[AOM_MAX_TILE_ROWS];
+  /*! Indicates the number of tile groups present in a frame. */
+  int num_tile_groups;
+} aom_tile_info;
+
+/*!\brief Structure to hold information about still image coding.
+ *
+ * Defines a structure to hold a information regarding still picture
+ * and its header type.
+ */
+typedef struct aom_still_picture_info {
+  /*! Video is a single frame still picture */
+  int is_still_picture;
+  /*! Use full header for still picture */
+  int is_reduced_still_picture_hdr;
+} aom_still_picture_info;
+
+/*!\brief Structure to hold information about S_FRAME.
+ *
+ * Defines a structure to hold a information regarding S_FRAME
+ * and its position.
+ */
+typedef struct aom_s_frame_info {
+  /*! Indicates if current frame is S_FRAME */
+  int is_s_frame;
+  /*! Indicates if current S_FRAME is present at ALTREF frame*/
+  int is_s_frame_at_altref;
+} aom_s_frame_info;
+
+/*!\brief Structure to hold information about screen content tools.
+ *
+ * Defines a structure to hold information about screen content
+ * tools, namely: allow_screen_content_tools, allow_intrabc, and
+ * force_integer_mv.
+ */
+typedef struct aom_screen_content_tools_info {
+  /*! Are screen content tools allowed */
+  int allow_screen_content_tools;
+  /*! Is intrabc allowed */
+  int allow_intrabc;
+  /*! Is integer mv forced */
+  int force_integer_mv;
+} aom_screen_content_tools_info;
+
 /*!\brief Structure to hold the external reference frame pointer.
  *
  * Define a structure to hold the external reference frame pointer.
@@ -299,6 +382,50 @@
   AV1D_SET_SKIP_FILM_GRAIN,
 
   AOM_DECODER_CTRL_ID_MAX,
+
+  /*!\brief Codec control function to check the presence of forward key frames
+   */
+  AOMD_GET_FWD_KF_PRESENT,
+
+  /*!\brief Codec control function to get the frame flags of the previous frame
+   * decoded. This will return a flag of type aom_codec_frame_flags_t.
+   */
+  AOMD_GET_FRAME_FLAGS,
+
+  /*!\brief Codec control function to check the presence of altref frames */
+  AOMD_GET_ALTREF_PRESENT,
+
+  /*!\brief Codec control function to get tile information of the previous frame
+   * decoded. This will return a struct of type aom_tile_info.
+   */
+  AOMD_GET_TILE_INFO,
+
+  /*!\brief Codec control function to get screen content tools information.
+   * It returns a struct of type aom_screen_content_tools_info, which contains
+   * the header flags allow_screen_content_tools, allow_intrabc, and
+   * force_integer_mv.
+   */
+  AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+
+  /*!\brief Codec control function to get the still picture coding information
+   */
+  AOMD_GET_STILL_PICTURE,
+
+  /*!\brief Codec control function to get superblock size.
+   * It returns an integer, indicating the superblock size
+   * read from the sequence header(0 for BLOCK_64X64 and
+   * 1 for BLOCK_128X128)
+   */
+  AOMD_GET_SB_SIZE,
+
+  /*!\brief Codec control function to check if the previous frame
+   * decoded has show existing frame flag set.
+   */
+  AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+
+  /*!\brief Codec control function to get the S_FRAME coding information
+   */
+  AOMD_GET_S_FRAME_INFO,
 };
 
 /*!\cond */
@@ -325,6 +452,34 @@
 AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *)
 #define AOM_CTRL_AOMD_GET_LAST_QUANTIZER
 
+AOM_CTRL_USE_TYPE(AOMD_GET_FWD_KF_PRESENT, int *)
+#define AOM_CTRL_AOMD_GET_FWD_KF_PRESENT
+
+AOM_CTRL_USE_TYPE(AOMD_GET_ALTREF_PRESENT, int *)
+#define AOM_CTRL_AOMD_GET_ALTREF_PRESENT
+
+AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_FLAGS, int *)
+#define AOM_CTRL_AOMD_GET_FRAME_FLAGS
+
+AOM_CTRL_USE_TYPE(AOMD_GET_TILE_INFO, aom_tile_info *)
+#define AOM_CTRL_AOMD_GET_TILE_INFO
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+                  aom_screen_content_tools_info *)
+#define AOM_CTRL_AOMD_GET_SCREEN_CONTENT_TOOLS_INFO
+
+AOM_CTRL_USE_TYPE(AOMD_GET_STILL_PICTURE, aom_still_picture_info *)
+#define AOM_CTRL_AOMD_GET_STILL_PICTURE
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SB_SIZE, aom_superblock_size_t *)
+#define AOMD_CTRL_AOMD_GET_SB_SIZE
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_EXISTING_FRAME_FLAG, int *)
+#define AOMD_CTRL_AOMD_GET_SHOW_EXISTING_FRAME_FLAG
+
+AOM_CTRL_USE_TYPE(AOMD_GET_S_FRAME_INFO, aom_s_frame_info *)
+#define AOMD_CTRL_AOMD_GET_S_FRAME_INFO
+
 AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
 
@@ -389,7 +544,6 @@
 #define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK
 /*!\endcond */
 /*! @} - end defgroup aom_decoder */
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom/internal/aom_codec_internal.h b/aom/internal/aom_codec_internal.h
index efe09ac..efebd17 100644
--- a/aom/internal/aom_codec_internal.h
+++ b/aom/internal/aom_codec_internal.h
@@ -28,13 +28,15 @@
  *     </pre>
  *
  * An application instantiates a specific decoder instance by using
- * aom_codec_init() and a pointer to the algorithm's interface structure:
+ * aom_codec_dec_init() and a pointer to the algorithm's interface structure:
  *     <pre>
  *     my_app.c:
  *       extern aom_codec_iface_t my_codec;
  *       {
  *           aom_codec_ctx_t algo;
- *           res = aom_codec_init(&algo, &my_codec);
+ *           int threads = 4;
+ *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     </pre>
  *
@@ -66,7 +68,7 @@
 /*!\brief init function pointer prototype
  *
  * Performs algorithm-specific initialization of the decoder context. This
- * function is called by the generic aom_codec_init() wrapper function, so
+ * function is called by aom_codec_dec_init() and aom_codec_enc_init(), so
  * plugins implementing this interface may trust the input parameters to be
  * properly initialized.
  *
@@ -155,18 +157,24 @@
  *
  * This structure stores the mapping between control identifiers and
  * implementing functions. Each algorithm provides a list of these
- * mappings. This list is searched by the aom_codec_control() wrapper
+ * mappings. This list is searched by the aom_codec_control()
  * function to determine which function to invoke. The special
- * value {0, NULL} is used to indicate end-of-list, and must be
- * present. The special value {0, <non-null>} can be used as a catch-all
- * mapping. This implies that ctrl_id values chosen by the algorithm
- * \ref MUST be non-zero.
+ * value defined by CTRL_MAP_END is used to indicate end-of-list, and must be
+ * present. It can be tested with the at_ctrl_map_end function. Note that
+ * ctrl_id values \ref MUST be non-zero.
  */
 typedef const struct aom_codec_ctrl_fn_map {
   int ctrl_id;
   aom_codec_control_fn_t fn;
 } aom_codec_ctrl_fn_map_t;
 
+#define CTRL_MAP_END \
+  { 0, NULL }
+
+static AOM_INLINE int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) {
+  return e->ctrl_id == 0 && e->fn == NULL;
+}
+
 /*!\brief decode data function pointer prototype
  *
  * Processes a buffer of coded data. This function is called by the generic
@@ -307,19 +315,6 @@
 
 #define CAST(id, arg) va_arg((arg), aom_codec_control_type_##id)
 
-/* CODEC_INTERFACE convenience macro
- *
- * By convention, each codec interface is a struct with extern linkage, where
- * the symbol is suffixed with _algo. A getter function is also defined to
- * return a pointer to the struct, since in some cases it's easier to work
- * with text symbols than data symbols (see issue #169). This function has
- * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
- * macro is provided to define this getter function automatically.
- */
-#define CODEC_INTERFACE(id)                          \
-  aom_codec_iface_t *id(void) { return &id##_algo; } \
-  aom_codec_iface_t id##_algo
-
 /* Internal Utility Functions
  *
  * The following functions are intended to be used inside algorithms as
diff --git a/aom/internal/aom_image_internal.h b/aom/internal/aom_image_internal.h
index 7f2fd18..1b04c9e 100644
--- a/aom/internal/aom_image_internal.h
+++ b/aom/internal/aom_image_internal.h
@@ -32,8 +32,8 @@
 /*!\brief Alloc memory for aom_metadata_array struct.
  *
  * Allocate memory for aom_metadata_array struct.
- * If sz is 0 the aom_metadata_array structs internal buffer list will be NULL,
- * but the aom_metadata_array struct itself will still be allocated.
+ * If sz is 0 the aom_metadata_array struct's internal buffer list will be
+ * NULL, but the aom_metadata_array struct itself will still be allocated.
  * Returns a pointer to the allocated struct or NULL on failure.
  *
  * \param[in]    sz       Size of internal metadata list buffer
diff --git a/aom/src/aom_codec.c b/aom/src/aom_codec.c
index 196ab83..d418463 100644
--- a/aom/src/aom_codec.c
+++ b/aom/src/aom_codec.c
@@ -22,8 +22,6 @@
 #include "aom/aom_integer.h"
 #include "aom/internal/aom_codec_internal.h"
 
-#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
-
 int aom_codec_version(void) { return VERSION_PACKED; }
 
 const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; }
@@ -67,22 +65,19 @@
 }
 
 aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
-  aom_codec_err_t res;
-
-  if (!ctx)
-    res = AOM_CODEC_INVALID_PARAM;
-  else if (!ctx->iface || !ctx->priv)
-    res = AOM_CODEC_ERROR;
-  else {
-    ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
-
-    ctx->iface = NULL;
-    ctx->name = NULL;
-    ctx->priv = NULL;
-    res = AOM_CODEC_OK;
+  if (!ctx) {
+    return AOM_CODEC_INVALID_PARAM;
   }
-
-  return SAVE_STATUS(ctx, res);
+  if (!ctx->iface || !ctx->priv) {
+    ctx->err = AOM_CODEC_ERROR;
+    return AOM_CODEC_ERROR;
+  }
+  ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
+  ctx->iface = NULL;
+  ctx->name = NULL;
+  ctx->priv = NULL;
+  ctx->err = AOM_CODEC_OK;
+  return AOM_CODEC_OK;
 }
 
 aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
@@ -90,30 +85,33 @@
 }
 
 aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
-  aom_codec_err_t res;
-
-  if (!ctx || !ctrl_id)
-    res = AOM_CODEC_INVALID_PARAM;
-  else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
-    res = AOM_CODEC_ERROR;
-  else {
-    aom_codec_ctrl_fn_map_t *entry;
-
-    res = AOM_CODEC_ERROR;
-
-    for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
-      if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
-        va_list ap;
-
-        va_start(ap, ctrl_id);
-        res = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
-        va_end(ap);
-        break;
-      }
-    }
+  if (!ctx) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  // Control ID must be non-zero.
+  if (!ctrl_id) {
+    ctx->err = AOM_CODEC_INVALID_PARAM;
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) {
+    ctx->err = AOM_CODEC_ERROR;
+    return AOM_CODEC_ERROR;
   }
 
-  return SAVE_STATUS(ctx, res);
+  // "ctrl_maps" is an array of (control ID, function pointer) elements,
+  // with CTRL_MAP_END as a sentinel.
+  for (aom_codec_ctrl_fn_map_t *entry = ctx->iface->ctrl_maps;
+       !at_ctrl_map_end(entry); ++entry) {
+    if (entry->ctrl_id == ctrl_id) {
+      va_list ap;
+      va_start(ap, ctrl_id);
+      ctx->err = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
+      va_end(ap);
+      return ctx->err;
+    }
+  }
+  ctx->err = AOM_CODEC_ERROR;
+  return AOM_CODEC_ERROR;
 }
 
 void aom_internal_error(struct aom_internal_error_info *info,
diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c
index cd0b5ed..dfdee87 100644
--- a/aom/src/aom_image.c
+++ b/aom/src/aom_image.c
@@ -350,26 +350,18 @@
   }
   aom_metadata_t *metadata =
       aom_img_metadata_alloc(type, data, sz, insert_flag);
-  if (!metadata) goto fail;
-  if (!img->metadata->metadata_array) {
-    img->metadata->metadata_array =
-        (aom_metadata_t **)calloc(1, sizeof(metadata));
-    if (!img->metadata->metadata_array || img->metadata->sz != 0) {
-      aom_img_metadata_free(metadata);
-      goto fail;
-    }
-  } else {
-    img->metadata->metadata_array =
-        (aom_metadata_t **)realloc(img->metadata->metadata_array,
-                                   (img->metadata->sz + 1) * sizeof(metadata));
+  if (!metadata) return -1;
+  aom_metadata_t **metadata_array =
+      (aom_metadata_t **)realloc(img->metadata->metadata_array,
+                                 (img->metadata->sz + 1) * sizeof(metadata));
+  if (!metadata_array) {
+    aom_img_metadata_free(metadata);
+    return -1;
   }
+  img->metadata->metadata_array = metadata_array;
   img->metadata->metadata_array[img->metadata->sz] = metadata;
   img->metadata->sz++;
   return 0;
-fail:
-  aom_img_metadata_array_free(img->metadata);
-  img->metadata = NULL;
-  return -1;
 }
 
 void aom_img_remove_metadata(aom_image_t *img) {
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index 7879b88..254f640 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -111,19 +111,52 @@
                 w, h);
 }
 
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  uint8_t temp[64 * 135];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+                 filter, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+                y0_q4, y_step_q4, w, h);
+}
+
+void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
+  aom_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                  y0_q4, y_step_q4, w, h);
+}
+
 void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int filter_x_stride, const int16_t *filter_y,
-                         int filter_y_stride, int w, int h) {
-  int r;
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  for (r = h; r > 0; --r) {
-    memcpy(dst, src, w);
+                         ptrdiff_t dst_stride, int w, int h) {
+  for (int r = h; r > 0; --r) {
+    memmove(dst, src, w);
     src += src_stride;
     dst += dst_stride;
   }
@@ -216,22 +249,11 @@
                        y_step_q4, w, h, bd);
 }
 
-void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
-                                uint8_t *dst8, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int filter_x_stride,
-                                const int16_t *filter_y, int filter_y_stride,
-                                int w, int h, int bd) {
-  int r;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  for (r = h; r > 0; --r) {
-    memcpy(dst, src, w * sizeof(uint16_t));
+void aom_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
+                                uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                int h) {
+  for (int y = 0; y < h; ++y) {
+    memmove(dst, src, w * sizeof(src[0]));
     src += src_stride;
     dst += dst_stride;
   }
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index f1b61f0..7a56223 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -48,7 +48,6 @@
             "${AOM_ROOT}/aom_dsp/avg.c")
 
 list(APPEND AOM_DSP_COMMON_ASM_SSE2
-            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
@@ -58,6 +57,7 @@
             "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
             "${AOM_ROOT}/aom_dsp/x86/convolve.h"
@@ -104,6 +104,7 @@
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
             "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
@@ -122,19 +123,21 @@
                    "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
 endif()
 
-list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+            "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
             "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
@@ -246,9 +249,19 @@
               "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
+  if(NOT CONFIG_AV1_HIGHBITDEPTH)
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_AVX2
+                     "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c")
+  endif()
 
-  list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64
-              "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
+  if(CONFIG_REALTIME_ONLY)
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_AVX2
+                     "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
+                     "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c")
+  endif()
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_AVX
+              "${AOM_ROOT}/aom_dsp/x86/aom_quantize_avx.c")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
               "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
@@ -272,13 +285,20 @@
                      "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
   endif()
 
+  if(CONFIG_REALTIME_ONLY)
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1
+                     "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+                     "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+  endif()
+
   list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/sse_neon.c")
+              "${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
               "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
@@ -330,6 +350,9 @@
     if(BUILD_SHARED_LIBS)
       target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
     endif()
+    if(CONFIG_TUNE_VMAF)
+      target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS})
+    endif()
   endif()
 
   if(HAVE_SSE2)
@@ -372,9 +395,10 @@
     endif()
   endif()
 
-  if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+  if(HAVE_AVX)
     if(CONFIG_AV1_ENCODER)
-      add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64")
+      add_intrinsics_object_library("-mavx" "avx" "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_AVX")
     endif()
   endif()
 
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index b7d5a41..3a31ad6 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -47,12 +47,15 @@
     push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ;
   }
 }
-push @block_sizes, [4, 16];
-push @block_sizes, [16, 4];
-push @block_sizes, [8, 32];
-push @block_sizes, [32, 8];
-push @block_sizes, [16, 64];
-push @block_sizes, [64, 16];
+
+if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+  push @block_sizes, [4, 16];
+  push @block_sizes, [16, 4];
+  push @block_sizes, [8, 32];
+  push @block_sizes, [32, 8];
+  push @block_sizes, [16, 64];
+  push @block_sizes, [64, 16];
+}
 
 @tx_dims = (2, 4, 8, 16, 32, 64);
 @tx_sizes = ();
@@ -60,7 +63,9 @@
   push @tx_sizes, [$w, $w];
   foreach $h (@tx_dims) {
     push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));
-    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
+    if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+      push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
+    }
   }
 }
 
@@ -84,183 +89,194 @@
 
 specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_top_predictor_4x8 sse2/;
-specialize qw/aom_dc_top_predictor_4x16 sse2/;
 specialize qw/aom_dc_top_predictor_8x4 sse2/;
 specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_top_predictor_8x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x4 sse2/;
 specialize qw/aom_dc_top_predictor_16x8 sse2/;
 specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
-
 specialize qw/aom_dc_top_predictor_16x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x64 sse2/;
-specialize qw/aom_dc_top_predictor_32x8 sse2/;
 specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
 specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
+
 specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_left_predictor_4x8 sse2/;
-specialize qw/aom_dc_left_predictor_4x16 sse2/;
 specialize qw/aom_dc_left_predictor_8x4 sse2/;
 specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_8x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x4 sse2/;
 specialize qw/aom_dc_left_predictor_16x8 sse2/;
 specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_16x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x64 sse2/;
-specialize qw/aom_dc_left_predictor_32x8 sse2/;
 specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
 specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
+
 specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_128_predictor_4x8 sse2/;
-specialize qw/aom_dc_128_predictor_4x16 sse2/;
 specialize qw/aom_dc_128_predictor_8x4 sse2/;
 specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_128_predictor_8x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x4 sse2/;
 specialize qw/aom_dc_128_predictor_16x8 sse2/;
 specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_128_predictor_16x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x64 sse2/;
-specialize qw/aom_dc_128_predictor_32x8 sse2/;
 specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
 specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
+
 specialize qw/aom_v_predictor_4x4 neon msa sse2/;
 specialize qw/aom_v_predictor_4x8 sse2/;
-specialize qw/aom_v_predictor_4x16 sse2/;
 specialize qw/aom_v_predictor_8x4 sse2/;
 specialize qw/aom_v_predictor_8x8 neon msa sse2/;
 specialize qw/aom_v_predictor_8x16 sse2/;
-specialize qw/aom_v_predictor_8x32 sse2/;
-specialize qw/aom_v_predictor_16x4 sse2/;
 specialize qw/aom_v_predictor_16x8 sse2/;
 specialize qw/aom_v_predictor_16x16 neon msa sse2/;
 specialize qw/aom_v_predictor_16x32 sse2/;
-specialize qw/aom_v_predictor_16x64 sse2/;
-specialize qw/aom_v_predictor_32x8 sse2/;
 specialize qw/aom_v_predictor_32x16 sse2 avx2/;
 specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
 specialize qw/aom_v_predictor_32x64 sse2 avx2/;
 specialize qw/aom_v_predictor_64x64 sse2 avx2/;
 specialize qw/aom_v_predictor_64x32 sse2 avx2/;
-specialize qw/aom_v_predictor_64x16 sse2 avx2/;
+
 specialize qw/aom_h_predictor_4x8 sse2/;
-specialize qw/aom_h_predictor_4x16 sse2/;
 specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x4 sse2/;
 specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x16 sse2/;
-specialize qw/aom_h_predictor_8x32 sse2/;
-specialize qw/aom_h_predictor_16x4 sse2/;
 specialize qw/aom_h_predictor_16x8 sse2/;
 specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_16x32 sse2/;
-specialize qw/aom_h_predictor_16x64 sse2/;
-specialize qw/aom_h_predictor_32x8 sse2/;
 specialize qw/aom_h_predictor_32x16 sse2/;
 specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
 specialize qw/aom_h_predictor_32x64 sse2/;
 specialize qw/aom_h_predictor_64x64 sse2/;
 specialize qw/aom_h_predictor_64x32 sse2/;
-specialize qw/aom_h_predictor_64x16 sse2/;
+
 specialize qw/aom_paeth_predictor_4x4 ssse3/;
 specialize qw/aom_paeth_predictor_4x8 ssse3/;
-specialize qw/aom_paeth_predictor_4x16 ssse3/;
 specialize qw/aom_paeth_predictor_8x4 ssse3/;
 specialize qw/aom_paeth_predictor_8x8 ssse3/;
 specialize qw/aom_paeth_predictor_8x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x32 ssse3/;
-specialize qw/aom_paeth_predictor_16x4 ssse3/;
 specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x8 ssse3/;
 specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x8 ssse3/;
-specialize qw/aom_paeth_predictor_16x16 ssse3/;
-specialize qw/aom_paeth_predictor_16x32 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3/;
-specialize qw/aom_paeth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_predictor_64x16 ssse3/;
+
+specialize qw/aom_smooth_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
 
 specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
 specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
 specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
 specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
 specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
 specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
 specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
 specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
 specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
 specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
 specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
 specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
 specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
 
 specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
 specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
 specialize qw/aom_smooth_h_predictor_8x4 ssse3/;
 specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
 specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
 specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
 specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
 specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
 specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
 specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
 specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
 specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
 specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
+
+if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+  specialize qw/aom_dc_top_predictor_4x16 sse2/;
+  specialize qw/aom_dc_top_predictor_8x32 sse2/;
+  specialize qw/aom_dc_top_predictor_16x4 sse2/;
+  specialize qw/aom_dc_top_predictor_16x64 sse2/;
+  specialize qw/aom_dc_top_predictor_32x8 sse2/;
+  specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
+
+  specialize qw/aom_dc_left_predictor_4x16 sse2/;
+  specialize qw/aom_dc_left_predictor_8x32 sse2/;
+  specialize qw/aom_dc_left_predictor_16x4 sse2/;
+  specialize qw/aom_dc_left_predictor_16x64 sse2/;
+  specialize qw/aom_dc_left_predictor_32x8 sse2/;
+  specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
+
+  specialize qw/aom_dc_128_predictor_4x16 sse2/;
+  specialize qw/aom_dc_128_predictor_8x32 sse2/;
+  specialize qw/aom_dc_128_predictor_16x4 sse2/;
+  specialize qw/aom_dc_128_predictor_16x64 sse2/;
+  specialize qw/aom_dc_128_predictor_32x8 sse2/;
+  specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
+
+  specialize qw/aom_v_predictor_4x16 sse2/;
+  specialize qw/aom_v_predictor_8x32 sse2/;
+  specialize qw/aom_v_predictor_16x4 sse2/;
+  specialize qw/aom_v_predictor_16x64 sse2/;
+  specialize qw/aom_v_predictor_32x8 sse2/;
+  specialize qw/aom_v_predictor_64x16 sse2 avx2/;
+
+  specialize qw/aom_h_predictor_4x16 sse2/;
+  specialize qw/aom_h_predictor_8x32 sse2/;
+  specialize qw/aom_h_predictor_16x4 sse2/;
+  specialize qw/aom_h_predictor_16x64 sse2/;
+  specialize qw/aom_h_predictor_32x8 sse2/;
+  specialize qw/aom_h_predictor_64x16 sse2/;
+
+  specialize qw/aom_paeth_predictor_4x16 ssse3/;
+  specialize qw/aom_paeth_predictor_8x32 ssse3/;
+  specialize qw/aom_paeth_predictor_16x4 ssse3/;
+  specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
+  specialize qw/aom_paeth_predictor_32x8 ssse3/;
+  specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
+
+  specialize qw/aom_smooth_predictor_4x16 neon ssse3/;
+  specialize qw/aom_smooth_predictor_8x32 neon ssse3/;
+  specialize qw/aom_smooth_predictor_16x4 neon ssse3/;
+  specialize qw/aom_smooth_predictor_16x64 neon ssse3/;
+  specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
+  specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
+
+  specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
+  specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
+  specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
+  specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
+  specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
+  specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
+
+  specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
+  specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
+  specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
+  specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
+  specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
+  specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
+}
 
 # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
 # by multiply and shift.
@@ -353,16 +369,20 @@
 #
 # Sub Pixel Filters
 #
-add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h";
+add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h";
 add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
-specialize qw/aom_convolve_copy       sse2      /;
+specialize qw/aom_convolve_copy       neon dspr2 msa sse2 avx2/;
 specialize qw/aom_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
 
+add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/aom_scaled_2d ssse3 neon/;
+
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd";
+  add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
   specialize qw/aom_highbd_convolve_copy sse2 avx2/;
 
   add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
@@ -529,19 +549,19 @@
 #
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+  specialize qw/aom_quantize_b sse2 neon avx/, "$ssse3_x86_64";
 
   add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_adaptive sse2 avx2/;
 
   add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+  specialize qw/aom_quantize_b_32x32 neon avx/, "$ssse3_x86_64";
 
   add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_32x32_adaptive sse2/;
 
   add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b_64x64 ssse3/;
+  specialize qw/aom_quantize_b_64x64 neon ssse3/;
 
   add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_64x64_adaptive sse2/;
@@ -616,7 +636,7 @@
     # Sum of Squares
     #
     add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
-    specialize qw/aom_sum_squares_2d_i16 sse2 avx2/;
+    specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
 
     add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
     specialize qw/aom_sum_squares_i16 sse2/;
@@ -634,11 +654,14 @@
   foreach (@block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+    add_proto qw/unsigned int/, "aom_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
     add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
     add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
 
-  specialize qw/aom_sad128x128    avx2          sse2/;
+  add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
+  specialize qw/aom_sum_sse_2d_i16 sse2 avx2/;
+  specialize qw/aom_sad128x128    avx2 neon     sse2/;
   specialize qw/aom_sad128x64     avx2          sse2/;
   specialize qw/aom_sad64x128     avx2          sse2/;
   specialize qw/aom_sad64x64      avx2 neon msa sse2/;
@@ -655,6 +678,34 @@
   specialize qw/aom_sad4x8                  msa sse2/;
   specialize qw/aom_sad4x4             neon msa sse2/;
 
+  specialize qw/aom_sad4x16                     sse2/;
+  specialize qw/aom_sad16x4                     sse2/;
+  specialize qw/aom_sad8x32                     sse2/;
+  specialize qw/aom_sad32x8                     sse2/;
+  specialize qw/aom_sad16x64                    sse2/;
+  specialize qw/aom_sad64x16                    sse2/;
+
+  specialize qw/aom_sad_skip_128x128    avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_128x64     avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_64x128     avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_64x64      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_64x32      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_32x64      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_32x32      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_32x16      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_16x32                    sse2  neon/;
+  specialize qw/aom_sad_skip_16x16                    sse2  neon/;
+  specialize qw/aom_sad_skip_16x8                     sse2  neon/;
+  specialize qw/aom_sad_skip_8x16                     sse2  neon/;
+  specialize qw/aom_sad_skip_8x8                      sse2  neon/;
+  specialize qw/aom_sad_skip_4x8                      sse2  neon/;
+
+  specialize qw/aom_sad_skip_4x16                     sse2  neon/;
+  specialize qw/aom_sad_skip_8x32                     sse2  neon/;
+  specialize qw/aom_sad_skip_32x8                     sse2  neon/;
+  specialize qw/aom_sad_skip_16x64                    sse2  neon/;
+  specialize qw/aom_sad_skip_64x16                    sse2  neon/;
+
   specialize qw/aom_sad128x128_avg avx2     sse2/;
   specialize qw/aom_sad128x64_avg  avx2     sse2/;
   specialize qw/aom_sad64x128_avg  avx2     sse2/;
@@ -672,19 +723,12 @@
   specialize qw/aom_sad4x8_avg          msa sse2/;
   specialize qw/aom_sad4x4_avg          msa sse2/;
 
-  specialize qw/aom_sad4x16      sse2/;
-  specialize qw/aom_sad16x4      sse2/;
-  specialize qw/aom_sad8x32      sse2/;
-  specialize qw/aom_sad32x8      sse2/;
-  specialize qw/aom_sad16x64     sse2/;
-  specialize qw/aom_sad64x16     sse2/;
-
-  specialize qw/aom_sad4x16_avg  sse2/;
-  specialize qw/aom_sad16x4_avg  sse2/;
-  specialize qw/aom_sad8x32_avg  sse2/;
-  specialize qw/aom_sad32x8_avg  sse2/;
-  specialize qw/aom_sad16x64_avg sse2/;
-  specialize qw/aom_sad64x16_avg sse2/;
+  specialize qw/aom_sad4x16_avg             sse2/;
+  specialize qw/aom_sad16x4_avg             sse2/;
+  specialize qw/aom_sad8x32_avg             sse2/;
+  specialize qw/aom_sad32x8_avg             sse2/;
+  specialize qw/aom_sad16x64_avg            sse2/;
+  specialize qw/aom_sad64x16_avg            sse2/;
 
   specialize qw/aom_dist_wtd_sad128x128_avg ssse3/;
   specialize qw/aom_dist_wtd_sad128x64_avg  ssse3/;
@@ -728,6 +772,7 @@
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+      add_proto qw/unsigned int/, "aom_highbd_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
       add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
       if ($w != 128 && $h != 128 && $w != 4) {
         specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
@@ -746,9 +791,39 @@
     specialize qw/aom_highbd_sad16x32   avx2 sse2/;
     specialize qw/aom_highbd_sad16x16   avx2 sse2/;
     specialize qw/aom_highbd_sad16x8    avx2 sse2/;
-    specialize qw/aom_highbd_sad8x4     sse2/;
-    specialize qw/aom_highbd_sad4x8     sse2/;
-    specialize qw/aom_highbd_sad4x4     sse2/;
+    specialize qw/aom_highbd_sad8x16         sse2/;
+    specialize qw/aom_highbd_sad8x8          sse2/;
+    specialize qw/aom_highbd_sad8x4          sse2/;
+    specialize qw/aom_highbd_sad4x8          sse2/;
+    specialize qw/aom_highbd_sad4x4          sse2/;
+
+    specialize qw/aom_highbd_sad4x16         sse2/;
+    specialize qw/aom_highbd_sad16x4    avx2 sse2/;
+    specialize qw/aom_highbd_sad8x32         sse2/;
+    specialize qw/aom_highbd_sad32x8    avx2 sse2/;
+    specialize qw/aom_highbd_sad16x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad64x16   avx2 sse2/;
+
+    specialize qw/aom_highbd_sad_skip_128x128 avx2/;
+    specialize qw/aom_highbd_sad_skip_128x64  avx2/;
+    specialize qw/aom_highbd_sad_skip_64x128  avx2/;
+    specialize qw/aom_highbd_sad_skip_64x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_64x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x16   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x16   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x8    avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_8x16         sse2/;
+    specialize qw/aom_highbd_sad_skip_8x8          sse2/;
+    specialize qw/aom_highbd_sad_skip_4x8          sse2/;
+
+    specialize qw/aom_highbd_sad_skip_4x16         sse2/;
+    specialize qw/aom_highbd_sad_skip_8x32         sse2/;
+    specialize qw/aom_highbd_sad_skip_32x8    avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_64x16   avx2 sse2/;
 
     specialize qw/aom_highbd_sad128x128_avg avx2/;
     specialize qw/aom_highbd_sad128x64_avg  avx2/;
@@ -765,13 +840,6 @@
     specialize qw/aom_highbd_sad4x8_avg     sse2/;
     specialize qw/aom_highbd_sad4x4_avg     sse2/;
 
-    specialize qw/aom_highbd_sad4x16        sse2/;
-    specialize qw/aom_highbd_sad16x4        avx2 sse2/;
-    specialize qw/aom_highbd_sad8x32        sse2/;
-    specialize qw/aom_highbd_sad32x8        avx2 sse2/;
-    specialize qw/aom_highbd_sad16x64       avx2 sse2/;
-    specialize qw/aom_highbd_sad64x16       avx2 sse2/;
-
     specialize qw/aom_highbd_sad4x16_avg    sse2/;
     specialize qw/aom_highbd_sad16x4_avg    avx2 sse2/;
     specialize qw/aom_highbd_sad8x32_avg    sse2/;
@@ -799,20 +867,22 @@
   #
   # OBMC SAD
   #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
-    if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
-    }
-  }
-
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+      add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
       if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+        specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+      }
+    }
+
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+        if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
+          specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+        }
       }
     }
   }
@@ -824,6 +894,7 @@
     ($w, $h) = @$_;
     add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
     add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, const uint8_t *second_pred, uint32_t *sad_array";
+    add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
     add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[]";
   }
 
@@ -856,6 +927,31 @@
   specialize qw/aom_sad32x8x4d  sse2/;
   specialize qw/aom_sad64x16x4d sse2/;
 
+  specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_128x64x4d  avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_64x128x4d  avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_64x64x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_64x32x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_64x16x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_32x64x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_32x32x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_32x16x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_32x8x4d    avx2 sse2 neon/;
+
+  specialize qw/aom_sad_skip_16x64x4d        sse2 neon/;
+  specialize qw/aom_sad_skip_16x32x4d        sse2 neon/;
+  specialize qw/aom_sad_skip_16x16x4d        sse2 neon/;
+  specialize qw/aom_sad_skip_16x8x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_8x16x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_8x8x4d          sse2 neon/;
+  specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
+  specialize qw/aom_sad_skip_4x32x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_32x8x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_64x16x4d        sse2 neon/;
+
   specialize qw/aom_sad128x128x4d_avg sse2/;
   specialize qw/aom_sad128x64x4d_avg  sse2/;
   specialize qw/aom_sad64x128x4d_avg  sse2/;
@@ -920,6 +1016,7 @@
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+      add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
       if ($w != 128 && $h != 128) {
         specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
       }
@@ -947,6 +1044,27 @@
     specialize qw/aom_highbd_sad32x8x4d    avx2 sse2/;
     specialize qw/aom_highbd_sad16x64x4d   avx2 sse2/;
     specialize qw/aom_highbd_sad64x16x4d   avx2 sse2/;
+
+    specialize qw/aom_highbd_sad_skip_128x128x4d avx2/;
+    specialize qw/aom_highbd_sad_skip_128x64x4d  avx2/;
+    specialize qw/aom_highbd_sad_skip_64x128x4d  avx2/;
+    specialize qw/aom_highbd_sad_skip_64x64x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_64x32x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x64x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x32x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x16x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x32x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x16x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x8x4d    avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_8x16x4d         sse2/;
+    specialize qw/aom_highbd_sad_skip_8x8x4d          sse2/;
+    specialize qw/aom_highbd_sad_skip_4x8x4d          sse2/;
+
+    specialize qw/aom_highbd_sad_skip_4x16x4d         sse2/;
+    specialize qw/aom_highbd_sad_skip_8x32x4d         sse2/;
+    specialize qw/aom_highbd_sad_skip_32x8x4d    avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x64x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_64x16x4d   avx2 sse2/;
   }
   #
   # Avg
@@ -967,14 +1085,15 @@
   }
 
   add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
-  specialize qw/aom_int_pro_row sse2/;
+  specialize qw/aom_int_pro_row sse2 neon/;
 
   add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
-  specialize qw/aom_int_pro_col sse2/;
+  specialize qw/aom_int_pro_col sse2 neon/;
 
   add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+  specialize qw/aom_vector_var neon/;
   # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
-  #specialize qw/aom_vector_var sse2/;
+  #specialize qw/aom_vector_var neon sse2/;
 
   #
   # hamadard transform and satd for implmenting temporal dependency model
@@ -1006,7 +1125,7 @@
     specialize qw/aom_highbd_hadamard_32x32 avx2/;
   }
   add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
-  specialize qw/aom_satd avx2/;
+  specialize qw/aom_satd neon avx2/;
 
   add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
   specialize qw/aom_satd_lp avx2 neon/;
@@ -1129,6 +1248,9 @@
 
   add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
+  add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
+  specialize qw/aom_mse_wxh_16bit  sse2 avx2/;
+
   foreach (@block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
@@ -1137,38 +1259,38 @@
     add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
   specialize qw/aom_variance128x128   sse2 avx2 neon    /;
-  specialize qw/aom_variance128x64    sse2 avx2         /;
-  specialize qw/aom_variance64x128    sse2 avx2         /;
+  specialize qw/aom_variance128x64    sse2 avx2 neon    /;
+  specialize qw/aom_variance64x128    sse2 avx2 neon    /;
   specialize qw/aom_variance64x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance64x32     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x16     sse2 avx2      msa/;
-  specialize qw/aom_variance16x32     sse2 avx2      msa/;
+  specialize qw/aom_variance32x16     sse2 avx2 neon msa/;
+  specialize qw/aom_variance16x32     sse2 avx2 neon msa/;
   specialize qw/aom_variance16x16     sse2 avx2 neon msa/;
   specialize qw/aom_variance16x8      sse2 avx2 neon msa/;
   specialize qw/aom_variance8x16      sse2      neon msa/;
   specialize qw/aom_variance8x8       sse2      neon msa/;
-  specialize qw/aom_variance8x4       sse2           msa/;
-  specialize qw/aom_variance4x8       sse2           msa/;
-  specialize qw/aom_variance4x4       sse2           msa/;
+  specialize qw/aom_variance8x4       sse2      neon msa/;
+  specialize qw/aom_variance4x8       sse2      neon msa/;
+  specialize qw/aom_variance4x4       sse2      neon msa/;
 
-  specialize qw/aom_sub_pixel_variance128x128   avx2          sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance128x64    avx2          sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x128    avx2          sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance128x128   avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance128x64    avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x128    avx2 neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance64x64     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x32     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x64     avx2      msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x32     avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x64     avx2 neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x32     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x16     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x32     avx2      msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x16     avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x32     avx2 neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance16x16     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x8      avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x16                msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x8      avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance8x16           neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x8            neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x4                 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x8                 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x4                 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance8x4            neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance4x8            neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance4x4            neon msa sse2 ssse3/;
 
   specialize qw/aom_sub_pixel_avg_variance128x128 avx2     sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance128x64  avx2     sse2 ssse3/;
@@ -1187,25 +1309,34 @@
   specialize qw/aom_sub_pixel_avg_variance4x8          msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance4x4          msa sse2 ssse3/;
 
-  specialize qw/aom_variance4x16 sse2/;
-  specialize qw/aom_variance16x4 sse2 avx2/;
-  specialize qw/aom_variance8x32 sse2/;
-  specialize qw/aom_variance32x8 sse2 avx2/;
-  specialize qw/aom_variance16x64 sse2 avx2/;
-  specialize qw/aom_variance64x16 sse2 avx2/;
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    specialize qw/aom_variance4x16 sse2/;
+    specialize qw/aom_variance16x4 sse2 avx2/;
+    specialize qw/aom_variance8x32 sse2/;
+    specialize qw/aom_variance32x8 sse2 avx2/;
+    specialize qw/aom_variance16x64 sse2 avx2/;
+    specialize qw/aom_variance64x16 sse2 avx2/;
 
-  specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x4 avx2 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x64 avx2 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
+
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
+  }
 
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
@@ -1221,13 +1352,6 @@
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8   ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4   ssse3/;
 
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
-
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128  ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   ssse3/;
@@ -1288,21 +1412,23 @@
   #
   # OBMC Variance / OBMC Subpixel Variance
   #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-    add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-    specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
-    specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
-  }
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+      add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+      specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
+      specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
+    }
 
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    foreach $bd ("_", "_10_", "_12_") {
-      foreach (@block_sizes) {
-        ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-        specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      foreach $bd ("_", "_10_", "_12_") {
+        foreach (@block_sizes) {
+          ($w, $h) = @$_;
+          add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+          add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+          specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+        }
       }
     }
   }
@@ -1522,6 +1648,9 @@
 
     add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
     specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
+
+    add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
+    specialize qw/aom_mse_wxh_16bit_highbd   sse2 avx2/;
   }
     #
     # Subpixel Variance
@@ -1573,43 +1702,43 @@
       add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
       specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
diff --git a/aom_dsp/arm/aom_convolve_copy_neon.c b/aom_dsp/arm/aom_convolve_copy_neon.c
new file mode 100644
index 0000000..583d832
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  const uint8_t *src1;
+  uint8_t *dst1;
+  int y;
+
+  if (!(w & 0x0F)) {
+    for (y = 0; y < h; ++y) {
+      src1 = src;
+      dst1 = dst;
+      for (int x = 0; x < (w >> 4); ++x) {
+        vst1q_u8(dst1, vld1q_u8(src1));
+        src1 += 16;
+        dst1 += 16;
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x07)) {
+    for (y = 0; y < h; ++y) {
+      vst1_u8(dst, vld1_u8(src));
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x03)) {
+    for (y = 0; y < h; ++y) {
+      vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x01)) {
+    for (y = 0; y < h; ++y) {
+      vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index af3769e..c3d4de2 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -72,3 +72,117 @@
     return satd;
   }
 }
+
+void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  const uint8_t *idx = ref;
+  uint16x8_t vec0 = vdupq_n_u16(0);
+  uint16x8_t vec1 = vec0;
+  uint8x16_t tmp;
+
+  for (i = 0; i < height; ++i) {
+    tmp = vld1q_u8(idx);
+    idx += ref_stride;
+    vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
+    vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+  }
+
+  if (128 == height) {
+    vec0 = vshrq_n_u16(vec0, 6);
+    vec1 = vshrq_n_u16(vec1, 6);
+  } else if (64 == height) {
+    vec0 = vshrq_n_u16(vec0, 5);
+    vec1 = vshrq_n_u16(vec1, 5);
+  } else if (32 == height) {
+    vec0 = vshrq_n_u16(vec0, 4);
+    vec1 = vshrq_n_u16(vec1, 4);
+  } else if (16 == height) {
+    vec0 = vshrq_n_u16(vec0, 3);
+    vec1 = vshrq_n_u16(vec1, 3);
+  }
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec0));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec1));
+}
+
+int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width) {
+  const uint8_t *idx;
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  for (idx = ref; idx < (ref + width); idx += 16) {
+    uint8x16_t vec = vld1q_u8(idx);
+    sum = vaddq_u16(sum, vpaddlq_u8(vec));
+  }
+
+#if defined(__aarch64__)
+  return (int16_t)vaddvq_u16(sum);
+#else
+  const uint32x4_t a = vpaddlq_u16(sum);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return (int16_t)vget_lane_u32(c, 0);
+#endif
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int aom_satd_neon(const tran_low_t *coeff, int length) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t accum = zero;
+  do {
+    const int32x4_t src0 = vld1q_s32(&coeff[0]);
+    const int32x4_t src8 = vld1q_s32(&coeff[4]);
+    const int32x4_t src16 = vld1q_s32(&coeff[8]);
+    const int32x4_t src24 = vld1q_s32(&coeff[12]);
+    accum = vabaq_s32(accum, src0, zero);
+    accum = vabaq_s32(accum, src8, zero);
+    accum = vabaq_s32(accum, src16, zero);
+    accum = vabaq_s32(accum, src24, zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+#ifdef __aarch64__
+  return vaddvq_s32(accum);
+#else
+  return horizontal_add_s32x4(accum);
+#endif  // __aarch64__
+}
+
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl) {
+  int32x4_t v_mean = vdupq_n_s32(0);
+  int32x4_t v_sse = v_mean;
+  int16x8_t v_ref, v_src;
+  int16x4_t v_low;
+
+  int i, width = 4 << bwl;
+  for (i = 0; i < width; i += 8) {
+    v_ref = vld1q_s16(&ref[i]);
+    v_src = vld1q_s16(&src[i]);
+    const int16x8_t diff = vsubq_s16(v_ref, v_src);
+    // diff: dynamic range [-510, 510], 10 bits.
+    v_mean = vpadalq_s16(v_mean, diff);
+    v_low = vget_low_s16(diff);
+    v_sse = vmlal_s16(v_sse, v_low, v_low);
+#if defined(__aarch64__)
+    v_sse = vmlal_high_s16(v_sse, diff, diff);
+#else
+    const int16x4_t v_high = vget_high_s16(diff);
+    v_sse = vmlal_s16(v_sse, v_high, v_high);
+#endif
+  }
+#if defined(__aarch64__)
+  int mean = vaddvq_s32(v_mean);
+  int sse = (int)vaddvq_s32(v_sse);
+#else
+  int mean = horizontal_add_s32x4(v_mean);
+  int sse = horizontal_add_s32x4(v_sse);
+#endif
+  // (mean * mean): dynamic range 31 bits.
+  int var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
+}
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index c85b1e9..6d41708 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -11,6 +11,8 @@
 
 #include <arm_neon.h>
 
+#include "common/tools_common.h"
+
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
@@ -588,3 +590,2637 @@
 
 intra_pred_square(dc);
 #undef intra_pred_square
+
+/* ---------------------P R E D I C T I O N   Z 1--------------------------- */
+
+static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
+  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
+  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
+  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
+  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
+  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
+  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
+  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
+};
+
+// Low bit depth functions
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+/* clang-format on */
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
+    int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
+    int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((W + H) - 1) << upsample_above;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+  uint16x8_t a0, a1;
+  uint16x8_t diff, a32;
+  uint16x8_t a16;
+  uint8x8_t a_mbase_x;
+
+  a16 = vdupq_n_u16(16);
+  a_mbase_x = vdup_n_u8(above[max_base_x]);
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
+  uint16x8_t c3f = vdupq_n_u16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < W; r++) {
+    uint16x8_t res;
+    uint16x8_t shift;
+    uint8x8x2_t v_tmp_a0_128;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base) >> upsample_above;
+    if (base_max_diff <= 0) {
+      for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x;  // save 4 values
+      }
+      return;
+    }
+
+    if (base_max_diff > H) base_max_diff = H;
+
+    if (upsample_above) {
+      v_tmp_a0_128 = vld2_u8(above + base);
+      shift = vshrq_n_u16(
+          vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
+    } else {
+      v_tmp_a0_128.val[0] = vld1_u8(above + base);
+      v_tmp_a0_128.val[1] = vld1_u8(above + base + 1);
+      shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
+    }
+    a0 = vmovl_u8(v_tmp_a0_128.val[0]);
+    a1 = vmovl_u8(v_tmp_a0_128.val[1]);
+    diff = vsubq_u16(a1, a0);        // a[x+1] - a[x]
+    a32 = vmlaq_u16(a16, a0, v_32);  // a[x] * 32 + 16
+    res = vmlaq_u16(a32, diff, shift);
+
+    uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
+    dst[r] =
+        vorr_u8(vand_u8(mask, vshrn_n_u16(res, 5)), vbic_u8(a_mbase_x, mask));
+
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  uint8x8_t dstvec[16];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
+                                        dx);
+  for (int i = 0; i < N; i++) {
+    vst1_lane_u32((uint32_t *)(dst + stride * i),
+                  vreinterpret_u32_u8(dstvec[i]), 0);
+  }
+}
+
+static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  uint8x8_t dstvec[32];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
+                                        dx);
+  for (int i = 0; i < N; i++) {
+    vst1_u8(dst + stride * i, dstvec[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
+    int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
+    int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((W + H) - 1) << upsample_above;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+  uint8x16x2_t a0, a1;
+  uint16x8x2_t diff, a32;
+  uint16x8_t a16, c3f;
+  uint8x16_t a_mbase_x;
+
+  a16 = vdupq_n_u16(16);
+  a_mbase_x = vdupq_n_u8(above[max_base_x]);
+  c3f = vdupq_n_u16(0x3f);
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+  int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
+
+  int x = dx;
+  for (int r = 0; r < W; r++) {
+    uint16x8x2_t res;
+    uint16x8_t shift;
+    uint8x16_t a0_128, a1_128;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base) >> upsample_above;
+    if (base_max_diff <= 0) {
+      for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x;  // save 4 values
+      }
+      return;
+    }
+
+    if (base_max_diff > H) base_max_diff = H;
+
+    if (upsample_above) {
+      uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
+      a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
+      a1_128 = vextq_u8(a0_128, v_zero, 8);
+      shift = vshrq_n_u16(
+          vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
+    } else {
+      a0_128 = vld1q_u8(above + base);
+      a1_128 = vld1q_u8(above + base + 1);
+      shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
+    }
+    a0 = vzipq_u8(a0_128, v_zero);
+    a1 = vzipq_u8(a1_128, v_zero);
+    diff.val[0] = vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
+                            vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
+    diff.val[1] = vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
+                            vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
+    a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
+                           v_32);  // a[x] * 32 + 16
+    a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
+                           v_32);  // a[x] * 32 + 16
+    res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
+    res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
+    uint8x16_t v_temp =
+        vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+
+    uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
+    dst[r] = vorrq_u8(vandq_u8(mask, v_temp), vbicq_u8(a_mbase_x, mask));
+
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  uint8x16_t dstvec[64];
+
+  dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    vst1q_u8(dst + stride * i, dstvec[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
+    int N, uint8x16x2_t *dstvec, const uint8_t *above, int upsample_above,
+    int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((32 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+  uint8x16_t a_mbase_x;
+  uint8x16x2_t a0, a1;
+  uint16x8x2_t diff, a32;
+  uint16x8_t a16, c3f;
+
+  a_mbase_x = vdupq_n_u8(above[max_base_x]);
+  a16 = vdupq_n_u16(16);
+  c3f = vdupq_n_u16(0x3f);
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+
+  int x = dx;
+  for (int r = 0; r < N; r++) {
+    uint16x8x2_t res;
+    uint8x16_t res16[2];
+    uint8x16_t a0_128, a1_128;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base);
+    if (base_max_diff <= 0) {
+      for (int i = r; i < N; ++i) {
+        dstvec[i].val[0] = a_mbase_x;  // save 32 values
+        dstvec[i].val[1] = a_mbase_x;
+      }
+      return;
+    }
+    if (base_max_diff > 32) base_max_diff = 32;
+
+    uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
+
+    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+      int mdiff = base_max_diff - j;
+      if (mdiff <= 0) {
+        res16[jj] = a_mbase_x;
+      } else {
+        a0_128 = vld1q_u8(above + base + j);
+        a1_128 = vld1q_u8(above + base + j + 1);
+        a0 = vzipq_u8(a0_128, v_zero);
+        a1 = vzipq_u8(a1_128, v_zero);
+        diff.val[0] =
+            vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
+                      vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
+        diff.val[1] =
+            vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
+                      vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
+        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
+                               v_32);  // a[x] * 32 + 16
+        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
+                               v_32);  // a[x] * 32 + 16
+        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
+        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
+
+        res16[jj] =
+            vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+      }
+    }
+
+    uint8x16x2_t mask;
+
+    mask.val[0] = vld1q_u8(BaseMask[base_max_diff]);
+    mask.val[1] = vld1q_u8(BaseMask[base_max_diff] + 16);
+    dstvec[r].val[0] = vorrq_u8(vandq_u8(mask.val[0], res16[0]),
+                                vbicq_u8(a_mbase_x, mask.val[0]));
+    dstvec[r].val[1] = vorrq_u8(vandq_u8(mask.val[1], res16[1]),
+                                vbicq_u8(a_mbase_x, mask.val[1]));
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  uint8x16x2_t dstvec[64];
+
+  dr_prediction_z1_32xN_internal_neon(N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    vst1q_u8(dst + stride * i, dstvec[i].val[0]);
+    vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
+  }
+}
+
+static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((64 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+  uint8x16x2_t a0, a1;
+  uint16x8x2_t a32, diff;
+  uint16x8_t a16, c3f;
+  uint8x16_t a_mbase_x, max_base_x128, mask128;
+
+  a16 = vdupq_n_u16(16);
+  a_mbase_x = vdupq_n_u8(above[max_base_x]);
+  max_base_x128 = vdupq_n_u8(max_base_x);
+  c3f = vdupq_n_u16(0x3f);
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+  uint8x16_t step = vdupq_n_u8(16);
+
+  int x = dx;
+  for (int r = 0; r < N; r++, dst += stride) {
+    uint16x8x2_t res;
+
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        vst1q_u8(dst, a_mbase_x);
+        vst1q_u8(dst + 16, a_mbase_x);
+        vst1q_u8(dst + 32, a_mbase_x);
+        vst1q_u8(dst + 48, a_mbase_x);
+        dst += stride;
+      }
+      return;
+    }
+
+    uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
+    uint8x16_t a0_128, a1_128, res128;
+    uint8x16_t base_inc128 =
+        vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
+                                               vcreate_u8(0x0F0E0D0C0B0A0908)));
+
+    for (int j = 0; j < 64; j += 16) {
+      int mdif = max_base_x - (base + j);
+      if (mdif <= 0) {
+        vst1q_u8(dst + j, a_mbase_x);
+      } else {
+        a0_128 = vld1q_u8(above + base + j);
+        a1_128 = vld1q_u8(above + base + 1 + j);
+        a0 = vzipq_u8(a0_128, v_zero);
+        a1 = vzipq_u8(a1_128, v_zero);
+        diff.val[0] =
+            vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
+                      vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
+        diff.val[1] =
+            vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
+                      vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
+        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
+                               v_32);  // a[x] * 32 + 16
+        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
+                               v_32);  // a[x] * 32 + 16
+        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
+        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
+        uint8x16_t v_temp =
+            vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+
+        mask128 = vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), v_zero);
+        res128 =
+            vorrq_u8(vandq_u8(mask128, v_temp), vbicq_u8(a_mbase_x, mask128));
+        vst1q_u8(dst + j, res128);
+
+        base_inc128 = vaddq_u8(base_inc128, step);
+      }
+    }
+    x += dx;
+  }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int dx, int dy) {
+  (void)left;
+  (void)dy;
+
+  switch (bw) {
+    case 4:
+      dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 8:
+      dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 16:
+      dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 32:
+      dr_prediction_z1_32xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 64:
+      dr_prediction_z1_64xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    default: break;
+  }
+  return;
+}
+
+/* ---------------------P R E D I C T I O N   Z 2--------------------------- */
+
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
+  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+    0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff }
+};
+
+static AOM_FORCE_INLINE void vector_shift_x4(uint8x8_t *vec, uint8x8_t *v_zero,
+                                             int shift_value) {
+  switch (shift_value) {
+    case 1: *vec = vext_u8(*v_zero, *vec, 7); break;
+    case 2: *vec = vext_u8(*v_zero, *vec, 6); break;
+    case 3: *vec = vext_u8(*v_zero, *vec, 5); break;
+    default: break;
+  }
+}
+
+static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int upsample_above, int upsample_left,
+                                      int dx, int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  uint16x8_t a0_x, a1_x, a32, diff;
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint16x8_t v_zero = vdupq_n_u16(0);
+  uint16x8_t a16 = vdupq_n_u16(16);
+
+  uint8x8_t v_zero_u8 = vdup_n_u8(0);
+  uint16x4_t v_c3f = vdup_n_u16(0x3f);
+  uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
+  int16x4_t v_upsample_left = vdup_n_s16(upsample_left);
+  int16x4_t v_upsample_above = vdup_n_s16(upsample_above);
+  int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
+  int16x4_t dy64 = vdup_n_s16(dy);
+  int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
+  int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
+  int16x4_t v_one = vdup_lane_s16(v_1234, 0);
+
+  for (int r = 0; r < N; r++) {
+    uint16x8_t res, shift;
+    uint16x4_t ydx;
+    uint8x8_t resx, resy;
+    uint16x4x2_t v_shift;
+
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 4) {
+      base_min_diff = 4;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 3) {
+      a0_x = v_zero;
+      a1_x = v_zero;
+      v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8);
+      v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8);
+    } else {
+      ydx = vdup_n_u16(y * dx);
+
+      if (upsample_above) {
+        uint8x8x2_t v_tmp;
+        v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
+        v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
+        uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
+        uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
+        a0_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_low));
+        a1_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_high));
+        v_shift.val[0] = vshr_n_u16(
+            vand_u16(vshl_u16(vsub_u16(r6, ydx), v_upsample_above), v_c3f), 1);
+      } else {
+        uint8x8_t v_a0_x64 = vld1_u8(above + base_x + base_shift);
+        vector_shift_x4(&v_a0_x64, &v_zero_u8, base_shift);
+        uint8x8_t v_a1_x64 = vext_u8(v_a0_x64, v_zero_u8, 1);
+        v_shift.val[0] = vshr_n_u16(vand_u16(vsub_u16(r6, ydx), v_c3f), 1);
+        a0_x = vmovl_u8(v_a0_x64);
+        a1_x = vmovl_u8(v_a1_x64);
+      }
+    }
+
+    // y calc
+    uint8x8_t a0_y, a1_y;
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
+      int16x4_t v_r6 = vdup_n_s16(r << 6);
+      int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
+      int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
+      uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64);
+
+      base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
+      vst1_s16(base_y_c, base_y_c64);
+      a0_y = v_zero_u8;
+      a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0);
+      a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2);
+      a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4);
+      a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6);
+
+      base_y_c64 = vadd_s16(base_y_c64, v_one);
+      vst1_s16(base_y_c, base_y_c64);
+      a1_y = v_zero_u8;
+      a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0);
+      a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
+      a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
+      a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6);
+
+      if (upsample_left) {
+        v_shift.val[1] = vshr_n_u16(
+            vand_u16(vshl_u16(vreinterpret_u16_s16(y_c64), v_upsample_left),
+                     v_c3f),
+            1);
+      } else {
+        v_shift.val[1] =
+            vshr_n_u16(vand_u16(vreinterpret_u16_s16(y_c64), v_c3f), 1);
+      }
+
+      a0_x = vcombine_u16(vget_low_u16(a0_x), vreinterpret_u16_u8(a0_y));
+      a1_x = vcombine_u16(vget_low_u16(a1_x), vreinterpret_u16_u8(a1_y));
+    }
+    shift = vcombine_u16(v_shift.val[0], v_shift.val[1]);
+    diff = vsubq_u16(a1_x, a0_x);      // a[x+1] - a[x]
+    a32 = vmlaq_u16(a16, a0_x, v_32);  // a[x] * 32 + 16
+    res = vmlaq_u16(a32, diff, shift);
+    resx = vshrn_n_u16(res, 5);
+    resy = vext_u8(resx, v_zero_u8, 4);
+
+    uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+    uint8x8_t v_resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
+
+    dst += stride;
+  }
+}
+
+static AOM_FORCE_INLINE void vector_shuffle(uint8x16_t *vec, uint8x16_t *vzero,
+                                            int shift_value) {
+  switch (shift_value) {
+    case 1: *vec = vextq_u8(*vzero, *vec, 15); break;
+    case 2: *vec = vextq_u8(*vzero, *vec, 14); break;
+    case 3: *vec = vextq_u8(*vzero, *vec, 13); break;
+    case 4: *vec = vextq_u8(*vzero, *vec, 12); break;
+    case 5: *vec = vextq_u8(*vzero, *vec, 11); break;
+    case 6: *vec = vextq_u8(*vzero, *vec, 10); break;
+    case 7: *vec = vextq_u8(*vzero, *vec, 9); break;
+    case 8: *vec = vextq_u8(*vzero, *vec, 8); break;
+    case 9: *vec = vextq_u8(*vzero, *vec, 7); break;
+    case 10: *vec = vextq_u8(*vzero, *vec, 6); break;
+    case 11: *vec = vextq_u8(*vzero, *vec, 5); break;
+    case 12: *vec = vextq_u8(*vzero, *vec, 4); break;
+    case 13: *vec = vextq_u8(*vzero, *vec, 3); break;
+    case 14: *vec = vextq_u8(*vzero, *vec, 2); break;
+    case 15: *vec = vextq_u8(*vzero, *vec, 1); break;
+    default: break;
+  }
+}
+
+static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int upsample_above, int upsample_left,
+                                      int dx, int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  uint8x16x2_t a0_x, a1_x;
+  uint16x8x2_t diff, a32;
+  uint16x8_t c1234, a16, c3f;
+  uint8x16_t a0_x128, a1_x128;
+  int16x8_t min_base_y128, dy128;
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+  int16x8_t v_upsample_left = vdupq_n_s16(upsample_left);
+  int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
+  int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
+
+  a16 = vdupq_n_u16(16);
+  c3f = vdupq_n_u16(0x3f);
+  min_base_y128 = vdupq_n_s16(min_base_y);
+  dy128 = vdupq_n_s16(dy);
+  c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+                       vcreate_u16(0x0008000700060005));
+
+  for (int r = 0; r < N; r++) {
+    uint8x8_t resx, resy, resxy;
+    uint16x8_t r6, ydx;
+    uint16x8x2_t res, shift;
+
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 8) {
+      base_min_diff = 8;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 7) {
+      a0_x.val[0] = v_zero;
+      a0_x.val[1] = v_zero;
+      a1_x.val[0] = v_zero;
+      a1_x.val[1] = v_zero;
+      shift.val[0] = vreinterpretq_u16_u8(v_zero);
+      shift.val[1] = vreinterpretq_u16_u8(v_zero);
+    } else {
+      ydx = vdupq_n_u16(y * dx);
+      r6 = vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6);
+
+      if (upsample_above) {
+        uint8x8x2_t v_tmp;
+        v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
+        v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
+        uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
+        uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
+        shift.val[0] = vshrq_n_u16(
+            vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1);
+        a0_x.val[0] =
+            vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_low)));
+        a1_x.val[0] =
+            vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_high)));
+      } else {
+        a0_x128 = vld1q_u8(above + base_x + base_shift);
+        a1_x128 = vextq_u8(a0_x128, v_zero, 1);
+        vector_shuffle(&a0_x128, &v_zero, base_shift);
+        vector_shuffle(&a1_x128, &v_zero, base_shift);
+        shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1);
+        a0_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a0_x128)));
+        a1_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a1_x128)));
+      }
+    }
+
+    // y calc
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+      int16x8_t y_c128, base_y_c128;
+      uint16x8_t mask128;
+      int16x8_t v_r6 = vdupq_n_s16(r << 6);
+
+      y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
+      base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
+      mask128 = vcgtq_s16(min_base_y128, base_y_c128);
+
+      base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
+      vst1q_s16(base_y_c, base_y_c128);
+      a0_x.val[1] = v_zero;
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a0_x.val[1], 0);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a0_x.val[1], 2);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a0_x.val[1], 4);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a0_x.val[1], 6);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a0_x.val[1], 8);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a0_x.val[1], 10);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a0_x.val[1], 12);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a0_x.val[1], 14);
+
+      base_y_c128 =
+          vaddq_s16(base_y_c128, vreinterpretq_s16_u16(vshrq_n_u16(a16, 4)));
+      vst1q_s16(base_y_c, base_y_c128);
+      a1_x.val[1] = v_zero;
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a1_x.val[1], 0);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a1_x.val[1], 2);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a1_x.val[1], 4);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a1_x.val[1], 6);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a1_x.val[1], 8);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a1_x.val[1], 10);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a1_x.val[1], 12);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a1_x.val[1], 14);
+
+      if (upsample_left) {
+        shift.val[1] = vshrq_n_u16(
+            vandq_u16(vshlq_u16(vreinterpretq_u16_s16(y_c128), v_upsample_left),
+                      c3f),
+            1);
+      } else {
+        shift.val[1] =
+            vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1);
+      }
+    }
+    diff.val[0] =
+        vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
+                  vreinterpretq_u16_u8(a0_x.val[0]));  // a[x+1] - a[x]
+    diff.val[1] =
+        vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
+                  vreinterpretq_u16_u8(a0_x.val[1]));  // a[x+1] - a[x]
+    a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
+                           v_32);  // a[x] * 32 + 16
+    a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
+                           v_32);  // a[x] * 32 + 16
+    res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
+    res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
+    resx = vshrn_n_u16(res.val[0], 5);
+    resy = vshrn_n_u16(res.val[1], 5);
+
+    uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+
+    resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
+    vst1_u8(dst, resxy);
+    dst += stride;
+  }
+}
+
+static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
+                                      ptrdiff_t stride, const uint8_t *above,
+                                      const uint8_t *left, int upsample_above,
+                                      int upsample_left, int dx, int dy) {
+  // here upsample_above and upsample_left are 0 by design of
+  // av1_use_intra_edge_upsample
+  const int min_base_x = -1;
+  const int min_base_y = -1;
+  (void)upsample_above;
+  (void)upsample_left;
+  const int frac_bits_x = 6;
+  const int frac_bits_y = 6;
+
+  uint16x8_t a16, c1, c3f;
+  int16x8_t min_base_y256, dy256;
+  uint16x8x2_t a32, c0123, c1234, diff, shifty;
+  uint8x16x2_t a0_x, a1_x, a0_y, a1_y;
+  uint8x16_t a0_x128, a1_x128;
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+  int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
+
+  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+
+  a16 = vdupq_n_u16(16);
+  c1 = vshrq_n_u16(a16, 4);
+  min_base_y256 = vdupq_n_s16(min_base_y);
+  c3f = vdupq_n_u16(0x3f);
+  dy256 = vdupq_n_s16(dy);
+  c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000),
+                              vcreate_u16(0x0007000600050004));
+  c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008),
+                              vcreate_u16(0x000F000E000D000C));
+  c1234.val[0] = vaddq_u16(c0123.val[0], c1);
+  c1234.val[1] = vaddq_u16(c0123.val[1], c1);
+
+  for (int r = 0; r < H; r++) {
+    uint16x8x2_t res, r6, shift;
+    uint16x8_t ydx, j256;
+    uint8x16_t resx, resy, resxy;
+    int y = r + 1;
+    ydx = vdupq_n_u16((uint16_t)(y * dx));
+
+    int base_x = (-y * dx) >> frac_bits_x;
+    for (int j = 0; j < W; j += 16) {
+      j256 = vdupq_n_u16(j);
+
+      int base_shift = 0;
+      if ((base_x + j) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x + j) - 1);
+      }
+      int base_min_diff = (min_base_x - base_x - j);
+      if (base_min_diff > 16) {
+        base_min_diff = 16;
+      } else {
+        if (base_min_diff < 0) base_min_diff = 0;
+      }
+
+      if (base_shift < 16) {
+        a0_x128 = vld1q_u8(above + base_x + base_shift + j);
+        a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j);
+        vector_shuffle(&a0_x128, &v_zero, base_shift);
+        vector_shuffle(&a1_x128, &v_zero, base_shift);
+        a0_x = vzipq_u8(a0_x128, v_zero);
+        a1_x = vzipq_u8(a1_x128, v_zero);
+        r6.val[0] = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
+        r6.val[1] = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
+        shift.val[0] =
+            vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[0], ydx), c3f), 1);
+        shift.val[1] =
+            vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[1], ydx), c3f), 1);
+        diff.val[0] =
+            vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
+                      vreinterpretq_u16_u8(a0_x.val[0]));  // a[x+1] - a[x]
+        diff.val[1] =
+            vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
+                      vreinterpretq_u16_u8(a0_x.val[1]));  // a[x+1] - a[x]
+        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
+                               v_32);  // a[x] * 32 + 16
+        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
+                               v_32);  // a[x] * 32 + 16
+        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
+        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
+        resx =
+            vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+      } else {
+        resx = v_zero;
+      }
+
+      // y calc
+      if (base_x < min_base_x) {
+        uint16x8x2_t mask256;
+        int16x8x2_t c256, y_c256, base_y_c256, mul16;
+        int16x8_t v_r6 = vdupq_n_s16(r << 6);
+
+        c256.val[0] = vaddq_s16(vreinterpretq_s16_u16(j256),
+                                vreinterpretq_s16_u16(c1234.val[0]));
+        c256.val[1] = vaddq_s16(vreinterpretq_s16_u16(j256),
+                                vreinterpretq_s16_u16(c1234.val[1]));
+        mul16.val[0] = vminq_s16(vmulq_s16(c256.val[0], dy256),
+                                 vreinterpretq_s16_u16(vshrq_n_u16(
+                                     vreinterpretq_u16_s16(min_base_y256), 1)));
+        mul16.val[1] = vminq_s16(vmulq_s16(c256.val[1], dy256),
+                                 vreinterpretq_s16_u16(vshrq_n_u16(
+                                     vreinterpretq_u16_s16(min_base_y256), 1)));
+        y_c256.val[0] = vsubq_s16(v_r6, mul16.val[0]);
+        y_c256.val[1] = vsubq_s16(v_r6, mul16.val[1]);
+
+        base_y_c256.val[0] = vshlq_s16(y_c256.val[0], v_frac_bits_y);
+        base_y_c256.val[1] = vshlq_s16(y_c256.val[1], v_frac_bits_y);
+        mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]);
+        mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]);
+
+        base_y_c256.val[0] = vorrq_s16(
+            vandq_s16(vreinterpretq_s16_u16(mask256.val[0]), min_base_y256),
+            vbicq_s16(base_y_c256.val[0],
+                      vreinterpretq_s16_u16(mask256.val[0])));
+        base_y_c256.val[1] = vorrq_s16(
+            vandq_s16(vreinterpretq_s16_u16(mask256.val[1]), min_base_y256),
+            vbicq_s16(base_y_c256.val[1],
+                      vreinterpretq_s16_u16(mask256.val[1])));
+
+        int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7);
+        int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0);
+        int16_t offset_diff = max_y - min_y;
+
+        if (offset_diff < 16) {
+          int16x8_t min_y256 =
+              vdupq_lane_s16(vget_high_s16(base_y_c256.val[1]), 3);
+
+          int16x8x2_t base_y_offset;
+          base_y_offset.val[0] = vsubq_s16(base_y_c256.val[0], min_y256);
+          base_y_offset.val[1] = vsubq_s16(base_y_c256.val[1], min_y256);
+
+          int8x16_t base_y_offset128 =
+              vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
+                          vqmovn_s16(base_y_offset.val[1]));
+
+          uint8x16_t a0_y128, a1_y128;
+          uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
+          a0_y128 = vld1q_u8(left + min_y);
+          a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
+          a1_y128 = vld1q_u8(left + min_y + 1);
+          a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
+#if defined(__aarch64__)
+          a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128));
+          a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128));
+#else
+          uint8x8x2_t v_tmp;
+          uint8x8x2_t v_res;
+          uint8x8_t v_index_low =
+              vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
+          uint8x8_t v_index_high =
+              vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
+          v_tmp.val[0] = vget_low_u8(a0_y128);
+          v_tmp.val[1] = vget_high_u8(a0_y128);
+          v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+          v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+          a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+          v_tmp.val[0] = vget_low_u8(a1_y128);
+          v_tmp.val[1] = vget_high_u8(a1_y128);
+          v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+          v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+          a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+#endif
+          a0_y = vzipq_u8(a0_y128, v_zero);
+          a1_y = vzipq_u8(a1_y128, v_zero);
+        } else {
+          base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0],
+                                         vreinterpretq_s16_u16(mask256.val[0]));
+          base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
+                                         vreinterpretq_s16_u16(mask256.val[1]));
+          vst1q_s16(base_y_c, base_y_c256.val[0]);
+          vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
+          a0_y.val[0] = v_zero;
+          a0_y.val[1] = v_zero;
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a0_y.val[0], 0);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a0_y.val[0], 2);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a0_y.val[0], 4);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a0_y.val[0], 6);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a0_y.val[0], 8);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a0_y.val[0], 10);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a0_y.val[0], 12);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a0_y.val[0], 14);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a0_y.val[1], 0);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a0_y.val[1], 2);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a0_y.val[1], 4);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a0_y.val[1], 6);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a0_y.val[1], 8);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a0_y.val[1], 10);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a0_y.val[1], 12);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a0_y.val[1], 14);
+
+          base_y_c256.val[0] =
+              vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1));
+          base_y_c256.val[1] =
+              vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1));
+          vst1q_s16(base_y_c, base_y_c256.val[0]);
+          vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
+          a1_y.val[0] = v_zero;
+          a1_y.val[1] = v_zero;
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a1_y.val[0], 0);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a1_y.val[0], 2);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a1_y.val[0], 4);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a1_y.val[0], 6);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a1_y.val[0], 8);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a1_y.val[0], 10);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a1_y.val[0], 12);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a1_y.val[0], 14);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a1_y.val[1], 0);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a1_y.val[1], 2);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a1_y.val[1], 4);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a1_y.val[1], 6);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a1_y.val[1], 8);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a1_y.val[1], 10);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a1_y.val[1], 12);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a1_y.val[1], 14);
+        }
+        shifty.val[0] = vshrq_n_u16(
+            vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1);
+        shifty.val[1] = vshrq_n_u16(
+            vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1);
+        diff.val[0] =
+            vsubq_u16(vreinterpretq_u16_u8(a1_y.val[0]),
+                      vreinterpretq_u16_u8(a0_y.val[0]));  // a[x+1] - a[x]
+        diff.val[1] =
+            vsubq_u16(vreinterpretq_u16_u8(a1_y.val[1]),
+                      vreinterpretq_u16_u8(a0_y.val[1]));  // a[x+1] - a[x]
+        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[0]),
+                               v_32);  // a[x] * 32 + 16
+        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[1]),
+                               v_32);  // a[x] * 32 + 16
+        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]);
+        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]);
+
+        resy =
+            vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+      } else {
+        resy = v_zero;
+      }
+      uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
+      resxy = vorrq_u8(vandq_u8(mask, resy), vbicq_u8(resx, mask));
+      vst1q_u8(dst + j, resxy);
+    }  // for j
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int upsample_left, int dx,
+                               int dy) {
+  assert(dx > 0);
+  assert(dy > 0);
+
+  switch (bw) {
+    case 4:
+      dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
+                                upsample_left, dx, dy);
+      break;
+    case 8:
+      dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
+                                upsample_left, dx, dy);
+      break;
+    default:
+      dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left,
+                                upsample_above, upsample_left, dx, dy);
+      break;
+  }
+  return;
+}
+
+/* ---------------------P R E D I C T I O N   Z 3--------------------------- */
+
+static AOM_FORCE_INLINE void transpose4x16_neon(uint8x16_t *x,
+                                                uint16x8x2_t *d) {
+  uint8x16x2_t w0, w1;
+
+  w0 = vzipq_u8(x[0], x[1]);
+  w1 = vzipq_u8(x[2], x[3]);
+
+  d[0] = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                   vreinterpretq_u16_u8(w1.val[0]));
+  d[1] = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                   vreinterpretq_u16_u8(w1.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose4x8_8x4_low_neon(uint8x8_t *x,
+                                                       uint16x4x2_t *d) {
+  uint8x8x2_t w0, w1;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+
+  *d = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+}
+
+static AOM_FORCE_INLINE void transpose4x8_8x4_neon(uint8x8_t *x,
+                                                   uint16x4x2_t *d) {
+  uint8x8x2_t w0, w1;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+
+  d[0] =
+      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+  d[1] =
+      vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose8x8_low_neon(uint8x8_t *x,
+                                                   uint32x2x2_t *d) {
+  uint8x8x2_t w0, w1, w2, w3;
+  uint16x4x2_t w4, w5;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+  w2 = vzip_u8(x[4], x[5]);
+  w3 = vzip_u8(x[6], x[7]);
+
+  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+
+  d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+                  vreinterpret_u32_u16(w5.val[0]));
+  d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+                  vreinterpret_u32_u16(w5.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose8x8_neon(uint8x8_t *x, uint32x2x2_t *d) {
+  uint8x8x2_t w0, w1, w2, w3;
+  uint16x4x2_t w4, w5, w6, w7;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+  w2 = vzip_u8(x[4], x[5]);
+  w3 = vzip_u8(x[6], x[7]);
+
+  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+
+  d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+                  vreinterpret_u32_u16(w5.val[0]));
+  d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+                  vreinterpret_u32_u16(w5.val[1]));
+
+  w6 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+  w7 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
+
+  d[2] = vzip_u32(vreinterpret_u32_u16(w6.val[0]),
+                  vreinterpret_u32_u16(w7.val[0]));
+  d[3] = vzip_u32(vreinterpret_u32_u16(w6.val[1]),
+                  vreinterpret_u32_u16(w7.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose16x8_8x16_neon(uint8x8_t *x,
+                                                     uint64x2_t *d) {
+  uint8x8x2_t w0, w1, w2, w3, w8, w9, w10, w11;
+  uint16x4x2_t w4, w5, w12, w13;
+  uint32x2x2_t w6, w7, w14, w15;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+  w2 = vzip_u8(x[4], x[5]);
+  w3 = vzip_u8(x[6], x[7]);
+
+  w8 = vzip_u8(x[8], x[9]);
+  w9 = vzip_u8(x[10], x[11]);
+  w10 = vzip_u8(x[12], x[13]);
+  w11 = vzip_u8(x[14], x[15]);
+
+  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+  w12 =
+      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
+  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
+                 vreinterpret_u16_u8(w11.val[0]));
+
+  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+                vreinterpret_u32_u16(w5.val[0]));
+  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+                vreinterpret_u32_u16(w5.val[1]));
+  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+                 vreinterpret_u32_u16(w13.val[0]));
+  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+                 vreinterpret_u32_u16(w13.val[1]));
+
+  // Store first 4-line result
+  d[0] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
+                      vreinterpret_u64_u32(w14.val[0]));
+  d[1] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
+                      vreinterpret_u64_u32(w14.val[1]));
+  d[2] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
+                      vreinterpret_u64_u32(w15.val[0]));
+  d[3] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
+                      vreinterpret_u64_u32(w15.val[1]));
+
+  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
+  w12 =
+      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
+  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
+                 vreinterpret_u16_u8(w11.val[1]));
+
+  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+                vreinterpret_u32_u16(w5.val[0]));
+  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+                vreinterpret_u32_u16(w5.val[1]));
+  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+                 vreinterpret_u32_u16(w13.val[0]));
+  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+                 vreinterpret_u32_u16(w13.val[1]));
+
+  // Store second 4-line result
+  d[4] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
+                      vreinterpret_u64_u32(w14.val[0]));
+  d[5] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
+                      vreinterpret_u64_u32(w14.val[1]));
+  d[6] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
+                      vreinterpret_u64_u32(w15.val[0]));
+  d[7] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
+                      vreinterpret_u64_u32(w15.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose8x16_16x8_neon(uint8x16_t *x,
+                                                     uint64x2_t *d) {
+  uint8x16x2_t w0, w1, w2, w3;
+  uint16x8x2_t w4, w5, w6, w7;
+  uint32x4x2_t w8, w9, w10, w11;
+
+  w0 = vzipq_u8(x[0], x[1]);
+  w1 = vzipq_u8(x[2], x[3]);
+  w2 = vzipq_u8(x[4], x[5]);
+  w3 = vzipq_u8(x[6], x[7]);
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                 vreinterpretq_u16_u8(w1.val[0]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+                 vreinterpretq_u16_u8(w3.val[0]));
+  w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                 vreinterpretq_u16_u8(w1.val[1]));
+  w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+                 vreinterpretq_u16_u8(w3.val[1]));
+
+  w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
+                 vreinterpretq_u32_u16(w7.val[0]));
+  w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                  vreinterpretq_u32_u16(w5.val[1]));
+  w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
+                  vreinterpretq_u32_u16(w7.val[1]));
+
+#if defined(__aarch64__)
+  d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]),
+                    vreinterpretq_u64_u32(w9.val[0]));
+  d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]),
+                    vreinterpretq_u64_u32(w9.val[0]));
+  d[2] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[1]),
+                    vreinterpretq_u64_u32(w9.val[1]));
+  d[3] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[1]),
+                    vreinterpretq_u64_u32(w9.val[1]));
+  d[4] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[0]),
+                    vreinterpretq_u64_u32(w11.val[0]));
+  d[5] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[0]),
+                    vreinterpretq_u64_u32(w11.val[0]));
+  d[6] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[1]),
+                    vreinterpretq_u64_u32(w11.val[1]));
+  d[7] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[1]),
+                    vreinterpretq_u64_u32(w11.val[1]));
+#else
+  d[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w8.val[0]), vget_low_u32(w9.val[0])));
+  d[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w8.val[0]), vget_high_u32(w9.val[0])));
+  d[2] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w8.val[1]), vget_low_u32(w9.val[1])));
+  d[3] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w8.val[1]), vget_high_u32(w9.val[1])));
+  d[4] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w10.val[0]), vget_low_u32(w11.val[0])));
+  d[5] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w10.val[0]), vget_high_u32(w11.val[0])));
+  d[6] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w10.val[1]), vget_low_u32(w11.val[1])));
+  d[7] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w10.val[1]), vget_high_u32(w11.val[1])));
+#endif
+}
+
+static AOM_FORCE_INLINE void transpose16x16_neon(uint8x16_t *x, uint64x2_t *d) {
+  uint8x16x2_t w0, w1, w2, w3, w4, w5, w6, w7;
+  uint16x8x2_t w8, w9, w10, w11;
+  uint32x4x2_t w12, w13, w14, w15;
+
+  w0 = vzipq_u8(x[0], x[1]);
+  w1 = vzipq_u8(x[2], x[3]);
+  w2 = vzipq_u8(x[4], x[5]);
+  w3 = vzipq_u8(x[6], x[7]);
+
+  w4 = vzipq_u8(x[8], x[9]);
+  w5 = vzipq_u8(x[10], x[11]);
+  w6 = vzipq_u8(x[12], x[13]);
+  w7 = vzipq_u8(x[14], x[15]);
+
+  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                 vreinterpretq_u16_u8(w1.val[0]));
+  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+                 vreinterpretq_u16_u8(w3.val[0]));
+  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
+                  vreinterpretq_u16_u8(w5.val[0]));
+  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
+                  vreinterpretq_u16_u8(w7.val[0]));
+
+  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+                  vreinterpretq_u32_u16(w9.val[0]));
+  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+                  vreinterpretq_u32_u16(w11.val[0]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+                  vreinterpretq_u32_u16(w9.val[1]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+                  vreinterpretq_u32_u16(w11.val[1]));
+
+#if defined(__aarch64__)
+  d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
+                    vreinterpretq_u64_u32(w13.val[0]));
+  d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
+                    vreinterpretq_u64_u32(w13.val[0]));
+  d[2] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
+                    vreinterpretq_u64_u32(w13.val[1]));
+  d[3] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
+                    vreinterpretq_u64_u32(w13.val[1]));
+  d[4] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
+                    vreinterpretq_u64_u32(w15.val[0]));
+  d[5] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
+                    vreinterpretq_u64_u32(w15.val[0]));
+  d[6] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
+                    vreinterpretq_u64_u32(w15.val[1]));
+  d[7] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
+                    vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
+  d[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
+  d[2] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
+  d[3] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
+  d[4] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
+  d[5] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
+  d[6] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
+  d[7] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
+#endif
+
+  // upper half
+  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                 vreinterpretq_u16_u8(w1.val[1]));
+  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+                 vreinterpretq_u16_u8(w3.val[1]));
+  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
+                  vreinterpretq_u16_u8(w5.val[1]));
+  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
+                  vreinterpretq_u16_u8(w7.val[1]));
+
+  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+                  vreinterpretq_u32_u16(w9.val[0]));
+  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+                  vreinterpretq_u32_u16(w11.val[0]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+                  vreinterpretq_u32_u16(w9.val[1]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+                  vreinterpretq_u32_u16(w11.val[1]));
+
+#if defined(__aarch64__)
+  d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
+                    vreinterpretq_u64_u32(w13.val[0]));
+  d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
+                    vreinterpretq_u64_u32(w13.val[0]));
+  d[10] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
+                     vreinterpretq_u64_u32(w13.val[1]));
+  d[11] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
+                     vreinterpretq_u64_u32(w13.val[1]));
+  d[12] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
+                     vreinterpretq_u64_u32(w15.val[0]));
+  d[13] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
+                     vreinterpretq_u64_u32(w15.val[0]));
+  d[14] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
+                     vreinterpretq_u64_u32(w15.val[1]));
+  d[15] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
+                     vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[8] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
+  d[9] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
+  d[10] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
+  d[11] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
+  d[12] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
+  d[13] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
+  d[14] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
+  d[15] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
+#endif
+}
+
+static AOM_FORCE_INLINE void transpose16x32_neon(uint8x16x2_t *x,
+                                                 uint64x2x2_t *d) {
+  uint8x16x2_t w0, w1, w2, w3, w8, w9, w10, w11;
+  uint16x8x2_t w4, w5, w12, w13;
+  uint32x4x2_t w6, w7, w14, w15;
+
+  w0 = vzipq_u8(x[0].val[0], x[1].val[0]);
+  w1 = vzipq_u8(x[2].val[0], x[3].val[0]);
+  w2 = vzipq_u8(x[4].val[0], x[5].val[0]);
+  w3 = vzipq_u8(x[6].val[0], x[7].val[0]);
+
+  w8 = vzipq_u8(x[8].val[0], x[9].val[0]);
+  w9 = vzipq_u8(x[10].val[0], x[11].val[0]);
+  w10 = vzipq_u8(x[12].val[0], x[13].val[0]);
+  w11 = vzipq_u8(x[14].val[0], x[15].val[0]);
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                 vreinterpretq_u16_u8(w1.val[0]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+                 vreinterpretq_u16_u8(w3.val[0]));
+  w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
+                  vreinterpretq_u16_u8(w9.val[0]));
+  w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
+                  vreinterpretq_u16_u8(w11.val[0]));
+
+  w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                 vreinterpretq_u32_u16(w5.val[1]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
+                  vreinterpretq_u32_u16(w13.val[0]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
+                  vreinterpretq_u32_u16(w13.val[1]));
+
+  // Store first 4-line result
+
+#if defined(__aarch64__)
+  d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[1].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[1].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[2].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                           vreinterpretq_u64_u32(w15.val[0]));
+  d[2].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                           vreinterpretq_u64_u32(w15.val[0]));
+  d[3].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                           vreinterpretq_u64_u32(w15.val[1]));
+  d[3].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                           vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[0].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
+  d[0].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
+  d[1].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
+  d[1].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
+  d[2].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
+  d[2].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
+  d[3].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
+  d[3].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
+#endif
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                 vreinterpretq_u16_u8(w1.val[1]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+                 vreinterpretq_u16_u8(w3.val[1]));
+  w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
+                  vreinterpretq_u16_u8(w9.val[1]));
+  w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
+                  vreinterpretq_u16_u8(w11.val[1]));
+
+  w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                 vreinterpretq_u32_u16(w5.val[1]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
+                  vreinterpretq_u32_u16(w13.val[0]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
+                  vreinterpretq_u32_u16(w13.val[1]));
+
+  // Store second 4-line result
+
+#if defined(__aarch64__)
+  d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[5].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[5].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[6].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                           vreinterpretq_u64_u32(w15.val[0]));
+  d[6].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                           vreinterpretq_u64_u32(w15.val[0]));
+  d[7].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                           vreinterpretq_u64_u32(w15.val[1]));
+  d[7].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                           vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[4].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
+  d[4].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
+  d[5].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
+  d[5].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
+  d[6].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
+  d[6].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
+  d[7].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
+  d[7].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
+#endif
+
+  // upper half
+  w0 = vzipq_u8(x[0].val[1], x[1].val[1]);
+  w1 = vzipq_u8(x[2].val[1], x[3].val[1]);
+  w2 = vzipq_u8(x[4].val[1], x[5].val[1]);
+  w3 = vzipq_u8(x[6].val[1], x[7].val[1]);
+
+  w8 = vzipq_u8(x[8].val[1], x[9].val[1]);
+  w9 = vzipq_u8(x[10].val[1], x[11].val[1]);
+  w10 = vzipq_u8(x[12].val[1], x[13].val[1]);
+  w11 = vzipq_u8(x[14].val[1], x[15].val[1]);
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                 vreinterpretq_u16_u8(w1.val[0]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+                 vreinterpretq_u16_u8(w3.val[0]));
+  w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
+                  vreinterpretq_u16_u8(w9.val[0]));
+  w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
+                  vreinterpretq_u16_u8(w11.val[0]));
+
+  w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                 vreinterpretq_u32_u16(w5.val[1]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
+                  vreinterpretq_u32_u16(w13.val[0]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
+                  vreinterpretq_u32_u16(w13.val[1]));
+
+  // Store first 4-line result
+
+#if defined(__aarch64__)
+  d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[9].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[9].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[10].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                            vreinterpretq_u64_u32(w15.val[0]));
+  d[10].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                            vreinterpretq_u64_u32(w15.val[0]));
+  d[11].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                            vreinterpretq_u64_u32(w15.val[1]));
+  d[11].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                            vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[8].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
+  d[8].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
+  d[9].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
+  d[9].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
+  d[10].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
+  d[10].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
+  d[11].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
+  d[11].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
+#endif
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                 vreinterpretq_u16_u8(w1.val[1]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+                 vreinterpretq_u16_u8(w3.val[1]));
+  w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
+                  vreinterpretq_u16_u8(w9.val[1]));
+  w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
+                  vreinterpretq_u16_u8(w11.val[1]));
+
+  w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                 vreinterpretq_u32_u16(w5.val[1]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
+                  vreinterpretq_u32_u16(w13.val[0]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
+                  vreinterpretq_u32_u16(w13.val[1]));
+
+  // Store second 4-line result
+
+#if defined(__aarch64__)
+  d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                            vreinterpretq_u64_u32(w14.val[0]));
+  d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                            vreinterpretq_u64_u32(w14.val[0]));
+  d[13].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                            vreinterpretq_u64_u32(w14.val[1]));
+  d[13].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                            vreinterpretq_u64_u32(w14.val[1]));
+  d[14].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                            vreinterpretq_u64_u32(w15.val[0]));
+  d[14].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                            vreinterpretq_u64_u32(w15.val[0]));
+  d[15].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                            vreinterpretq_u64_u32(w15.val[1]));
+  d[15].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                            vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[12].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
+  d[12].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
+  d[13].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
+  d[13].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
+  d[14].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
+  d[14].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
+  d[15].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
+  d[15].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
+#endif
+}
+
+static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
+                               uint8_t *dst, ptrdiff_t pitchDst) {
+  uint8x16_t r[16];
+  uint64x2_t d[16];
+  for (int i = 0; i < 16; i++) {
+    r[i] = vld1q_u8(src + i * pitchSrc);
+  }
+  transpose16x16_neon(r, d);
+  for (int i = 0; i < 16; i++) {
+    vst1q_u8(dst + i * pitchDst, vreinterpretq_u8_u64(d[i]));
+  }
+}
+
+static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
+                      ptrdiff_t pitchDst, int width, int height) {
+  for (int j = 0; j < height; j += 16) {
+    for (int i = 0; i < width; i += 16) {
+      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+                         dst + j * pitchDst + i, pitchDst);
+    }
+  }
+}
+
+static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t dstvec[4];
+  uint16x4x2_t dest;
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
+  transpose4x8_8x4_low_neon(dstvec, &dest);
+  vst1_lane_u32((uint32_t *)(dst + stride * 0),
+                vreinterpret_u32_u16(dest.val[0]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 1),
+                vreinterpret_u32_u16(dest.val[0]), 1);
+  vst1_lane_u32((uint32_t *)(dst + stride * 2),
+                vreinterpret_u32_u16(dest.val[1]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 3),
+                vreinterpret_u32_u16(dest.val[1]), 1);
+}
+
+static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t dstvec[8];
+  uint32x2x2_t d[4];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
+  transpose8x8_neon(dstvec, d);
+  vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
+  vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
+  vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
+  vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
+  vst1_u32((uint32_t *)(dst + 4 * stride), d[2].val[0]);
+  vst1_u32((uint32_t *)(dst + 5 * stride), d[2].val[1]);
+  vst1_u32((uint32_t *)(dst + 6 * stride), d[3].val[0]);
+  vst1_u32((uint32_t *)(dst + 7 * stride), d[3].val[1]);
+}
+
+static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t dstvec[4];
+  uint16x4x2_t d[2];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
+  transpose4x8_8x4_neon(dstvec, d);
+  vst1_lane_u32((uint32_t *)(dst + stride * 0),
+                vreinterpret_u32_u16(d[0].val[0]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 1),
+                vreinterpret_u32_u16(d[0].val[0]), 1);
+  vst1_lane_u32((uint32_t *)(dst + stride * 2),
+                vreinterpret_u32_u16(d[0].val[1]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 3),
+                vreinterpret_u32_u16(d[0].val[1]), 1);
+  vst1_lane_u32((uint32_t *)(dst + stride * 4),
+                vreinterpret_u32_u16(d[1].val[0]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 5),
+                vreinterpret_u32_u16(d[1].val[0]), 1);
+  vst1_lane_u32((uint32_t *)(dst + stride * 6),
+                vreinterpret_u32_u16(d[1].val[1]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 7),
+                vreinterpret_u32_u16(d[1].val[1]), 1);
+}
+
+static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t dstvec[8];
+  uint32x2x2_t d[2];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
+  transpose8x8_low_neon(dstvec, d);
+  vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
+  vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
+  vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
+  vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
+}
+
+static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x16_t dstvec[8];
+  uint64x2_t d[8];
+
+  dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
+  transpose8x16_16x8_neon(dstvec, d);
+  for (int i = 0; i < 8; i++) {
+    vst1_u8(dst + i * stride, vreinterpret_u8_u64(vget_low_u64(d[i])));
+    vst1_u8(dst + (i + 8) * stride, vreinterpret_u8_u64(vget_high_u64(d[i])));
+  }
+}
+
+static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t dstvec[16];
+  uint64x2_t d[8];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
+  transpose16x8_8x16_neon(dstvec, d);
+  for (int i = 0; i < 8; i++) {
+    vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
+  }
+}
+
+static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x16_t dstvec[4];
+  uint16x8x2_t d[2];
+
+  dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
+  transpose4x16_neon(dstvec, d);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 0),
+                 vreinterpretq_u32_u16(d[0].val[0]), 0);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 1),
+                 vreinterpretq_u32_u16(d[0].val[0]), 1);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 2),
+                 vreinterpretq_u32_u16(d[0].val[0]), 2);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 3),
+                 vreinterpretq_u32_u16(d[0].val[0]), 3);
+
+  vst1q_lane_u32((uint32_t *)(dst + stride * 4),
+                 vreinterpretq_u32_u16(d[0].val[1]), 0);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 5),
+                 vreinterpretq_u32_u16(d[0].val[1]), 1);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 6),
+                 vreinterpretq_u32_u16(d[0].val[1]), 2);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 7),
+                 vreinterpretq_u32_u16(d[0].val[1]), 3);
+
+  vst1q_lane_u32((uint32_t *)(dst + stride * 8),
+                 vreinterpretq_u32_u16(d[1].val[0]), 0);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 9),
+                 vreinterpretq_u32_u16(d[1].val[0]), 1);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 10),
+                 vreinterpretq_u32_u16(d[1].val[0]), 2);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 11),
+                 vreinterpretq_u32_u16(d[1].val[0]), 3);
+
+  vst1q_lane_u32((uint32_t *)(dst + stride * 12),
+                 vreinterpretq_u32_u16(d[1].val[1]), 0);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 13),
+                 vreinterpretq_u32_u16(d[1].val[1]), 1);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 14),
+                 vreinterpretq_u32_u16(d[1].val[1]), 2);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 15),
+                 vreinterpretq_u32_u16(d[1].val[1]), 3);
+}
+
+static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t dstvec[16];
+  uint64x2_t d[8];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
+  transpose16x8_8x16_neon(dstvec, d);
+  for (int i = 0; i < 4; i++) {
+    vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
+  }
+}
+
+static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x16x2_t dstvec[16];
+  uint64x2x2_t d[16];
+  uint8x16_t v_zero = vdupq_n_u8(0);
+
+  dr_prediction_z1_32xN_internal_neon(8, dstvec, left, upsample_left, dy);
+  for (int i = 8; i < 16; i++) {
+    dstvec[i].val[0] = v_zero;
+    dstvec[i].val[1] = v_zero;
+  }
+  transpose16x32_neon(dstvec, d);
+  for (int i = 0; i < 16; i++) {
+    vst1_u8(dst + 2 * i * stride,
+            vreinterpret_u8_u64(vget_low_u64(d[i].val[0])));
+    vst1_u8(dst + (2 * i + 1) * stride,
+            vreinterpret_u8_u64(vget_low_u64(d[i].val[1])));
+  }
+}
+
+static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t dstvec[32];
+  uint64x2_t d[16];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
+  transpose16x8_8x16_neon(dstvec, d);
+  transpose16x8_8x16_neon(dstvec + 16, d + 8);
+  for (int i = 0; i < 8; i++) {
+    vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
+    vst1q_u8(dst + i * stride + 16, vreinterpretq_u8_u64(d[i + 8]));
+  }
+}
+
+static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t dstvec[16];
+  uint64x2_t d[16];
+
+  dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
+  transpose16x16_neon(dstvec, d);
+  for (int i = 0; i < 16; i++) {
+    vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
+  }
+}
+
+static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16x2_t dstvec[32];
+  uint64x2x2_t d[32];
+
+  dr_prediction_z1_32xN_internal_neon(32, dstvec, left, upsample_left, dy);
+  transpose16x32_neon(dstvec, d);
+  transpose16x32_neon(dstvec + 16, d + 16);
+  for (int i = 0; i < 16; i++) {
+    vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
+    vst1q_u8(dst + 2 * i * stride + 16, vreinterpretq_u8_u64(d[i + 16].val[0]));
+    vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
+    vst1q_u8(dst + (2 * i + 1) * stride + 16,
+             vreinterpretq_u8_u64(d[i + 16].val[1]));
+  }
+}
+
+static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
+
+  dr_prediction_z1_64xN_neon(64, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16x2_t dstvec[16];
+  uint64x2x2_t d[16];
+
+  dr_prediction_z1_32xN_internal_neon(16, dstvec, left, upsample_left, dy);
+  transpose16x32_neon(dstvec, d);
+  for (int i = 0; i < 16; i++) {
+    vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
+    vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
+  }
+}
+
+static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t dstvec[32];
+  uint64x2_t d[16];
+
+  dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
+  for (int i = 0; i < 32; i += 16) {
+    transpose16x16_neon((dstvec + i), d);
+    for (int j = 0; j < 16; j++) {
+      vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
+    }
+  }
+}
+
+static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[64 * 32];
+
+  dr_prediction_z1_64xN_neon(32, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[32 * 64];
+
+  dr_prediction_z1_32xN_neon(64, dstT, 32, left, upsample_left, dy);
+  transpose(dstT, 32, dst, stride, 64, 32);
+}
+
+static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[64 * 16];
+
+  dr_prediction_z1_64xN_neon(16, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t dstvec[64];
+  uint64x2_t d[16];
+
+  dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
+  for (int i = 0; i < 64; i += 16) {
+    transpose16x16_neon((dstvec + i), d);
+    for (int j = 0; j < 16; j++) {
+      vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
+    }
+  }
+}
+
+void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  if (bw == bh) {
+    switch (bw) {
+      case 4:
+        dr_prediction_z3_4x4_neon(dst, stride, left, upsample_left, dy);
+        break;
+      case 8:
+        dr_prediction_z3_8x8_neon(dst, stride, left, upsample_left, dy);
+        break;
+      case 16:
+        dr_prediction_z3_16x16_neon(dst, stride, left, upsample_left, dy);
+        break;
+      case 32:
+        dr_prediction_z3_32x32_neon(dst, stride, left, upsample_left, dy);
+        break;
+      case 64:
+        dr_prediction_z3_64x64_neon(dst, stride, left, upsample_left, dy);
+        break;
+    }
+  } else {
+    if (bw < bh) {
+      if (bw + bw == bh) {
+        switch (bw) {
+          case 4:
+            dr_prediction_z3_4x8_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_8x16_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_16x32_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 32:
+            dr_prediction_z3_32x64_neon(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      } else {
+        switch (bw) {
+          case 4:
+            dr_prediction_z3_4x16_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_8x32_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_16x64_neon(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      }
+    } else {
+      if (bh + bh == bw) {
+        switch (bh) {
+          case 4:
+            dr_prediction_z3_8x4_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_16x8_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_32x16_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 32:
+            dr_prediction_z3_64x32_neon(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      } else {
+        switch (bh) {
+          case 4:
+            dr_prediction_z3_16x4_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_32x8_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_64x16_neon(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      }
+    }
+  }
+}
+static const int sm_weight_log2_scale = 8;
+
+// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
+#define MAX_BLOCK_DIM 64
+
+/* clang-format off */
+static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
+    // Unused, because we always offset by bs, which is at least 2.
+    0, 0,
+    // bs = 2
+    255, 128,
+    // bs = 4
+    255, 149, 85, 64,
+    // bs = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // bs = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // bs = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // bs = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
+};
+/* clang-format on */
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
+                                 int height, uint8x16_t *pixels) {
+  uint32x4_t zero = vdupq_n_u32(0);
+  const uint8x8_t d = vcreate_u8(((const uint32_t *)above)[0]);
+  if (height == 4)
+    pixels[1] =
+        vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero, 0));
+  else if (height == 8) {
+    pixels[1] = vreinterpretq_u8_u64(vsetq_lane_u64(
+        ((const uint64_t *)left)[0], vreinterpretq_u64_u32(zero), 0));
+  } else {
+    pixels[1] = vld1q_u8(left);
+  }
+
+  pixels[2] = vreinterpretq_u8_u16(vdupq_n_u16(above[3]));
+
+  const uint16x8_t bp = vdupq_n_u16(left[height - 1]);
+#if defined(__aarch64__)
+  pixels[0] = vreinterpretq_u8_u16(vzip1q_u16(vmovl_u8(d), bp));
+#else
+  pixels[0] = vreinterpretq_u8_u16(vzipq_u16(vmovl_u8(d), bp).val[0]);
+#endif  // (__aarch64__)
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
+                                  uint16x8_t *weight_h, uint16x8_t *weight_w) {
+  const uint16x8_t d = vdupq_n_u16((uint16_t)(1 << sm_weight_log2_scale));
+  const uint8x8_t t = vcreate_u8(((const uint32_t *)(weight_array))[1]);
+  weight_h[0] = vmovl_u8(t);
+  weight_h[1] = vsubw_u8(d, t);
+#if defined(__aarch64__)
+  weight_w[0] = vzip1q_u16(weight_h[0], weight_h[1]);
+#else
+  weight_w[0] = vzipq_u16(weight_h[0], weight_h[1]).val[0];
+#endif  // (__aarch64__)
+
+  if (height == 8) {
+    const uint8x8_t weight = vld1_u8(&weight_array[8]);
+    weight_h[0] = vmovl_u8(weight);
+    weight_h[1] = vsubw_u8(d, weight);
+  } else if (height == 16) {
+    const uint8x16_t zero = vdupq_n_u8(0);
+    const uint8x16_t weight = vld1q_u8(&weight_array[16]);
+    const uint8x16x2_t weight_h_02 = vzipq_u8(weight, zero);
+    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+    weight_h[1] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[0]));
+    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+    weight_h[3] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[1]));
+  }
+}
+
+static INLINE void smooth_pred_4xh(const uint8x16_t *pixel,
+                                   const uint16x8_t *wh, const uint16x8_t *ww,
+                                   int h, uint8_t *dst, ptrdiff_t stride,
+                                   int second_half) {
+  const uint16x4_t one = vdup_n_u16(1);
+  const uint16x4_t inc = vdup_n_u16(0x202);
+  uint16x4_t rep =
+      second_half ? vdup_n_u16((uint16_t)0x8008) : vdup_n_u16((uint16_t)0x8000);
+  uint16x4_t d = vdup_n_u16(0x100);
+  const uint16x4_t v_pixel_0_lo = vmovn_u32(vreinterpretq_u32_u8(pixel[0]));
+  const uint16x4_t v_pixel_0_hi =
+      vmovn_u32(vreinterpretq_u32_u8(vextq_u8(pixel[0], pixel[0], 2)));
+  const uint16x4_t v_pixel_2 = vget_low_u16(vreinterpretq_u16_u8(pixel[2]));
+  const uint16x4_t ww_0_lo = vmovn_u32(vreinterpretq_u32_u16(ww[0]));
+  const uint16x4_t ww_0_hi =
+      vmovn_u32(vreinterpretq_u32_u16(vextq_u16(ww[0], ww[0], 1)));
+  const uint8x8_t save_mask = vcreate_u8(0 + (2 << 8) + (4 << 16) + (6 << 24));
+
+#if !defined(__aarch64__)
+  const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
+                                   vget_high_u8(
+                                       vreinterpretq_u8_u16(wh[0])) } };
+  const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
+                                   vget_high_u8(
+                                       vreinterpretq_u8_u16(wh[1])) } };
+  const uint8x8x2_t v_split3 = { { vget_low_u8(pixel[1]),
+                                   vget_high_u8(pixel[1]) } };
+#endif  // (__aarch64__)
+
+  for (int i = 0; i < h; ++i) {
+#if defined(__aarch64__)
+    const uint8x8_t wg =
+        vqtbl1_u8(vreinterpretq_u8_u16(wh[0]), vreinterpret_u8_u16(d));
+    const uint8x8_t sc =
+        vqtbl1_u8(vreinterpretq_u8_u16(wh[1]), vreinterpret_u8_u16(d));
+#else
+    const uint8x8_t wg = vtbl2_u8(v_split1, vreinterpret_u8_u16(d));
+    const uint8x8_t sc = vtbl2_u8(v_split2, vreinterpret_u8_u16(d));
+#endif  // (__aarch64__)
+
+    uint32x4_t sum = vmull_u16(v_pixel_0_lo, vreinterpret_u16_u8(wg));
+    sum = vmlal_u16(sum, v_pixel_0_hi, vreinterpret_u16_u8(sc));
+
+#if defined(__aarch64__)
+    uint8x8_t b = vqtbl1_u8(pixel[1], vreinterpret_u8_u16(rep));
+#else
+    uint8x8_t b = vtbl2_u8(v_split3, vreinterpret_u8_u16(rep));
+#endif  // (__aarch64__)
+
+    sum = vmlal_u16(sum, vreinterpret_u16_u8(b), ww_0_lo);
+    sum = vmlal_u16(sum, v_pixel_2, ww_0_hi);
+    uint8x8_t sum_l = vreinterpret_u8_u16(vqrshrn_n_u32(sum, 9));
+    uint32x2_t predsh = vreinterpret_u32_u8(vtbl1_u8(sum_l, save_mask));
+    vst1_lane_u32((uint32_t *)dst, predsh, 0);
+
+    dst += stride;
+
+    rep = vadd_u16(rep, one);
+    d = vadd_u16(d, inc);
+  }
+}
+
+void aom_smooth_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[3];
+  load_pixel_w4(above, left, 4, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 4, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[3];
+  load_pixel_w4(above, left, 8, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 8, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[3];
+  load_pixel_w4(above, left, 16, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 16, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
+static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
+                                 int height, uint8x16_t *pixels) {
+  pixels[0] = vreinterpretq_u8_u16(vmovl_u8(vld1_u8(above)));
+  pixels[1] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)left[height - 1]));
+  pixels[3] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)above[7]));
+
+  if (height == 4) {
+    const uint32x4_t zero32 = vdupq_n_u32(0);
+    pixels[2] =
+        vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero32, 0));
+  } else if (height == 8) {
+    const uint64x2_t zero64 = vdupq_n_u64(0);
+    pixels[2] = vreinterpretq_u8_u64(
+        vsetq_lane_u64(((const uint64_t *)left)[0], zero64, 0));
+  } else if (height == 16) {
+    pixels[2] = vld1q_u8(left);
+  } else {
+    pixels[2] = vld1q_u8(left);
+    pixels[4] = pixels[0];
+    pixels[5] = pixels[1];
+    pixels[6] = vld1q_u8(left + 16);
+    pixels[7] = pixels[3];
+  }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
+                                  uint16x8_t *weight_h, uint16x8_t *weight_w) {
+  const uint8x16_t zero = vdupq_n_u8(0);
+  const int we_offset = height < 8 ? 4 : 8;
+  uint8x16_t we = vld1q_u8(&weight_array[we_offset]);
+#if defined(__aarch64__)
+  weight_h[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
+#else
+  weight_h[0] = vreinterpretq_u16_u8(vzipq_u8(we, zero).val[0]);
+#endif  // (__aarch64__)
+  const uint16x8_t d = vdupq_n_u16(256);
+  weight_h[1] = vsubq_u16(d, weight_h[0]);
+
+  if (height == 4) {
+    we = vextq_u8(we, zero, 4);
+#if defined(__aarch64__)
+    weight_w[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
+#else
+    weight_w[0] = vmovl_u8(vget_low_u8(we));
+#endif  // (__aarch64__)
+    weight_w[1] = vsubq_u16(d, weight_w[0]);
+  } else {
+    weight_w[0] = weight_h[0];
+    weight_w[1] = weight_h[1];
+  }
+
+  if (height == 16) {
+    we = vld1q_u8(&weight_array[16]);
+    const uint8x16x2_t weight_h_02 = vzipq_u8(we, zero);
+    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+    weight_h[1] = vsubq_u16(d, weight_h[0]);
+    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+    weight_h[3] = vsubq_u16(d, weight_h[2]);
+  } else if (height == 32) {
+    const uint8x16_t weight_lo = vld1q_u8(&weight_array[32]);
+    const uint8x16x2_t weight_h_02 = vzipq_u8(weight_lo, zero);
+    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+    weight_h[1] = vsubq_u16(d, weight_h[0]);
+    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+    weight_h[3] = vsubq_u16(d, weight_h[2]);
+    const uint8x16_t weight_hi = vld1q_u8(&weight_array[32 + 16]);
+    const uint8x16x2_t weight_h_46 = vzipq_u8(weight_hi, zero);
+    weight_h[4] = vreinterpretq_u16_u8(weight_h_46.val[0]);
+    weight_h[5] = vsubq_u16(d, weight_h[4]);
+    weight_h[6] = vreinterpretq_u16_u8(weight_h_46.val[1]);
+    weight_h[7] = vsubq_u16(d, weight_h[6]);
+  }
+}
+
+static INLINE void smooth_pred_8xh(const uint8x16_t *pixels,
+                                   const uint16x8_t *wh, const uint16x8_t *ww,
+                                   int h, uint8_t *dst, ptrdiff_t stride,
+                                   int second_half) {
+  const uint16x8_t one = vdupq_n_u16(1);
+  const uint16x8_t inc = vdupq_n_u16(0x202);
+  uint16x8_t rep = second_half ? vdupq_n_u16((uint16_t)0x8008)
+                               : vdupq_n_u16((uint16_t)0x8000);
+  uint16x8_t d = vdupq_n_u16(0x100);
+
+#if !defined(__aarch64__)
+  const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
+                                   vget_high_u8(
+                                       vreinterpretq_u8_u16(wh[0])) } };
+  const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
+                                   vget_high_u8(
+                                       vreinterpretq_u8_u16(wh[1])) } };
+  const uint8x8x2_t v_split3 = { { vget_low_u8(pixels[2]),
+                                   vget_high_u8(pixels[2]) } };
+#endif
+
+  for (int i = 0; i < h; ++i) {
+#if defined(__aarch64__)
+    const uint8x16_t wg_wg =
+        vqtbl1q_u8(vreinterpretq_u8_u16(wh[0]), vreinterpretq_u8_u16(d));
+    const uint8x16_t sc_sc =
+        vqtbl1q_u8(vreinterpretq_u8_u16(wh[1]), vreinterpretq_u8_u16(d));
+#else
+    const uint8x8_t v_d_lo = vreinterpret_u8_u16(vget_low_u16(d));
+    const uint8x8_t v_d_hi = vreinterpret_u8_u16(vget_high_u16(d));
+    const uint8x16_t wg_wg =
+        vcombine_u8(vtbl2_u8(v_split1, v_d_lo), vtbl2_u8(v_split1, v_d_hi));
+    const uint8x16_t sc_sc =
+        vcombine_u8(vtbl2_u8(v_split2, v_d_lo), vtbl2_u8(v_split2, v_d_hi));
+#endif  // (__aarch64__)
+    uint16x8_t s01 =
+        vmulq_u16(vreinterpretq_u16_u8(pixels[0]), vreinterpretq_u16_u8(wg_wg));
+    s01 = vmlaq_u16(s01, vreinterpretq_u16_u8(pixels[1]),
+                    vreinterpretq_u16_u8(sc_sc));
+#if defined(__aarch64__)
+    const uint8x16_t b = vqtbl1q_u8(pixels[2], vreinterpretq_u8_u16(rep));
+#else
+    const uint8x16_t b = vcombine_u8(
+        vtbl2_u8(v_split3, vget_low_u8(vreinterpretq_u8_u16(rep))),
+        vtbl2_u8(v_split3, vget_high_u8(vreinterpretq_u8_u16(rep))));
+#endif  // (__aarch64__)
+    uint16x8_t sum0 = vmulq_u16(vreinterpretq_u16_u8(b), ww[0]);
+    sum0 = vmlaq_u16(sum0, vreinterpretq_u16_u8(pixels[3]), ww[1]);
+
+    uint32x4_t s0 = vaddl_u16(vget_low_u16(s01), vget_low_u16(sum0));
+#if defined(__aarch64__)
+    uint32x4_t s1 = vaddl_high_u16(s01, sum0);
+#else
+    uint32x4_t s1 = vaddl_u16(vget_high_u16(s01), vget_high_u16(sum0));
+#endif  // (__aarch64__)
+
+    sum0 = vcombine_u16(vqrshrn_n_u32(s0, 9), vqrshrn_n_u32(s1, 9));
+    uint8x8_t predsh = vqmovn_u16(sum0);
+    vst1_u8(dst, predsh);
+
+    dst += stride;
+    rep = vaddq_u16(rep, one);
+    d = vaddq_u16(d, inc);
+  }
+}
+
+void aom_smooth_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[4];
+  load_pixel_w8(above, left, 4, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w8(sm_weight_arrays, 4, wh, ww);
+
+  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[4];
+  load_pixel_w8(above, left, 8, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w8(sm_weight_arrays, 8, wh, ww);
+
+  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[4];
+  load_pixel_w8(above, left, 16, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w8(sm_weight_arrays, 16, wh, ww);
+
+  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+void aom_smooth_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[8];
+  load_pixel_w8(above, left, 32, pixels);
+
+  uint16x8_t wh[8], ww[2];
+  load_weight_w8(sm_weight_arrays, 32, wh, ww);
+
+  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left, uint32_t bw,
+                                        uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+  const uint16x8_t scale_value = vdupq_n_u16(256);
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const uint8x8_t left_y = vdup_n_u8(left[y]);
+    const uint8x8_t weights_y_dup = vdup_n_u8(sm_weights_h[y]);
+    const uint32x4_t pred_scaled_bl =
+        vdupq_n_u32(256 + (256 - sm_weights_h[y]) * left[bh - 1]);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const uint8x8_t weights_x = vld1_u8(sm_weights_w + x);
+      const uint8x8_t top_x = vld1_u8(above + x);
+
+      uint16x8_t pred_m1, pred_m2;
+      uint32x4_t pred_lo, pred_hi;
+      pred_m1 = vmull_u8(top_x, weights_y_dup);
+      pred_m2 = vmull_u8(weights_x, left_y);
+
+      pred_lo = vaddl_u16(vget_low_u16(pred_m1), vget_low_u16(pred_m2));
+#if defined(__aarch64__)
+      pred_hi = vaddl_high_u16(pred_m1, pred_m2);
+#else
+      pred_hi = vaddl_u16(vget_high_u16(pred_m1), vget_high_u16(pred_m2));
+#endif  // (__aarch64__)
+
+      const uint16x8_t scale_m_weights_x = vsubw_u8(scale_value, weights_x);
+
+      const uint16x8_t swxtr = vmulq_n_u16(scale_m_weights_x, above[bw - 1]);
+
+      pred_lo = vaddq_u32(pred_lo, pred_scaled_bl);
+      pred_hi = vaddq_u32(pred_hi, pred_scaled_bl);
+
+      pred_lo = vaddw_u16(pred_lo, vget_low_u16(swxtr));
+#if defined(__aarch64__)
+      pred_hi = vaddw_high_u16(pred_hi, swxtr);
+#else
+      pred_hi = vaddw_u16(pred_hi, vget_high_u16(swxtr));
+#endif  // (__aarch64__)
+
+      uint16x8_t pred =
+          vcombine_u16(vshrn_n_u32(pred_lo, 9), vshrn_n_u32(pred_hi, 9));
+
+      uint8x8_t predsh = vqmovn_u16(pred);
+
+      vst1_u8(dst + x, predsh);
+    }
+
+    dst += stride;
+  }
+}
+
+void aom_smooth_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index 606950a..0e633b5 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -224,3 +224,368 @@
   res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
   res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
 }
+
+static INLINE unsigned int horizontal_add_16x4(const uint16x4_t vec_16x4) {
+  const uint32x2_t a = vpaddl_u16(vec_16x4);
+  const uint64x1_t b = vpaddl_u32(a);
+  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+}
+
+static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
+  const uint32x4_t a = vpaddlq_u16(vec_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+static void sad_row4_neon(uint16x4_t *vec_src, const uint8x8_t q0,
+                          const uint8x8_t ref) {
+  uint8x8_t q2 = vabd_u8(q0, ref);
+  *vec_src = vpadal_u8(*vec_src, q2);
+}
+
+static void sad_row8_neon(uint16x4_t *vec_src, const uint8x8_t *q0,
+                          const uint8_t *ref_ptr) {
+  uint8x8_t q1 = vld1_u8(ref_ptr);
+  uint8x8_t q2 = vabd_u8(*q0, q1);
+  *vec_src = vpadal_u8(*vec_src, q2);
+}
+
+static void sad_row16_neon(uint16x8_t *vec_src, const uint8x16_t *q0,
+                           const uint8_t *ref_ptr) {
+  uint8x16_t q1 = vld1q_u8(ref_ptr);
+  uint8x16_t q2 = vabdq_u8(*q0, q1);
+  *vec_src = vpadalq_u8(*vec_src, q2);
+}
+
+void aom_sadMxNx4d_neon(int width, int height, const uint8_t *src,
+                        int src_stride, const uint8_t *const ref[4],
+                        int ref_stride, uint32_t res[4]) {
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  res[0] = 0;
+  res[1] = 0;
+  res[2] = 0;
+  res[3] = 0;
+
+  switch (width) {
+    case 4: {
+      uint32_t src4, ref40, ref41, ref42, ref43;
+      uint32x2_t q8 = vdup_n_u32(0);
+      uint32x2_t q4 = vdup_n_u32(0);
+      uint32x2_t q5 = vdup_n_u32(0);
+      uint32x2_t q6 = vdup_n_u32(0);
+      uint32x2_t q7 = vdup_n_u32(0);
+
+      for (int i = 0; i < height / 2; i++) {
+        uint16x4_t q0 = vdup_n_u16(0);
+        uint16x4_t q1 = vdup_n_u16(0);
+        uint16x4_t q2 = vdup_n_u16(0);
+        uint16x4_t q3 = vdup_n_u16(0);
+
+        memcpy(&src4, src, 4);
+        memcpy(&ref40, ref0, 4);
+        memcpy(&ref41, ref1, 4);
+        memcpy(&ref42, ref2, 4);
+        memcpy(&ref43, ref3, 4);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        q8 = vset_lane_u32(src4, q8, 0);
+        q4 = vset_lane_u32(ref40, q4, 0);
+        q5 = vset_lane_u32(ref41, q5, 0);
+        q6 = vset_lane_u32(ref42, q6, 0);
+        q7 = vset_lane_u32(ref43, q7, 0);
+
+        memcpy(&src4, src, 4);
+        memcpy(&ref40, ref0, 4);
+        memcpy(&ref41, ref1, 4);
+        memcpy(&ref42, ref2, 4);
+        memcpy(&ref43, ref3, 4);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        q8 = vset_lane_u32(src4, q8, 1);
+        q4 = vset_lane_u32(ref40, q4, 1);
+        q5 = vset_lane_u32(ref41, q5, 1);
+        q6 = vset_lane_u32(ref42, q6, 1);
+        q7 = vset_lane_u32(ref43, q7, 1);
+
+        sad_row4_neon(&q0, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q4));
+        sad_row4_neon(&q1, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q5));
+        sad_row4_neon(&q2, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q6));
+        sad_row4_neon(&q3, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q7));
+
+        res[0] += horizontal_add_16x4(q0);
+        res[1] += horizontal_add_16x4(q1);
+        res[2] += horizontal_add_16x4(q2);
+        res[3] += horizontal_add_16x4(q3);
+      }
+      break;
+    }
+    case 8: {
+      for (int i = 0; i < height; i++) {
+        uint16x4_t q0 = vdup_n_u16(0);
+        uint16x4_t q1 = vdup_n_u16(0);
+        uint16x4_t q2 = vdup_n_u16(0);
+        uint16x4_t q3 = vdup_n_u16(0);
+
+        uint8x8_t q5 = vld1_u8(src);
+
+        sad_row8_neon(&q0, &q5, ref0);
+        sad_row8_neon(&q1, &q5, ref1);
+        sad_row8_neon(&q2, &q5, ref2);
+        sad_row8_neon(&q3, &q5, ref3);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x4(q0);
+        res[1] += horizontal_add_16x4(q1);
+        res[2] += horizontal_add_16x4(q2);
+        res[3] += horizontal_add_16x4(q3);
+      }
+      break;
+    }
+    case 16: {
+      for (int i = 0; i < height; i++) {
+        uint16x8_t q0 = vdupq_n_u16(0);
+        uint16x8_t q1 = vdupq_n_u16(0);
+        uint16x8_t q2 = vdupq_n_u16(0);
+        uint16x8_t q3 = vdupq_n_u16(0);
+
+        uint8x16_t q4 = vld1q_u8(src);
+
+        sad_row16_neon(&q0, &q4, ref0);
+        sad_row16_neon(&q1, &q4, ref1);
+        sad_row16_neon(&q2, &q4, ref2);
+        sad_row16_neon(&q3, &q4, ref3);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x8(q0);
+        res[1] += horizontal_add_16x8(q1);
+        res[2] += horizontal_add_16x8(q2);
+        res[3] += horizontal_add_16x8(q3);
+      }
+      break;
+    }
+    case 32: {
+      for (int i = 0; i < height; i++) {
+        uint16x8_t q0 = vdupq_n_u16(0);
+        uint16x8_t q1 = vdupq_n_u16(0);
+        uint16x8_t q2 = vdupq_n_u16(0);
+        uint16x8_t q3 = vdupq_n_u16(0);
+
+        uint8x16_t q4 = vld1q_u8(src);
+
+        sad_row16_neon(&q0, &q4, ref0);
+        sad_row16_neon(&q1, &q4, ref1);
+        sad_row16_neon(&q2, &q4, ref2);
+        sad_row16_neon(&q3, &q4, ref3);
+
+        q4 = vld1q_u8(src + 16);
+
+        sad_row16_neon(&q0, &q4, ref0 + 16);
+        sad_row16_neon(&q1, &q4, ref1 + 16);
+        sad_row16_neon(&q2, &q4, ref2 + 16);
+        sad_row16_neon(&q3, &q4, ref3 + 16);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x8(q0);
+        res[1] += horizontal_add_16x8(q1);
+        res[2] += horizontal_add_16x8(q2);
+        res[3] += horizontal_add_16x8(q3);
+      }
+      break;
+    }
+    case 64: {
+      for (int i = 0; i < height; i++) {
+        uint16x8_t q0 = vdupq_n_u16(0);
+        uint16x8_t q1 = vdupq_n_u16(0);
+        uint16x8_t q2 = vdupq_n_u16(0);
+        uint16x8_t q3 = vdupq_n_u16(0);
+
+        uint8x16_t q4 = vld1q_u8(src);
+
+        sad_row16_neon(&q0, &q4, ref0);
+        sad_row16_neon(&q1, &q4, ref1);
+        sad_row16_neon(&q2, &q4, ref2);
+        sad_row16_neon(&q3, &q4, ref3);
+
+        q4 = vld1q_u8(src + 16);
+
+        sad_row16_neon(&q0, &q4, ref0 + 16);
+        sad_row16_neon(&q1, &q4, ref1 + 16);
+        sad_row16_neon(&q2, &q4, ref2 + 16);
+        sad_row16_neon(&q3, &q4, ref3 + 16);
+
+        q4 = vld1q_u8(src + 32);
+
+        sad_row16_neon(&q0, &q4, ref0 + 32);
+        sad_row16_neon(&q1, &q4, ref1 + 32);
+        sad_row16_neon(&q2, &q4, ref2 + 32);
+        sad_row16_neon(&q3, &q4, ref3 + 32);
+
+        q4 = vld1q_u8(src + 48);
+
+        sad_row16_neon(&q0, &q4, ref0 + 48);
+        sad_row16_neon(&q1, &q4, ref1 + 48);
+        sad_row16_neon(&q2, &q4, ref2 + 48);
+        sad_row16_neon(&q3, &q4, ref3 + 48);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x8(q0);
+        res[1] += horizontal_add_16x8(q1);
+        res[2] += horizontal_add_16x8(q2);
+        res[3] += horizontal_add_16x8(q3);
+      }
+      break;
+    }
+    case 128: {
+      for (int i = 0; i < height; i++) {
+        uint16x8_t q0 = vdupq_n_u16(0);
+        uint16x8_t q1 = vdupq_n_u16(0);
+        uint16x8_t q2 = vdupq_n_u16(0);
+        uint16x8_t q3 = vdupq_n_u16(0);
+
+        uint8x16_t q4 = vld1q_u8(src);
+
+        sad_row16_neon(&q0, &q4, ref0);
+        sad_row16_neon(&q1, &q4, ref1);
+        sad_row16_neon(&q2, &q4, ref2);
+        sad_row16_neon(&q3, &q4, ref3);
+
+        q4 = vld1q_u8(src + 16);
+
+        sad_row16_neon(&q0, &q4, ref0 + 16);
+        sad_row16_neon(&q1, &q4, ref1 + 16);
+        sad_row16_neon(&q2, &q4, ref2 + 16);
+        sad_row16_neon(&q3, &q4, ref3 + 16);
+
+        q4 = vld1q_u8(src + 32);
+
+        sad_row16_neon(&q0, &q4, ref0 + 32);
+        sad_row16_neon(&q1, &q4, ref1 + 32);
+        sad_row16_neon(&q2, &q4, ref2 + 32);
+        sad_row16_neon(&q3, &q4, ref3 + 32);
+
+        q4 = vld1q_u8(src + 48);
+
+        sad_row16_neon(&q0, &q4, ref0 + 48);
+        sad_row16_neon(&q1, &q4, ref1 + 48);
+        sad_row16_neon(&q2, &q4, ref2 + 48);
+        sad_row16_neon(&q3, &q4, ref3 + 48);
+
+        q4 = vld1q_u8(src + 64);
+
+        sad_row16_neon(&q0, &q4, ref0 + 64);
+        sad_row16_neon(&q1, &q4, ref1 + 64);
+        sad_row16_neon(&q2, &q4, ref2 + 64);
+        sad_row16_neon(&q3, &q4, ref3 + 64);
+
+        q4 = vld1q_u8(src + 80);
+
+        sad_row16_neon(&q0, &q4, ref0 + 80);
+        sad_row16_neon(&q1, &q4, ref1 + 80);
+        sad_row16_neon(&q2, &q4, ref2 + 80);
+        sad_row16_neon(&q3, &q4, ref3 + 80);
+
+        q4 = vld1q_u8(src + 96);
+
+        sad_row16_neon(&q0, &q4, ref0 + 96);
+        sad_row16_neon(&q1, &q4, ref1 + 96);
+        sad_row16_neon(&q2, &q4, ref2 + 96);
+        sad_row16_neon(&q3, &q4, ref3 + 96);
+
+        q4 = vld1q_u8(src + 112);
+
+        sad_row16_neon(&q0, &q4, ref0 + 112);
+        sad_row16_neon(&q1, &q4, ref1 + 112);
+        sad_row16_neon(&q2, &q4, ref2 + 112);
+        sad_row16_neon(&q3, &q4, ref3 + 112);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x8(q0);
+        res[1] += horizontal_add_16x8(q1);
+        res[2] += horizontal_add_16x8(q2);
+        res[3] += horizontal_add_16x8(q3);
+      }
+    }
+  }
+}
+
+#define sad_skip_MxN_neon(m, n)                                             \
+  void aom_sad_skip_##m##x##n##x4d_neon(const uint8_t *src, int src_stride, \
+                                        const uint8_t *const ref[4],        \
+                                        int ref_stride, uint32_t res[4]) {  \
+    aom_sadMxNx4d_neon(m, ((n) >> 1), src, 2 * src_stride, ref,             \
+                       2 * ref_stride, res);                                \
+    res[0] <<= 1;                                                           \
+    res[1] <<= 1;                                                           \
+    res[2] <<= 1;                                                           \
+    res[3] <<= 1;                                                           \
+  }
+
+sad_skip_MxN_neon(4, 8);
+sad_skip_MxN_neon(4, 16);
+sad_skip_MxN_neon(4, 32);
+
+sad_skip_MxN_neon(8, 8);
+sad_skip_MxN_neon(8, 16);
+sad_skip_MxN_neon(8, 32);
+
+sad_skip_MxN_neon(16, 8);
+sad_skip_MxN_neon(16, 16);
+sad_skip_MxN_neon(16, 32);
+sad_skip_MxN_neon(16, 64);
+
+sad_skip_MxN_neon(32, 8);
+sad_skip_MxN_neon(32, 16);
+sad_skip_MxN_neon(32, 32);
+sad_skip_MxN_neon(32, 64);
+
+sad_skip_MxN_neon(64, 16);
+sad_skip_MxN_neon(64, 32);
+sad_skip_MxN_neon(64, 64);
+sad_skip_MxN_neon(64, 128);
+
+sad_skip_MxN_neon(128, 64);
+sad_skip_MxN_neon(128, 128);
+#undef sad_skip_MxN_neon
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index a39de91..32b9e9d 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -10,13 +10,12 @@
  */
 
 #include <arm_neon.h>
-
 #include "config/aom_config.h"
-
+#include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
 
-unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
   uint8x8_t d0, d8;
   uint16x8_t q12;
   uint32x4_t q1;
@@ -46,8 +45,8 @@
   return vget_lane_u32(d5, 0);
 }
 
-unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
-                             unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
   uint8x8_t d0, d8;
   uint16x8_t q12;
   uint32x2_t d1;
@@ -74,8 +73,8 @@
   return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
 }
 
-unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad16x8_neon(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
   uint8x16_t q0, q4;
   uint16x8_t q12, q13;
   uint32x4_t q1;
@@ -164,6 +163,77 @@
   return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
 }
 
+unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
+                                 const uint8_t *ref, int ref_stride) {
+  uint16x8_t vec_accum_lo, vec_accum_hi;
+  uint32x4_t vec_accum_32lo = vdupq_n_u32(0);
+  uint32x4_t vec_accum_32hi = vdupq_n_u32(0);
+  uint16x8_t tmp;
+  for (int i = 0; i < 128; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+    const uint8x16_t vec_src_64 = vld1q_u8(src + 64);
+    const uint8x16_t vec_src_80 = vld1q_u8(src + 80);
+    const uint8x16_t vec_src_96 = vld1q_u8(src + 96);
+    const uint8x16_t vec_src_112 = vld1q_u8(src + 112);
+    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+    const uint8x16_t vec_ref_64 = vld1q_u8(ref + 64);
+    const uint8x16_t vec_ref_80 = vld1q_u8(ref + 80);
+    const uint8x16_t vec_ref_96 = vld1q_u8(ref + 96);
+    const uint8x16_t vec_ref_112 = vld1q_u8(ref + 112);
+    src += src_stride;
+    ref += ref_stride;
+    vec_accum_lo = vdupq_n_u16(0);
+    vec_accum_hi = vdupq_n_u16(0);
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+                            vget_low_u8(vec_ref_00));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+                            vget_high_u8(vec_ref_00));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+                            vget_low_u8(vec_ref_16));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+                            vget_high_u8(vec_ref_16));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
+                            vget_low_u8(vec_ref_32));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
+                            vget_high_u8(vec_ref_32));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
+                            vget_low_u8(vec_ref_48));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
+                            vget_high_u8(vec_ref_48));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_64),
+                            vget_low_u8(vec_ref_64));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_64),
+                            vget_high_u8(vec_ref_64));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_80),
+                            vget_low_u8(vec_ref_80));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_80),
+                            vget_high_u8(vec_ref_80));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_96),
+                            vget_low_u8(vec_ref_96));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_96),
+                            vget_high_u8(vec_ref_96));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_112),
+                            vget_low_u8(vec_ref_112));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_112),
+                            vget_high_u8(vec_ref_112));
+
+    tmp = vaddq_u16(vec_accum_lo, vec_accum_hi);
+    vec_accum_32lo = vaddw_u16(vec_accum_32lo, vget_low_u16(tmp));
+    vec_accum_32hi = vaddw_u16(vec_accum_32hi, vget_high_u16(tmp));
+  }
+  const uint32x4_t a = vaddq_u32(vec_accum_32lo, vec_accum_32hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
 unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
@@ -222,3 +292,266 @@
   }
   return horizontal_add_16x8(vec_accum);
 }
+
+static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 32);
+    q1 = vld1q_u8(ref_ptr + 32);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 48);
+    q1 = vld1q_u8(ref_ptr + 48);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 64);
+    q1 = vld1q_u8(ref_ptr + 64);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 80);
+    q1 = vld1q_u8(ref_ptr + 80);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 96);
+    q1 = vld1q_u8(ref_ptr + 96);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 112);
+    q1 = vld1q_u8(ref_ptr + 112);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    sum += horizontal_add_16x8(q3);
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 32);
+    q1 = vld1q_u8(ref_ptr + 32);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 48);
+    q1 = vld1q_u8(ref_ptr + 48);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    sum += horizontal_add_16x8(q3);
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    sum += horizontal_add_16x8(q3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint8x8_t q0 = vld1_u8(src_ptr);
+    uint8x8_t q1 = vld1_u8(ref_ptr);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
+    q0 = vld1_u8(src_ptr + 8);
+    q1 = vld1_u8(ref_ptr + 8);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t q3 = vdupq_n_u16(0);
+  for (int y = 0; y < h; y++) {
+    uint8x8_t q0 = vld1_u8(src_ptr);
+    uint8x8_t q1 = vld1_u8(ref_ptr);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q3 = vabal_u8(q3, q0, q1);
+  }
+  return horizontal_add_16x8(q3);
+}
+
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t q3 = vdupq_n_u16(0);
+  uint32x2_t q0 = vdup_n_u32(0);
+  uint32x2_t q1 = vdup_n_u32(0);
+  uint32_t src4, ref4;
+  for (int y = 0; y < h / 2; y++) {
+    memcpy(&src4, src_ptr, 4);
+    memcpy(&ref4, ref_ptr, 4);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q0 = vset_lane_u32(src4, q0, 0);
+    q1 = vset_lane_u32(ref4, q1, 0);
+
+    memcpy(&src4, src_ptr, 4);
+    memcpy(&ref4, ref_ptr, 4);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q0 = vset_lane_u32(src4, q0, 1);
+    q1 = vset_lane_u32(ref4, q1, 1);
+
+    q3 = vabal_u8(q3, vreinterpret_u8_u32(q0), vreinterpret_u8_u32(q1));
+  }
+  return horizontal_add_16x8(q3);
+}
+
+#define FSADS128_H(h)                                                    \
+  unsigned int aom_sad_skip_128x##h##_neon(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
+      int ref_stride) {                                                  \
+    const uint32_t sum = sad128xh_neon(src_ptr, 2 * src_stride, ref_ptr, \
+                                       2 * ref_stride, h / 2);           \
+    return 2 * sum;                                                      \
+  }
+FSADS128_H(128);
+FSADS128_H(64);
+#undef FSADS128_H
+
+#define FSADS64_H(h)                                                          \
+  unsigned int aom_sad_skip_64x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad64xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
+
+FSADS64_H(128);
+FSADS64_H(64);
+FSADS64_H(32);
+FSADS64_H(16);
+#undef FSADS64_H
+
+#define FSADS32_H(h)                                                          \
+  unsigned int aom_sad_skip_32x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad32xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
+
+FSADS32_H(64);
+FSADS32_H(32);
+FSADS32_H(16);
+FSADS32_H(8);
+#undef FSADS32_H
+
+#define FSADS16_H(h)                                                          \
+  unsigned int aom_sad_skip_16x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad16xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
+
+FSADS16_H(64);
+FSADS16_H(32);
+FSADS16_H(16);
+FSADS16_H(8);
+#undef FSADS16_H
+
+#define FSADS8_H(h)                                                          \
+  unsigned int aom_sad_skip_8x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad8xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                           h / 2);                                           \
+  }
+
+FSADS8_H(32);
+FSADS8_H(16);
+FSADS8_H(8);
+#undef FSADS8_H
+
+#define FSADS4_H(h)                                                          \
+  unsigned int aom_sad_skip_4x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad4xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                           h / 2);                                           \
+  }
+
+FSADS4_H(16);
+FSADS4_H(8);
+#undef FSADS4_H
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index 06b81cc..1f73443 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -9,217 +9,176 @@
  */
 
 #include <arm_neon.h>
-#include "config/aom_config.h"
+
 #include "config/aom_dsp_rtcd.h"
-
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
 
-static INLINE uint32_t sse_W16x1_neon(uint8x16_t q2, uint8x16_t q3) {
-  const uint16_t sse1 = 0;
-  const uint16x8_t q1 = vld1q_dup_u16(&sse1);
-
-  uint32_t sse;
-
-  uint8x16_t q4 = vabdq_u8(q2, q3);  // diff = abs(a[x] - b[x])
-  uint8x8_t d0 = vget_low_u8(q4);
-  uint8x8_t d1 = vget_high_u8(q4);
-
-  uint16x8_t q6 = vmlal_u8(q1, d0, d0);
-  uint16x8_t q7 = vmlal_u8(q1, d1, d1);
-
-  uint32x4_t q8 = vaddl_u16(vget_low_u16(q6), vget_high_u16(q6));
-  uint32x4_t q9 = vaddl_u16(vget_low_u16(q7), vget_high_u16(q7));
-
-  uint32x2_t d4 = vadd_u32(vget_low_u32(q8), vget_high_u32(q8));
-  uint32x2_t d5 = vadd_u32(vget_low_u32(q9), vget_high_u32(q9));
-
-  uint32x2_t d6 = vadd_u32(d4, d5);
-
-  sse = vget_lane_u32(d6, 0);
-  sse += vget_lane_u32(d6, 1);
-
-  return sse;
+static INLINE void sse_w16_neon(uint32x4_t *sum, const uint8_t *a,
+                                const uint8_t *b) {
+  const uint8x16_t v_a0 = vld1q_u8(a);
+  const uint8x16_t v_b0 = vld1q_u8(b);
+  const uint8x16_t diff = vabdq_u8(v_a0, v_b0);
+  const uint8x8_t diff_lo = vget_low_u8(diff);
+  const uint8x8_t diff_hi = vget_high_u8(diff);
+  *sum = vpadalq_u16(*sum, vmull_u8(diff_lo, diff_lo));
+  *sum = vpadalq_u16(*sum, vmull_u8(diff_hi, diff_hi));
 }
-
+static INLINE void aom_sse4x2_neon(const uint8_t *a, int a_stride,
+                                   const uint8_t *b, int b_stride,
+                                   uint32x4_t *sum) {
+  uint8x8_t v_a0, v_b0;
+  v_a0 = v_b0 = vcreate_u8(0);
+  // above line is only to shadow [-Werror=uninitialized]
+  v_a0 = vreinterpret_u8_u32(
+      vld1_lane_u32((uint32_t *)a, vreinterpret_u32_u8(v_a0), 0));
+  v_a0 = vreinterpret_u8_u32(
+      vld1_lane_u32((uint32_t *)(a + a_stride), vreinterpret_u32_u8(v_a0), 1));
+  v_b0 = vreinterpret_u8_u32(
+      vld1_lane_u32((uint32_t *)b, vreinterpret_u32_u8(v_b0), 0));
+  v_b0 = vreinterpret_u8_u32(
+      vld1_lane_u32((uint32_t *)(b + b_stride), vreinterpret_u32_u8(v_b0), 1));
+  const uint8x8_t v_a_w = vabd_u8(v_a0, v_b0);
+  *sum = vpadalq_u16(*sum, vmull_u8(v_a_w, v_a_w));
+}
+static INLINE void aom_sse8_neon(const uint8_t *a, const uint8_t *b,
+                                 uint32x4_t *sum) {
+  const uint8x8_t v_a_w = vld1_u8(a);
+  const uint8x8_t v_b_w = vld1_u8(b);
+  const uint8x8_t v_d_w = vabd_u8(v_a_w, v_b_w);
+  *sum = vpadalq_u16(*sum, vmull_u8(v_d_w, v_d_w));
+}
 int64_t aom_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                      int b_stride, int width, int height) {
-  const uint8x16_t q0 = {
-    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-  };
-  int addinc, x, y;
-  uint8x8_t d0, d1, d2, d3;
-  uint8_t dx;
-  uint8x16_t q2, q3, q4, q5;
-  uint32_t sse = 0;
-  uint8x8x2_t tmp, tmp2;
-
+  int y = 0;
+  int64_t sse = 0;
+  uint32x4_t sum = vdupq_n_u32(0);
   switch (width) {
     case 4:
-      for (y = 0; y < height; y += 4) {
-        d0 = vld1_u8(a);  // load 4 data
-        a += a_stride;
-        d1 = vld1_u8(a);
-        a += a_stride;
-        d2 = vld1_u8(a);
-        a += a_stride;
-        d3 = vld1_u8(a);
-        a += a_stride;
-        tmp = vzip_u8(d0, d1);
-        tmp2 = vzip_u8(d2, d3);
-        q2 = vcombine_u8(tmp.val[0], tmp2.val[0]);  // make a 16 data vector
-
-        d0 = vld1_u8(b);
-        b += b_stride;
-        d1 = vld1_u8(b);
-        b += b_stride;
-        d2 = vld1_u8(b);
-        b += b_stride;
-        d3 = vld1_u8(b);
-        b += b_stride;
-        tmp = vzip_u8(d0, d1);
-        tmp2 = vzip_u8(d2, d3);
-        q3 = vcombine_u8(tmp.val[0], tmp2.val[0]);
-
-        sse += sse_W16x1_neon(q2, q3);
-      }
+      do {
+        aom_sse4x2_neon(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 8:
-      for (y = 0; y < height; y += 2) {
-        d0 = vld1_u8(a);  // load 8 data
-        d1 = vld1_u8(a + a_stride);
-        q2 = vcombine_u8(d0, d1);  // make a 16 data vector
-
-        d0 = vld1_u8(b);
-        d1 = vld1_u8(b + b_stride);
-        q3 = vcombine_u8(d0, d1);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        a += 2 * a_stride;
-        b += 2 * b_stride;
-      }
+      do {
+        aom_sse8_neon(a, b, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 16:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u8(a);
-        q3 = vld1q_u8(b);
-
-        sse += sse_W16x1_neon(q2, q3);
-
+      do {
+        sse_w16_neon(&sum, a, b);
         a += a_stride;
         b += b_stride;
-      }
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 32:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u8(a);
-        q3 = vld1q_u8(b);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 16);
-        q3 = vld1q_u8(b + 16);
-
-        sse += sse_W16x1_neon(q2, q3);
-
+      do {
+        sse_w16_neon(&sum, a, b);
+        sse_w16_neon(&sum, a + 16, b + 16);
         a += a_stride;
         b += b_stride;
-      }
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 64:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u8(a);
-        q3 = vld1q_u8(b);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 16);
-        q3 = vld1q_u8(b + 16);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 32);
-        q3 = vld1q_u8(b + 32);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 48);
-        q3 = vld1q_u8(b + 48);
-
-        sse += sse_W16x1_neon(q2, q3);
-
+      do {
+        sse_w16_neon(&sum, a, b);
+        sse_w16_neon(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_neon(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_neon(&sum, a + 16 * 3, b + 16 * 3);
         a += a_stride;
         b += b_stride;
-      }
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 128:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u8(a);
-        q3 = vld1q_u8(b);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 16);
-        q3 = vld1q_u8(b + 16);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 32);
-        q3 = vld1q_u8(b + 32);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 48);
-        q3 = vld1q_u8(b + 48);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 64);
-        q3 = vld1q_u8(b + 64);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 80);
-        q3 = vld1q_u8(b + 80);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 96);
-        q3 = vld1q_u8(b + 96);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 112);
-        q3 = vld1q_u8(b + 112);
-
-        sse += sse_W16x1_neon(q2, q3);
-
+      do {
+        sse_w16_neon(&sum, a, b);
+        sse_w16_neon(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_neon(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_neon(&sum, a + 16 * 3, b + 16 * 3);
+        sse_w16_neon(&sum, a + 16 * 4, b + 16 * 4);
+        sse_w16_neon(&sum, a + 16 * 5, b + 16 * 5);
+        sse_w16_neon(&sum, a + 16 * 6, b + 16 * 6);
+        sse_w16_neon(&sum, a + 16 * 7, b + 16 * 7);
         a += a_stride;
         b += b_stride;
-      }
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     default:
-      for (y = 0; y < height; y++) {
-        x = width;
-        while (x > 0) {
-          addinc = width - x;
-          q2 = vld1q_u8(a + addinc);
-          q3 = vld1q_u8(b + addinc);
-          if (x < 16) {
-            dx = x;
-            q4 = vld1q_dup_u8(&dx);
-            q5 = vcltq_u8(q0, q4);
-            q2 = vandq_u8(q2, q5);
-            q3 = vandq_u8(q3, q5);
-          }
-          sse += sse_W16x1_neon(q2, q3);
-          x -= 16;
-        }
-        a += a_stride;
-        b += b_stride;
+      if (width & 0x07) {
+        do {
+          int i = 0;
+          do {
+            aom_sse8_neon(a + i, b + i, &sum);
+            aom_sse8_neon(a + i + a_stride, b + i + b_stride, &sum);
+            i += 8;
+          } while (i + 4 < width);
+          aom_sse4x2_neon(a + i, a_stride, b + i, b_stride, &sum);
+          a += (a_stride << 1);
+          b += (b_stride << 1);
+          y += 2;
+        } while (y < height);
+      } else {
+        do {
+          int i = 0;
+          do {
+            aom_sse8_neon(a + i, b + i, &sum);
+            i += 8;
+          } while (i < width);
+          a += a_stride;
+          b += b_stride;
+          y += 1;
+        } while (y < height);
       }
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
+      break;
   }
-  return (int64_t)sse;
+  return sse;
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index cf618ee..4ecf891 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -20,6 +20,42 @@
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/variance.h"
 
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+  uint32_t a;
+  uint32x2_t a_u32 = vdup_n_u32(0);
+  if (stride == 4) return vld1_u8(buf);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1_lane_u32(&a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  a_u32 = vld1_lane_u32(&a, a_u32, 1);
+  return vreinterpret_u8_u32(a_u32);
+}
+
+// Process a block exactly 4 wide and a multiple of 2 high.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                      uint8_t *output_ptr,
+                                      unsigned int src_pixels_per_line,
+                                      int pixel_step,
+                                      unsigned int output_height,
+                                      const uint8_t *filter) {
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
+  unsigned int i;
+  for (i = 0; i < output_height; i += 2) {
+    const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
+    const uint8x8_t src_1 =
+        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
+    const uint16x8_t a = vmull_u8(src_0, f0);
+    const uint16x8_t b = vmlal_u8(a, src_1, f1);
+    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+    vst1_u8(output_ptr, out);
+    src_ptr += 2 * src_pixels_per_line;
+    output_ptr += 8;
+  }
+}
+
 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
@@ -27,8 +63,8 @@
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
   unsigned int i;
   for (i = 0; i < output_height; ++i) {
     const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
@@ -36,13 +72,14 @@
     const uint16x8_t a = vmull_u8(src_0, f0);
     const uint16x8_t b = vmlal_u8(a, src_1, f1);
     const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(&output_ptr[0], out);
+    vst1_u8(output_ptr, out);
     // Next row...
     src_ptr += src_pixels_per_line;
     output_ptr += output_width;
   }
 }
 
+// Process a block which is a mutiple of 16 wide and any height.
 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                        uint8_t *output_ptr,
                                        unsigned int src_pixels_per_line,
@@ -50,8 +87,8 @@
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; j += 16) {
@@ -63,9 +100,8 @@
       const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
       const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
       const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
-      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+      vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
     }
-    // Next row...
     src_ptr += src_pixels_per_line;
     output_ptr += output_width;
   }
@@ -129,3 +165,276 @@
                              bilinear_filters_2t[yoffset]);
   return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
 }
+
+unsigned int aom_sub_pixel_variance4x4_neon(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  uint8_t temp0[4 * (4 + 2)];
+  uint8_t temp1[4 * 4];
+
+  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (4 + 2),
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 4,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance4x4(temp1, 4, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance4x8_neon(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  uint8_t temp0[4 * (8 + 2)];
+  uint8_t temp1[4 * 8];
+
+  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (8 + 2),
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 8,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance4x8(temp1, 4, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance8x4_neon(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  uint8_t temp0[8 * (4 + 1)];
+  uint8_t temp1[8 * 4];
+
+  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (4 + 1), 8,
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 4, 8,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance8x4(temp1, 8, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance8x16_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[8 * (16 + 1)];
+  uint8_t temp1[8 * 16];
+
+  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (16 + 1), 8,
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 16, 8,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance8x16(temp1, 8, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x8_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[16 * (8 + 1)];
+  uint8_t temp1[16 * 8];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 16,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 8, 16,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance16x8(temp1, 16, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x32_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[16 * (32 + 1)];
+  uint8_t temp1[16 * 32];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 16,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 32, 16,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance16x32(temp1, 16, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance32x16_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[32 * (16 + 1)];
+  uint8_t temp1[32 * 16];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 32,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 16, 32,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance32x16(temp1, 32, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance32x64_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[32 * (64 + 1)];
+  uint8_t temp1[32 * 64];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 32,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 64, 32,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance32x64(temp1, 32, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance64x32_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[64 * (32 + 1)];
+  uint8_t temp1[64 * 32];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 64,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 32, 64,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance64x32(temp1, 64, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance64x128_neon(const uint8_t *a, int a_stride,
+                                               int xoffset, int yoffset,
+                                               const uint8_t *b, int b_stride,
+                                               uint32_t *sse) {
+  uint8_t temp0[64 * (128 + 1)];
+  uint8_t temp1[64 * 128];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 64,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 128, 64,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance64x128(temp1, 64, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance128x64_neon(const uint8_t *a, int a_stride,
+                                               int xoffset, int yoffset,
+                                               const uint8_t *b, int b_stride,
+                                               uint32_t *sse) {
+  uint8_t temp0[128 * (64 + 1)];
+  uint8_t temp1[128 * 64];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 128,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 64, 128,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance128x64(temp1, 128, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance128x128_neon(const uint8_t *a, int a_stride,
+                                                int xoffset, int yoffset,
+                                                const uint8_t *b, int b_stride,
+                                                uint32_t *sse) {
+  uint8_t temp0[128 * (128 + 1)];
+  uint8_t temp1[128 * 128];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 128,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 128, 128,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance128x128(temp1, 128, b, b_stride, sse);
+}
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+unsigned int aom_sub_pixel_variance4x16_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[4 * (16 + 2)];
+  uint8_t temp1[4 * 16];
+
+  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (16 + 2),
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 16,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance4x16(temp1, 4, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance8x32_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[8 * (32 + 1)];
+  uint8_t temp1[8 * 32];
+
+  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (32 + 1), 8,
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 32, 8,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance8x32(temp1, 8, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x4_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[16 * (4 + 1)];
+  uint8_t temp1[16 * 4];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (4 + 1), 16,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 4, 16,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance16x4(temp1, 16, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance64x16_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[64 * (16 + 1)];
+  uint8_t temp1[64 * 16];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 64,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 16, 64,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance64x16(temp1, 64, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x64_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[16 * (64 + 1)];
+  uint8_t temp1[16 * 64];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 16,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 64, 16,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance16x64(temp1, 16, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance32x8_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[32 * (8 + 1)];
+  uint8_t temp1[32 * 8];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 32,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 8, 32,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance32x8(temp1, 32, b, b_stride, sse);
+}
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000..1ce12ec
--- /dev/null
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint32x4_t sum_squares_i16_4x4_neon(const int16_t *src,
+                                                  int stride) {
+  const int16x4_t v_val_01_lo = vld1_s16(src + 0 * stride);
+  const int16x4_t v_val_01_hi = vld1_s16(src + 1 * stride);
+  const int16x4_t v_val_23_lo = vld1_s16(src + 2 * stride);
+  const int16x4_t v_val_23_hi = vld1_s16(src + 3 * stride);
+  int32x4_t v_sq_01_d = vmull_s16(v_val_01_lo, v_val_01_lo);
+  v_sq_01_d = vmlal_s16(v_sq_01_d, v_val_01_hi, v_val_01_hi);
+  int32x4_t v_sq_23_d = vmull_s16(v_val_23_lo, v_val_23_lo);
+  v_sq_23_d = vmlal_s16(v_sq_23_d, v_val_23_hi, v_val_23_hi);
+#if defined(__aarch64__)
+  return vreinterpretq_u32_s32(vpaddq_s32(v_sq_01_d, v_sq_23_d));
+#else
+  return vreinterpretq_u32_s32(vcombine_s32(
+      vqmovn_s64(vpaddlq_s32(v_sq_01_d)), vqmovn_s64(vpaddlq_s32(v_sq_23_d))));
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src, int stride) {
+  const uint32x4_t v_sum_0123_d = sum_squares_i16_4x4_neon(src, stride);
+#if defined(__aarch64__)
+  return (uint64_t)vaddvq_u32(v_sum_0123_d);
+#else
+  uint64x2_t v_sum_d = vpaddlq_u32(v_sum_0123_d);
+  v_sum_d = vaddq_u64(v_sum_d, vextq_u64(v_sum_d, v_sum_d, 1));
+  return vgetq_lane_u64(v_sum_d, 0);
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src, int stride,
+                                         int height) {
+  int r = 0;
+  uint32x4_t v_acc_q = vdupq_n_u32(0);
+  do {
+    const uint32x4_t v_acc_d = sum_squares_i16_4x4_neon(src, stride);
+    v_acc_q = vaddq_u32(v_acc_q, v_acc_d);
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+
+  uint64x2_t v_acc_64 = vpaddlq_u32(v_acc_q);
+#if defined(__aarch64__)
+  return vaddvq_u64(v_acc_64);
+#else
+  v_acc_64 = vaddq_u64(v_acc_64, vextq_u64(v_acc_64, v_acc_64, 1));
+  return vgetq_lane_u64(v_acc_64, 0);
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src, int stride,
+                                         int width, int height) {
+  int r = 0;
+  const int32x4_t zero = vdupq_n_s32(0);
+  uint64x2_t v_acc_q = vreinterpretq_u64_s32(zero);
+  do {
+    int32x4_t v_sum = zero;
+    int c = 0;
+    do {
+      const int16_t *b = src + c;
+      const int16x8_t v_val_0 = vld1q_s16(b + 0 * stride);
+      const int16x8_t v_val_1 = vld1q_s16(b + 1 * stride);
+      const int16x8_t v_val_2 = vld1q_s16(b + 2 * stride);
+      const int16x8_t v_val_3 = vld1q_s16(b + 3 * stride);
+      const int16x4_t v_val_0_lo = vget_low_s16(v_val_0);
+      const int16x4_t v_val_1_lo = vget_low_s16(v_val_1);
+      const int16x4_t v_val_2_lo = vget_low_s16(v_val_2);
+      const int16x4_t v_val_3_lo = vget_low_s16(v_val_3);
+      int32x4_t v_sum_01 = vmull_s16(v_val_0_lo, v_val_0_lo);
+      v_sum_01 = vmlal_s16(v_sum_01, v_val_1_lo, v_val_1_lo);
+      int32x4_t v_sum_23 = vmull_s16(v_val_2_lo, v_val_2_lo);
+      v_sum_23 = vmlal_s16(v_sum_23, v_val_3_lo, v_val_3_lo);
+#if defined(__aarch64__)
+      v_sum_01 = vmlal_high_s16(v_sum_01, v_val_0, v_val_0);
+      v_sum_01 = vmlal_high_s16(v_sum_01, v_val_1, v_val_1);
+      v_sum_23 = vmlal_high_s16(v_sum_23, v_val_2, v_val_2);
+      v_sum_23 = vmlal_high_s16(v_sum_23, v_val_3, v_val_3);
+      v_sum = vaddq_s32(v_sum, vpaddq_s32(v_sum_01, v_sum_23));
+#else
+      const int16x4_t v_val_0_hi = vget_high_s16(v_val_0);
+      const int16x4_t v_val_1_hi = vget_high_s16(v_val_1);
+      const int16x4_t v_val_2_hi = vget_high_s16(v_val_2);
+      const int16x4_t v_val_3_hi = vget_high_s16(v_val_3);
+      v_sum_01 = vmlal_s16(v_sum_01, v_val_0_hi, v_val_0_hi);
+      v_sum_01 = vmlal_s16(v_sum_01, v_val_1_hi, v_val_1_hi);
+      v_sum_23 = vmlal_s16(v_sum_23, v_val_2_hi, v_val_2_hi);
+      v_sum_23 = vmlal_s16(v_sum_23, v_val_3_hi, v_val_3_hi);
+      v_sum = vaddq_s32(v_sum, vcombine_s32(vqmovn_s64(vpaddlq_s32(v_sum_01)),
+                                            vqmovn_s64(vpaddlq_s32(v_sum_23))));
+#endif
+      c += 8;
+    } while (c < width);
+
+    v_acc_q = vpadalq_u32(v_acc_q, vreinterpretq_u32_s32(v_sum));
+
+    src += 4 * stride;
+    r += 4;
+  } while (r < height);
+#if defined(__aarch64__)
+  return vaddvq_u64(v_acc_q);
+#else
+  v_acc_q = vaddq_u64(v_acc_q, vextq_u64(v_acc_q, v_acc_q, 1));
+  return vgetq_lane_u64(v_acc_q, 0);
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
+                                     int height) {
+  // 4 elements per row only requires half an SIMD register, so this
+  // must be a special case, but also note that over 75% of all calls
+  // are with size == 4, so it is also the common case.
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_neon(src, stride);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_neon(src, stride, height);
+  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+    // Generic case
+    return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height);
+  } else {
+    return aom_sum_squares_2d_i16_c(src, stride, width, height);
+  }
+}
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index d4107ce..3a95ba2 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -399,3 +399,257 @@
 
   return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
 }
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+  uint32_t a;
+  uint32x4_t a_u32 = vdupq_n_u32(0);
+  if (stride == 4) return vld1q_u8(buf);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
+  return vreinterpretq_u8_u32(a_u32);
+}
+
+// The variance helper functions use int16_t for sum. 8 values are accumulated
+// and then added (at which point they expand up to int32_t). To avoid overflow,
+// there can be no more than 32767 / 255 ~= 128 values accumulated in each
+// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
+// rows = 128. Asserts have been added to each function to warn against reaching
+// this limit.
+
+// Process a block of width 4 four rows at a time.
+static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int h, uint32_t *sse, int *sum) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
+  int32x4_t sse_s32 = zero;
+
+  // Since width is only 4, sum_s16 only loads a half row per loop.
+  assert(h <= 256);
+
+  int i;
+  for (i = 0; i < h; i += 4) {
+    const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride);
+    const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride);
+    const int16x8_t diff_lo_s16 =
+        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
+    const int16x8_t diff_hi_s16 =
+        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
+
+    sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+    sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
+                        vget_low_s16(diff_lo_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
+                        vget_high_s16(diff_lo_s16));
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
+                        vget_low_s16(diff_hi_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
+                        vget_high_s16(diff_hi_s16));
+
+    a += 4 * a_stride;
+    b += 4 * b_stride;
+  }
+
+#if defined(__aarch64__)
+  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
+  *sse = (uint32_t)vaddvq_s32(sse_s32);
+#else
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+#endif
+}
+
+// Process a block of any size where the width is divisible by 16.
+static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int w, int h, uint32_t *sse,
+                              int *sum) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
+  int32x4_t sse_s32 = zero;
+
+  // The loop loads 16 values at a time but doubles them up when accumulating
+  // into sum_s16.
+  assert(w / 8 * h <= 128);
+
+  int i, j;
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 16) {
+      const uint8x16_t a_u8 = vld1q_u8(a + j);
+      const uint8x16_t b_u8 = vld1q_u8(b + j);
+
+      const int16x8_t diff_lo_s16 =
+          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
+      const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(
+          vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
+
+      sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+      sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+      sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
+                          vget_low_s16(diff_lo_s16));
+      sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
+                          vget_high_s16(diff_lo_s16));
+
+      sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
+                          vget_low_s16(diff_hi_s16));
+      sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
+                          vget_high_s16(diff_hi_s16));
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+
+#if defined(__aarch64__)
+  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
+  *sse = (uint32_t)vaddvq_s32(sse_s32);
+#else
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+#endif
+}
+
+// Process a block of width 8 two rows at a time.
+static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int h, uint32_t *sse, int *sum) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
+  int32x4_t sse_s32 = zero;
+
+  // Each column has it's own accumulator entry in sum_s16.
+  assert(h <= 128);
+
+  int i = 0;
+  do {
+    const uint8x8_t a_0_u8 = vld1_u8(a);
+    const uint8x8_t a_1_u8 = vld1_u8(a + a_stride);
+    const uint8x8_t b_0_u8 = vld1_u8(b);
+    const uint8x8_t b_1_u8 = vld1_u8(b + b_stride);
+    const int16x8_t diff_0_s16 =
+        vreinterpretq_s16_u16(vsubl_u8(a_0_u8, b_0_u8));
+    const int16x8_t diff_1_s16 =
+        vreinterpretq_s16_u16(vsubl_u8(a_1_u8, b_1_u8));
+    sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
+    sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
+    sse_s32 =
+        vmlal_s16(sse_s32, vget_low_s16(diff_0_s16), vget_low_s16(diff_0_s16));
+    sse_s32 =
+        vmlal_s16(sse_s32, vget_low_s16(diff_1_s16), vget_low_s16(diff_1_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_0_s16),
+                        vget_high_s16(diff_0_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_1_s16),
+                        vget_high_s16(diff_1_s16));
+    a += a_stride + a_stride;
+    b += b_stride + b_stride;
+    i += 2;
+  } while (i < h);
+
+#if defined(__aarch64__)
+  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
+  *sse = (uint32_t)vaddvq_s32(sse_s32);
+#else
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+#endif
+}
+
+#define varianceNxM(n, m, shift)                                            \
+  unsigned int aom_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \
+                                            const uint8_t *b, int b_stride, \
+                                            unsigned int *sse) {            \
+    int sum;                                                                \
+    if (n == 4)                                                             \
+      variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum);           \
+    else if (n == 8)                                                        \
+      variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum);           \
+    else                                                                    \
+      variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum);         \
+    if (n * m < 16 * 16)                                                    \
+      return *sse - ((sum * sum) >> shift);                                 \
+    else                                                                    \
+      return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);              \
+  }
+
+static void variance_neon_wide_block(const uint8_t *a, int a_stride,
+                                     const uint8_t *b, int b_stride, int w,
+                                     int h, uint32_t *sse, int *sum) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t v_diff = zero;
+  int64x2_t v_sse = vreinterpretq_s64_s32(zero);
+
+  int s, i, j;
+  for (s = 0; s < 16; s++) {
+    int32x4_t sse_s32 = zero;
+    int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
+    for (i = (s * h) >> 4; i < (((s + 1) * h) >> 4); ++i) {
+      for (j = 0; j < w; j += 16) {
+        const uint8x16_t a_u8 = vld1q_u8(a + j);
+        const uint8x16_t b_u8 = vld1q_u8(b + j);
+
+        const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(
+            vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
+        const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(
+            vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
+
+        sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+        sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+        sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
+                            vget_low_s16(diff_lo_s16));
+        sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
+                            vget_high_s16(diff_lo_s16));
+        sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
+                            vget_low_s16(diff_hi_s16));
+        sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
+                            vget_high_s16(diff_hi_s16));
+      }
+
+      a += a_stride;
+      b += b_stride;
+    }
+
+    v_diff = vpadalq_s16(v_diff, sum_s16);
+    v_sse = vpadalq_s32(v_sse, sse_s32);
+  }
+#if defined(__aarch64__)
+  int diff = vaddvq_s32(v_diff);
+  uint32_t sq = (uint32_t)vaddvq_u64(vreinterpretq_u64_s64(v_sse));
+#else
+  int diff = horizontal_add_s32x4(v_diff);
+  uint32_t sq = vget_lane_u32(
+      vreinterpret_u32_s64(vadd_s64(vget_low_s64(v_sse), vget_high_s64(v_sse))),
+      0);
+#endif
+
+  *sum = diff;
+  *sse = sq;
+}
+
+#define varianceNxM_wide(W, H)                                              \
+  unsigned int aom_variance##W##x##H##_neon(const uint8_t *a, int a_stride, \
+                                            const uint8_t *b, int b_stride, \
+                                            uint32_t *sse) {                \
+    int sum;                                                                \
+    variance_neon_wide_block(a, a_stride, b, b_stride, W, H, sse, &sum);    \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
+  }
+
+varianceNxM(4, 4, 4);
+varianceNxM(4, 8, 5);
+varianceNxM(8, 4, 5);
+varianceNxM(16, 32, 9);
+varianceNxM(32, 16, 9);
+varianceNxM_wide(128, 64);
+varianceNxM_wide(64, 128);
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index a8b3f55..255d98c 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -23,6 +23,10 @@
 #include "aom_dsp/prob.h"
 #include "av1/common/odintrin.h"
 
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
+
 #if CONFIG_ACCOUNTING
 #include "av1/decoder/accounting.h"
 #define ACCT_STR_NAME acct_str
diff --git a/aom_dsp/bitwriter.c b/aom_dsp/bitwriter.c
index 41fcc51..23d28a1 100644
--- a/aom_dsp/bitwriter.c
+++ b/aom_dsp/bitwriter.c
@@ -29,3 +29,8 @@
   od_ec_enc_clear(&w->ec);
   return nb_bits;
 }
+
+int aom_tell_size(aom_writer *w) {
+  const int nb_bits = od_ec_enc_tell(&w->ec);
+  return nb_bits;
+}
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index 4e77a17..fb33909 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -24,6 +24,10 @@
 #include "av1/encoder/cost.h"
 #endif
 
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -60,18 +64,12 @@
 
 int aom_stop_encode(aom_writer *w);
 
+int aom_tell_size(aom_writer *w);
+
 static INLINE void aom_write(aom_writer *w, int bit, int probability) {
   int p = (0x7FFFFF - (probability << 15) + probability) >> 8;
 #if CONFIG_BITSTREAM_DEBUG
   aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
-  /*int queue_r = 0;
-  int frame_idx_r = 0;
-  int queue_w = bitstream_queue_get_write();
-  int frame_idx_w = aom_bitstream_queue_get_frame_writee();
-  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
-    frame_idx_w, queue_w);
-  }*/
   bitstream_queue_push(bit, cdf, 2);
 #endif
 
@@ -91,14 +89,6 @@
 static INLINE void aom_write_cdf(aom_writer *w, int symb,
                                  const aom_cdf_prob *cdf, int nsymbs) {
 #if CONFIG_BITSTREAM_DEBUG
-  /*int queue_r = 0;
-  int frame_idx_r = 0;
-  int queue_w = bitstream_queue_get_write();
-  int frame_idx_w = aom_bitstream_queue_get_frame_writee();
-  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
-    frame_idx_w, queue_w);
-  }*/
   bitstream_queue_push(symb, cdf, nsymbs);
 #endif
 
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index 72ccfd8..2e43538 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -723,6 +723,25 @@
   }
 
 /* clang-format off */
+#if CONFIG_REALTIME_ONLY
+#define intra_pred_rectangular(type) \
+  intra_pred_sized(type, 4, 8) \
+  intra_pred_sized(type, 8, 4) \
+  intra_pred_sized(type, 8, 16) \
+  intra_pred_sized(type, 16, 8) \
+  intra_pred_sized(type, 16, 32) \
+  intra_pred_sized(type, 32, 16) \
+  intra_pred_sized(type, 32, 64) \
+  intra_pred_sized(type, 64, 32) \
+  intra_pred_highbd_sized(type, 4, 8) \
+  intra_pred_highbd_sized(type, 8, 4) \
+  intra_pred_highbd_sized(type, 8, 16) \
+  intra_pred_highbd_sized(type, 16, 8) \
+  intra_pred_highbd_sized(type, 16, 32) \
+  intra_pred_highbd_sized(type, 32, 16) \
+  intra_pred_highbd_sized(type, 32, 64) \
+  intra_pred_highbd_sized(type, 64, 32)
+#else
 #define intra_pred_rectangular(type) \
   intra_pred_sized(type, 4, 8) \
   intra_pred_sized(type, 8, 4) \
@@ -752,6 +771,8 @@
   intra_pred_highbd_sized(type, 32, 8) \
   intra_pred_highbd_sized(type, 16, 64) \
   intra_pred_highbd_sized(type, 64, 16)
+#endif
+
 #define intra_pred_above_4x4(type) \
   intra_pred_sized(type, 8, 8) \
   intra_pred_sized(type, 16, 16) \
diff --git a/aom_dsp/mips/convolve8_dspr2.c b/aom_dsp/mips/aom_convolve_copy_dspr2.c
similarity index 96%
rename from aom_dsp/mips/convolve8_dspr2.c
rename to aom_dsp/mips/aom_convolve_copy_dspr2.c
index af54b42..12a213e 100644
--- a/aom_dsp/mips/convolve8_dspr2.c
+++ b/aom_dsp/mips/aom_convolve_copy_dspr2.c
@@ -21,17 +21,9 @@
 
 #if HAVE_DSPR2
 void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h) {
+                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
   int x, y;
 
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
   /* prefetch data to cache memory */
   prefetch_load(src);
   prefetch_load(src + 32);
diff --git a/aom_dsp/mips/aom_convolve_copy_msa.c b/aom_dsp/mips/aom_convolve_copy_msa.c
index f7f116f..12e7d95 100644
--- a/aom_dsp/mips/aom_convolve_copy_msa.c
+++ b/aom_dsp/mips/aom_convolve_copy_msa.c
@@ -198,15 +198,8 @@
 }
 
 void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int32_t filter_x_stride,
-                           const int16_t *filter_y, int32_t filter_y_stride,
-                           int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-
+                           uint8_t *dst, ptrdiff_t dst_stride, int32_t w,
+                           int32_t h) {
   switch (w) {
     case 4: {
       uint32_t cnt, tmp;
@@ -238,7 +231,7 @@
     default: {
       uint32_t cnt;
       for (cnt = h; cnt--;) {
-        memcpy(dst, src, w);
+        memmove(dst, src, w);
         src += src_stride;
         dst += dst_stride;
       }
diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c
index c7a0003..1ad3a66 100644
--- a/aom_dsp/noise_model.c
+++ b/aom_dsp/noise_model.c
@@ -1353,7 +1353,7 @@
   if (chroma_sub[0] != chroma_sub[1]) {
     fprintf(stderr,
             "aom_wiener_denoise_2d doesn't handle different chroma "
-            "subsampling");
+            "subsampling\n");
     return 0;
   }
   init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c
index c66dd52..d846a10 100644
--- a/aom_dsp/psnr.c
+++ b/aom_dsp/psnr.c
@@ -363,6 +363,10 @@
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           uint32_t bit_depth, uint32_t in_bit_depth) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
   const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
   const int heights[3] = { a->y_crop_height, a->uv_crop_height,
                            a->uv_crop_height };
@@ -371,7 +375,7 @@
   int i;
   uint64_t total_sse = 0;
   uint32_t total_samples = 0;
-  const double peak = (double)((1 << in_bit_depth) - 1);
+  double peak = (double)((1 << in_bit_depth) - 1);
   const unsigned int input_shift = bit_depth - in_bit_depth;
 
   for (i = 0; i < 3; ++i) {
@@ -403,11 +407,40 @@
   psnr->samples[0] = total_samples;
   psnr->psnr[0] =
       aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+
+  // Compute PSNR based on stream bit depth
+  if ((a->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) {
+    peak = (double)((1 << bit_depth) - 1);
+    total_sse = 0;
+    total_samples = 0;
+    for (i = 0; i < 3; ++i) {
+      const int w = widths[i];
+      const int h = heights[i];
+      const uint32_t samples = w * h;
+      uint64_t sse;
+      sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
+                           b_strides[i], w, h);
+      psnr->sse_hbd[1 + i] = sse;
+      psnr->samples_hbd[1 + i] = samples;
+      psnr->psnr_hbd[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+      total_sse += sse;
+      total_samples += samples;
+    }
+
+    psnr->sse_hbd[0] = total_sse;
+    psnr->samples_hbd[0] = total_samples;
+    psnr->psnr_hbd[0] =
+        aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+  }
 }
 #endif
 
 void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                    PSNR_STATS *psnr) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
   static const double peak = 255.0;
   const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
   const int heights[3] = { a->y_crop_height, a->uv_crop_height,
diff --git a/aom_dsp/psnr.h b/aom_dsp/psnr.h
index 7f40b8b..96a17f4 100644
--- a/aom_dsp/psnr.h
+++ b/aom_dsp/psnr.h
@@ -21,9 +21,12 @@
 #endif
 
 typedef struct {
-  double psnr[4];       // total/y/u/v
-  uint64_t sse[4];      // total/y/u/v
-  uint32_t samples[4];  // total/y/u/v
+  double psnr[4];           // total/y/u/v
+  uint64_t sse[4];          // total/y/u/v
+  uint32_t samples[4];      // total/y/u/v
+  double psnr_hbd[4];       // total/y/u/v when input-bit-depth < bit-depth
+  uint64_t sse_hbd[4];      // total/y/u/v when input-bit-depth < bit-depth
+  uint32_t samples_hbd[4];  // total/y/u/v when input-bit-depth < bit-depth
 } PSNR_STATS;
 
 /*!\brief Converts SSE to PSNR
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index 8ddc683..d72b6c6 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -61,6 +61,11 @@
     aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref,           \
                                  ref_stride, jcp_param);                      \
     return sad(src, src_stride, comp_pred, m, m, n);                          \
+  }                                                                           \
+  unsigned int aom_sad_skip_##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref,                 \
+                                          int ref_stride) {                   \
+    return 2 * sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2));   \
   }
 
 // Calculate sad against 4 reference locations and store each in sad_array
@@ -82,6 +87,15 @@
       sad_array[i] = aom_sad##m##x##n##_avg_c(src, src_stride, ref_array[i], \
                                               ref_stride, second_pred);      \
     }                                                                        \
+  }                                                                          \
+  void aom_sad_skip_##m##x##n##x4d_c(const uint8_t *src, int src_stride,     \
+                                     const uint8_t *const ref_array[],       \
+                                     int ref_stride, uint32_t *sad_array) {  \
+    int i;                                                                   \
+    for (i = 0; i < 4; ++i) {                                                \
+      sad_array[i] = 2 * sad(src, 2 * src_stride, ref_array[i],              \
+                             2 * ref_stride, (m), (n / 2));                  \
+    }                                                                        \
   }
 
 // 128x128
@@ -227,6 +241,12 @@
     aom_highbd_dist_wtd_comp_avg_pred(comp_pred8, second_pred, m, n, ref,      \
                                       ref_stride, jcp_param);                  \
     return highbd_sadb(src, src_stride, comp_pred8, m, m, n);                  \
+  }                                                                            \
+  unsigned int aom_highbd_sad_skip_##m##x##n##_c(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
+      int ref_stride) {                                                        \
+    return 2 *                                                                 \
+           highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
   }
 
 #define highbd_sadMxNx4D(m, n)                                               \
@@ -238,6 +258,15 @@
       sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,            \
                                                  ref_array[i], ref_stride);  \
     }                                                                        \
+  }                                                                          \
+  void aom_highbd_sad_skip_##m##x##n##x4d_c(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],  \
+      int ref_stride, uint32_t *sad_array) {                                 \
+    int i;                                                                   \
+    for (i = 0; i < 4; ++i) {                                                \
+      sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i],       \
+                                    2 * ref_stride, (m), (n / 2));           \
+    }                                                                        \
   }
 
 // 128x128
diff --git a/aom_dsp/sad_av1.c b/aom_dsp/sad_av1.c
index 4675181..3c90bf8 100644
--- a/aom_dsp/sad_av1.c
+++ b/aom_dsp/sad_av1.c
@@ -156,6 +156,7 @@
 HIGHBD_MASKSADMXN(64, 16)
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
+#if !CONFIG_REALTIME_ONLY
 // pre: predictor being evaluated
 // wsrc: target weighted prediction (has been *4096 to keep precision)
 // mask: 2d weights (scaled by 4096)
@@ -262,3 +263,4 @@
 HIGHBD_OBMCSADMXN(64, 16)
 /* clang-format on */
 #endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/aom_dsp/ssim.c b/aom_dsp/ssim.c
index 95b8888..357da99 100644
--- a/aom_dsp/ssim.c
+++ b/aom_dsp/ssim.c
@@ -77,7 +77,7 @@
 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
                          uint32_t sum_sq_r, uint32_t sum_sxr, int count,
                          uint32_t bd) {
-  int64_t ssim_n, ssim_d;
+  double ssim_n, ssim_d;
   int64_t c1, c2;
   if (bd == 8) {
     // scale the constants by number of pixels
@@ -94,14 +94,14 @@
     assert(0);
   }
 
-  ssim_n = (2 * sum_s * sum_r + c1) *
-           ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+  ssim_n = (2.0 * sum_s * sum_r + c1) *
+           (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
 
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+  ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+           ((double)count * sum_sq_s - (double)sum_s * sum_s +
+            (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
 
-  return ssim_n * 1.0 / ssim_d;
+  return ssim_n / ssim_d;
 }
 
 static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
@@ -165,8 +165,9 @@
   return ssim_total;
 }
 
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                     const YV12_BUFFER_CONFIG *dest, double *weight) {
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *dest, double *weight,
+                   double *fast_ssim) {
   double abc[3];
   for (int i = 0; i < 3; ++i) {
     const int is_uv = i > 0;
@@ -176,7 +177,7 @@
   }
 
   *weight = 1;
-  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
+  *fast_ssim = abc[0] * .8 + .1 * (abc[1] + abc[2]);
 }
 
 // traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
@@ -421,11 +422,11 @@
   return inconsistency_total;
 }
 
-double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                            const YV12_BUFFER_CONFIG *dest, double *weight,
-                            uint32_t bd, uint32_t in_bd) {
+void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                          const YV12_BUFFER_CONFIG *dest, double *weight,
+                          uint32_t bd, uint32_t in_bd, double *fast_ssim) {
   assert(bd >= in_bd);
-  const uint32_t shift = bd - in_bd;
+  uint32_t shift = bd - in_bd;
 
   double abc[3];
   for (int i = 0; i < 3; ++i) {
@@ -436,6 +437,21 @@
                               source->crop_heights[is_uv], in_bd, shift);
   }
 
-  *weight = 1;
-  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
+  weight[0] = 1;
+  fast_ssim[0] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+
+  if (bd > in_bd) {
+    // Compute SSIM based on stream bit depth
+    shift = 0;
+    for (int i = 0; i < 3; ++i) {
+      const int is_uv = i > 0;
+      abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+                                source->strides[is_uv], dest->strides[is_uv],
+                                source->crop_widths[is_uv],
+                                source->crop_heights[is_uv], bd, shift);
+    }
+
+    weight[1] = 1;
+    fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+  }
 }
diff --git a/aom_dsp/ssim.h b/aom_dsp/ssim.h
index 55038f4..d635ef5 100644
--- a/aom_dsp/ssim.h
+++ b/aom_dsp/ssim.h
@@ -68,17 +68,18 @@
                             int img2_pitch, int width, int height, Ssimv *sv2,
                             Metrics *m, int do_inconsistency);
 
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                     const YV12_BUFFER_CONFIG *dest, double *weight);
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *dest, double *weight,
+                   double *fast_ssim);
 
 double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *dest, double *ssim_y,
                          double *ssim_u, double *ssim_v, uint32_t bd,
                          uint32_t in_bd);
 
-double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                            const YV12_BUFFER_CONFIG *dest, double *weight,
-                            uint32_t bd, uint32_t in_bd);
+void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                          const YV12_BUFFER_CONFIG *dest, double *weight,
+                          uint32_t bd, uint32_t in_bd, double *fast_ssim);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/aom_dsp/sum_squares.c b/aom_dsp/sum_squares.c
index d739a60..f58defa 100644
--- a/aom_dsp/sum_squares.c
+++ b/aom_dsp/sum_squares.c
@@ -71,3 +71,20 @@
 
   return (ss - s * s / (width * height));
 }
+
+uint64_t aom_sum_sse_2d_i16_c(const int16_t *src, int src_stride, int width,
+                              int height, int *sum) {
+  int r, c;
+  int16_t *srcp = (int16_t *)src;
+  int64_t ss = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const int16_t v = srcp[c];
+      ss += v * v;
+      *sum += v;
+    }
+    srcp += src_stride;
+  }
+  return ss;
+}
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 695f12a..20af52b 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -250,12 +250,16 @@
 VARIANCES(4, 2)
 VARIANCES(2, 4)
 VARIANCES(2, 2)
+
+// Realtime mode doesn't use rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
 VARIANCES(4, 16)
 VARIANCES(16, 4)
 VARIANCES(8, 32)
 VARIANCES(32, 8)
 VARIANCES(16, 64)
 VARIANCES(64, 16)
+#endif
 
 GET_VAR(16, 16)
 GET_VAR(8, 8)
@@ -363,8 +367,9 @@
                                    int ref_stride, int subpel_search) {
   int i, j;
 
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
@@ -789,12 +794,16 @@
 HIGHBD_VARIANCES(4, 2)
 HIGHBD_VARIANCES(2, 4)
 HIGHBD_VARIANCES(2, 2)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_VARIANCES(4, 16)
 HIGHBD_VARIANCES(16, 4)
 HIGHBD_VARIANCES(8, 32)
 HIGHBD_VARIANCES(32, 8)
 HIGHBD_VARIANCES(16, 64)
 HIGHBD_VARIANCES(64, 16)
+#endif
 
 HIGHBD_GET_VAR(8)
 HIGHBD_GET_VAR(16)
@@ -1048,12 +1057,16 @@
 MASK_SUBPIX_VAR(64, 128)
 MASK_SUBPIX_VAR(128, 64)
 MASK_SUBPIX_VAR(128, 128)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
 MASK_SUBPIX_VAR(4, 16)
 MASK_SUBPIX_VAR(16, 4)
 MASK_SUBPIX_VAR(8, 32)
 MASK_SUBPIX_VAR(32, 8)
 MASK_SUBPIX_VAR(16, 64)
 MASK_SUBPIX_VAR(64, 16)
+#endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
@@ -1174,14 +1187,17 @@
 HIGHBD_MASK_SUBPIX_VAR(64, 128)
 HIGHBD_MASK_SUBPIX_VAR(128, 64)
 HIGHBD_MASK_SUBPIX_VAR(128, 128)
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_MASK_SUBPIX_VAR(4, 16)
 HIGHBD_MASK_SUBPIX_VAR(16, 4)
 HIGHBD_MASK_SUBPIX_VAR(8, 32)
 HIGHBD_MASK_SUBPIX_VAR(32, 8)
 HIGHBD_MASK_SUBPIX_VAR(16, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 16)
+#endif
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
                                  const int32_t *wsrc, const int32_t *mask,
                                  int w, int h, unsigned int *sse, int *sum) {
@@ -1481,3 +1497,28 @@
 HIGHBD_OBMC_VAR(64, 16)
 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
 #endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // !CONFIG_REALTIME_ONLY
+
+uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
+                             int sstride, int w, int h) {
+  uint64_t sum = 0;
+  for (int i = 0; i < h; i++) {
+    for (int j = 0; j < w; j++) {
+      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
+      sum += e * e;
+    }
+  }
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
+                                    int sstride, int w, int h) {
+  uint64_t sum = 0;
+  for (int i = 0; i < h; i++) {
+    for (int j = 0; j < w; j++) {
+      int e = dst[i * dstride + j] - src[i * sstride + j];
+      sum += e * e;
+    }
+  }
+  return sum;
+}
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index 4550c17..428afd0 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -90,11 +90,15 @@
 
 typedef struct aom_variance_vtable {
   aom_sad_fn_t sdf;
+  // Same as normal sad, but downsample the rows by a factor of 2.
+  aom_sad_fn_t sdsf;
   aom_sad_avg_fn_t sdaf;
   aom_variance_fn_t vf;
   aom_subpixvariance_fn_t svf;
   aom_subp_avg_variance_fn_t svaf;
   aom_sad_multi_d_fn_t sdx4df;
+  // Same as sadx4, but downsample the rows by a factor of 2.
+  aom_sad_multi_d_fn_t sdsx4df;
   aom_masked_sad_fn_t msdf;
   aom_masked_subpixvariance_fn_t msvf;
   aom_obmc_sad_fn_t osdf;
diff --git a/aom_dsp/vmaf.c b/aom_dsp/vmaf.c
index 3a012e7..4165343 100644
--- a/aom_dsp/vmaf.c
+++ b/aom_dsp/vmaf.c
@@ -9,16 +9,34 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "aom_dsp/vmaf.h"
+
 #include <assert.h>
-#include <libvmaf/libvmaf.h>
+#if !CONFIG_USE_VMAF_RC
+#include <libvmaf.h>
+#endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#ifdef _WIN32
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
+
+#if CONFIG_USE_VMAF_RC
+#include <libvmaf/libvmaf.rc.h>
+#endif
 
 #include "aom_dsp/blend.h"
-#include "aom_dsp/vmaf.h"
 #include "aom_ports/system_state.h"
 
+static void vmaf_fatal_error(const char *message) {
+  fprintf(stderr, "Fatal error: %s\n", message);
+  exit(EXIT_FAILURE);
+}
+
+#if !CONFIG_USE_VMAF_RC
 typedef struct FrameData {
   const YV12_BUFFER_CONFIG *source;
   const YV12_BUFFER_CONFIG *distorted;
@@ -26,11 +44,6 @@
   int bit_depth;
 } FrameData;
 
-static void vmaf_fatal_error(const char *message) {
-  fprintf(stderr, "Fatal error: %s\n", message);
-  exit(EXIT_FAILURE);
-}
-
 // A callback function used to pass data to VMAF.
 // Returns 0 after reading a frame.
 // Returns 2 when there is no more frame to read.
@@ -44,7 +57,7 @@
     assert(width == frames->distorted->y_width);
     assert(height == frames->distorted->y_height);
 
-    if (frames->bit_depth > 8) {
+    if (frames->source->flags & YV12_FLAG_HIGHBITDEPTH) {
       const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8));
       uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer);
       uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer);
@@ -115,24 +128,30 @@
   *vmaf = vmaf_score;
 }
 
-void aom_calc_vmaf_multi_frame(
-    void *user_data, const char *model_path,
-    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
-                      int stride_byte, void *user_data),
-    int frame_width, int frame_height, int bit_depth, double *vmaf) {
+void aom_calc_vmaf_multi_frame(void *user_data, const char *model_path,
+                               int (*rd_frm)(float *ref_data, float *main_data,
+                                             float *temp_data, int stride_byte,
+                                             void *user_data),
+                               int frame_width, int frame_height, int bit_depth,
+                               double *vmaf) {
   aom_clear_system_state();
 
   char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
+  int log_path_length = snprintf(NULL, 0, "vmaf_scores_%d.xml", getpid()) + 1;
+  char *log_path = malloc(log_path_length);
+  snprintf(log_path, log_path_length, "vmaf_scores_%d.xml", getpid());
   double vmaf_score;
-  const int ret = compute_vmaf(
-      &vmaf_score, fmt, frame_width, frame_height, read_frame,
-      /*user_data=*/user_data, (char *)model_path,
-      /*log_path=*/"vmaf_scores.xml", /*log_fmt=*/NULL, /*disable_clip=*/0,
-      /*disable_avx=*/0, /*enable_transform=*/0,
-      /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
-      /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
-      /*n_subsample=*/1, /*enable_conf_interval=*/0);
-  FILE *vmaf_log = fopen("vmaf_scores.xml", "r");
+  const int ret =
+      compute_vmaf(&vmaf_score, fmt, frame_width, frame_height, rd_frm,
+                   /*user_data=*/user_data, (char *)model_path,
+                   /*log_path=*/log_path, /*log_fmt=*/NULL, /*disable_clip=*/0,
+                   /*disable_avx=*/0, /*enable_transform=*/0,
+                   /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
+                   /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
+                   /*n_subsample=*/1, /*enable_conf_interval=*/0);
+  FILE *vmaf_log = fopen(log_path, "r");
+  free(log_path);
+  log_path = NULL;
   if (vmaf_log == NULL || ret) {
     vmaf_fatal_error("Failed to compute VMAF scores.");
   }
@@ -157,3 +176,116 @@
 
   aom_clear_system_state();
 }
+#endif
+
+#if CONFIG_USE_VMAF_RC
+void aom_init_vmaf_model_rc(VmafModel **vmaf_model, const char *model_path) {
+  if (*vmaf_model != NULL) return;
+  VmafModelConfig model_cfg;
+  model_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP;
+  model_cfg.name = "vmaf";
+  model_cfg.path = (char *)model_path;
+
+  if (vmaf_model_load_from_path(vmaf_model, &model_cfg)) {
+    vmaf_fatal_error("Failed to load VMAF model.");
+  }
+}
+
+void aom_close_vmaf_model_rc(VmafModel *vmaf_model) {
+  vmaf_model_destroy(vmaf_model);
+}
+
+static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src,
+                         VmafPicture *dst) {
+  const int width = src->y_width;
+  const int height = src->y_height;
+
+  if (bit_depth > 8) {
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src->y_buffer);
+    uint16_t *dst_ptr = dst->data[0];
+
+    for (int row = 0; row < height; ++row) {
+      memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0]));
+      src_ptr += src->y_stride;
+      dst_ptr += dst->stride[0] / 2;
+    }
+  } else {
+    uint8_t *src_ptr = src->y_buffer;
+    uint8_t *dst_ptr = (uint8_t *)dst->data[0];
+
+    for (int row = 0; row < height; ++row) {
+      memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0]));
+      src_ptr += src->y_stride;
+      dst_ptr += dst->stride[0];
+    }
+  }
+}
+
+void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model,
+                              bool cal_vmaf_neg) {
+  VmafConfiguration cfg;
+  cfg.log_level = VMAF_LOG_LEVEL_NONE;
+  cfg.n_threads = 0;
+  cfg.n_subsample = 0;
+  cfg.cpumask = 0;
+
+  if (vmaf_init(vmaf_context, cfg)) {
+    vmaf_fatal_error("Failed to init VMAF context.");
+  }
+
+  if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) {
+    vmaf_fatal_error("Failed to load feature extractors from VMAF model.");
+  }
+
+  if (cal_vmaf_neg) {
+    VmafFeatureDictionary *vif_feature = NULL;
+    vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit", "1.0");
+    if (vmaf_use_feature(*vmaf_context, "float_vif", vif_feature)) {
+      vmaf_fatal_error("Failed to use feature float_vif.");
+    }
+
+    VmafFeatureDictionary *adm_feature = NULL;
+    vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit", "1.0");
+    if (vmaf_use_feature(*vmaf_context, "float_adm", adm_feature)) {
+      vmaf_fatal_error("Failed to use feature float_adm.");
+    }
+  }
+
+  VmafFeatureDictionary *motion_force_zero = NULL;
+  vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero", "true");
+  if (vmaf_use_feature(*vmaf_context, "float_motion", motion_force_zero)) {
+    vmaf_fatal_error("Failed to use feature float_motion.");
+  }
+}
+
+void aom_close_vmaf_context_rc(VmafContext *vmaf_context) {
+  if (vmaf_close(vmaf_context)) {
+    vmaf_fatal_error("Failed to close VMAF context.");
+  }
+}
+
+void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model,
+                               const YV12_BUFFER_CONFIG *source,
+                               const YV12_BUFFER_CONFIG *distorted,
+                               int bit_depth, int frame_index, double *vmaf) {
+  VmafPicture ref, dist;
+  if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
+                         source->y_height) ||
+      vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth,
+                         source->y_width, source->y_height)) {
+    vmaf_fatal_error("Failed to alloc VMAF pictures.");
+  }
+  copy_picture(bit_depth, source, &ref);
+  copy_picture(bit_depth, distorted, &dist);
+  if (vmaf_read_pictures(vmaf_context, &ref, &dist,
+                         /*picture index=*/frame_index)) {
+    vmaf_fatal_error("Failed to read VMAF pictures.");
+  }
+
+  vmaf_picture_unref(&ref);
+  vmaf_picture_unref(&dist);
+
+  vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index);
+}
+
+#endif  // CONFIG_USE_VMAF_RC
diff --git a/aom_dsp/vmaf.h b/aom_dsp/vmaf.h
index fb8bf46..d9da223 100644
--- a/aom_dsp/vmaf.h
+++ b/aom_dsp/vmaf.h
@@ -12,8 +12,27 @@
 #ifndef AOM_AOM_DSP_VMAF_H_
 #define AOM_AOM_DSP_VMAF_H_
 
+#include <stdbool.h>
 #include "aom_scale/yv12config.h"
 
+#if CONFIG_USE_VMAF_RC
+typedef struct VmafContext VmafContext;
+typedef struct VmafModel VmafModel;
+#endif
+
+#if CONFIG_USE_VMAF_RC
+void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model,
+                              bool cal_vmaf_neg);
+void aom_close_vmaf_context_rc(VmafContext *vmaf_context);
+
+void aom_init_vmaf_model_rc(VmafModel **vmaf_model, const char *model_path);
+void aom_close_vmaf_model_rc(VmafModel *vmaf_model);
+
+void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model,
+                               const YV12_BUFFER_CONFIG *source,
+                               const YV12_BUFFER_CONFIG *distorted,
+                               int bit_depth, int frame_index, double *vmaf);
+#else
 void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
                    const YV12_BUFFER_CONFIG *distorted, int bit_depth,
                    double *vmaf);
@@ -23,5 +42,6 @@
     int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
                       int stride_byte, void *user_data),
     int frame_width, int frame_height, int bit_depth, double *vmaf);
+#endif  // CONFIG_USE_VMAF_RC
 
 #endif  // AOM_AOM_DSP_VMAF_H_
diff --git a/aom_dsp/x86/aom_convolve_copy_avx2.c b/aom_dsp/x86/aom_convolve_copy_avx2.c
new file mode 100644
index 0000000..39c6a40
--- /dev/null
+++ b/aom_dsp/x86/aom_convolve_copy_avx2.c
@@ -0,0 +1,252 @@
+/*
+ *  Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m256i s[4];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
+}
+
+void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      memmove(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m256i s[2];
+      s[0] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      s[1] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[0]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      __m256i s[4];
+      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+      src += src_stride;
+      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
+
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+  __m256i s[4];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+}
+
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+  __m256i s[8];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
+  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
+  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
+
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
+  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
+  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
+  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
+}
+
+void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m256i s[2];
+      s[0] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      s[1] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[0]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m256i s[4];
+      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+      src += src_stride;
+      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    assert(w == 128);
+    do {
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.asm b/aom_dsp/x86/aom_convolve_copy_sse2.asm
deleted file mode 100644
index 7283c32..0000000
--- a/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ /dev/null
@@ -1,297 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro convolve_fn 1-2
-%ifidn %1, avg
-%define AUX_XMM_REGS 4
-%else
-%define AUX_XMM_REGS 0
-%endif
-%ifidn %2, highbd
-%define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
-                                              dst, dst_stride, \
-                                              fx, fxs, fy, fys, w, h, bd
-%else
-%define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
-                                           dst, dst_stride, \
-                                           fx, fxs, fy, fys, w, h
-%endif
-  mov r4d, dword wm
-%ifidn %2, highbd
-  shl r4d, 1
-  shl srcq, 1
-  shl src_strideq, 1
-  shl dstq, 1
-  shl dst_strideq, 1
-%else
-  cmp r4d, 4
-  je .w4
-%endif
-  cmp r4d, 8
-  je .w8
-  cmp r4d, 16
-  je .w16
-  cmp r4d, 32
-  je .w32
-
-  cmp r4d, 64
-  je .w64
-%ifidn %2, highbd
-  cmp r4d, 128
-  je .w128
-
-.w256:
-  mov                    r4d, dword hm
-.loop256:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  movu                    m0, [srcq+128]
-  movu                    m1, [srcq+128+16]
-  movu                    m2, [srcq+128+32]
-  movu                    m3, [srcq+128+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq+128]
-  pavg                    m1, [dstq+128+16]
-  pavg                    m2, [dstq+128+32]
-  pavg                    m3, [dstq+128+48]
-%endif
-  mova         [dstq+128   ], m0
-  mova         [dstq+128+16], m1
-  mova         [dstq+128+32], m2
-  mova         [dstq+128+48], m3
-  movu                    m0, [srcq+128+64]
-  movu                    m1, [srcq+128+80]
-  movu                    m2, [srcq+128+96]
-  movu                    m3, [srcq+128+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+128+64]
-  pavg                    m1, [dstq+128+80]
-  pavg                    m2, [dstq+128+96]
-  pavg                    m3, [dstq+128+112]
-%endif
-  mova         [dstq+128+64], m0
-  mova         [dstq+128+80], m1
-  mova         [dstq+128+96], m2
-  mova        [dstq+128+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop256
-  RET
-%endif
-
-.w128:
-  mov                    r4d, dword hm
-.loop128:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop128
-  RET
-
-.w64:
-  mov                    r4d, dword hm
-.loop64:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop64
-  RET
-
-.w32:
-  mov                    r4d, dword hm
-.loop32:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+src_strideq]
-  movu                    m3, [srcq+src_strideq+16]
-  lea                   srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq            +16]
-  pavg                    m2, [dstq+dst_strideq]
-  pavg                    m3, [dstq+dst_strideq+16]
-%endif
-  mova [dstq               ], m0
-  mova [dstq            +16], m1
-  mova [dstq+dst_strideq   ], m2
-  mova [dstq+dst_strideq+16], m3
-  lea                   dstq, [dstq+dst_strideq*2]
-  sub                    r4d, 2
-  jnz .loop32
-  RET
-
-.w16:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop16:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+src_strideq]
-  movu                    m2, [srcq+src_strideq*2]
-  movu                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+dst_strideq]
-  pavg                    m2, [dstq+dst_strideq*2]
-  pavg                    m3, [dstq+r6q]
-%endif
-  mova  [dstq              ], m0
-  mova  [dstq+dst_strideq  ], m1
-  mova  [dstq+dst_strideq*2], m2
-  mova  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop16
-  RET
-
-.w8:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop8:
-  movh                    m0, [srcq]
-  movh                    m1, [srcq+src_strideq]
-  movh                    m2, [srcq+src_strideq*2]
-  movh                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  movh                    m4, [dstq]
-  movh                    m5, [dstq+dst_strideq]
-  movh                    m6, [dstq+dst_strideq*2]
-  movh                    m7, [dstq+r6q]
-  pavg                    m0, m4
-  pavg                    m1, m5
-  pavg                    m2, m6
-  pavg                    m3, m7
-%endif
-  movh  [dstq              ], m0
-  movh  [dstq+dst_strideq  ], m1
-  movh  [dstq+dst_strideq*2], m2
-  movh  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop8
-  RET
-
-%ifnidn %2, highbd
-.w4:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop4:
-  movd                    m0, [srcq]
-  movd                    m1, [srcq+src_strideq]
-  movd                    m2, [srcq+src_strideq*2]
-  movd                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  movd                    m4, [dstq]
-  movd                    m5, [dstq+dst_strideq]
-  movd                    m6, [dstq+dst_strideq*2]
-  movd                    m7, [dstq+r6q]
-  pavg                    m0, m4
-  pavg                    m1, m5
-  pavg                    m2, m6
-  pavg                    m3, m7
-%endif
-  movd  [dstq              ], m0
-  movd  [dstq+dst_strideq  ], m1
-  movd  [dstq+dst_strideq*2], m2
-  movd  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop4
-  RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-convolve_fn copy
-convolve_fn avg
-convolve_fn copy, highbd
diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.c b/aom_dsp/x86/aom_convolve_copy_sse2.c
new file mode 100644
index 0000000..f7b468a
--- /dev/null
+++ b/aom_dsp/x86/aom_convolve_copy_sse2.c
@@ -0,0 +1,308 @@
+/*
+ *  Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m128i s[8];
+  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
+  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
+  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
+  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
+  _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+  _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+  _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+  _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+  _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
+  _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
+  _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
+  _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
+}
+
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      memmove(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m128i s[4];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      src += src_stride;
+      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      __m128i s[8];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+      src += src_stride;
+      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
+      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
+      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
+
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+  __m128i s[8];
+  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+}
+
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+  __m128i s[16];
+  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+  s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
+  s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
+  s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
+  s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
+  s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
+  s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
+  s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
+  s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
+  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+  _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
+  _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
+  _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
+  _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
+  _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
+  _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
+  _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
+  _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
+}
+
+void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      __m128i s = _mm_loadl_epi64((__m128i *)src);
+      *(uint32_t *)dst = _mm_cvtsi128_si32(s);
+      src += src_stride;
+      dst += dst_stride;
+      s = _mm_loadl_epi64((__m128i *)src);
+      *(uint32_t *)dst = _mm_cvtsi128_si32(s);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[4];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      src += src_stride;
+      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m128i s[8];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+      src += src_stride;
+      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
+      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
+      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
diff --git a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
index b6f0407..d392225 100644
--- a/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ b/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -211,7 +211,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d4_v8_sse2)
 sym(aom_highbd_filter_block1d4_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -281,7 +281,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d8_v8_sse2)
 sym(aom_highbd_filter_block1d8_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -340,7 +340,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d16_v8_sse2)
 sym(aom_highbd_filter_block1d16_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -403,7 +403,7 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d4_h8_sse2)
 sym(aom_highbd_filter_block1d4_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -478,7 +478,7 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d8_h8_sse2)
 sym(aom_highbd_filter_block1d8_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -544,7 +544,7 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d16_h8_sse2)
 sym(aom_highbd_filter_block1d16_h8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
index a7152be..db4cad9 100644
--- a/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ b/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -177,7 +177,7 @@
 
 SECTION .text
 
-global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d4_v2_sse2)
 sym(aom_highbd_filter_block1d4_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -201,7 +201,7 @@
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d8_v2_sse2)
 sym(aom_highbd_filter_block1d8_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -235,7 +235,7 @@
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d16_v2_sse2)
 sym(aom_highbd_filter_block1d16_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -271,7 +271,7 @@
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d4_h2_sse2)
 sym(aom_highbd_filter_block1d4_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -296,7 +296,7 @@
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d8_h2_sse2)
 sym(aom_highbd_filter_block1d8_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -330,7 +330,7 @@
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d16_h2_sse2)
 sym(aom_highbd_filter_block1d16_h2_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/aom_dsp/x86/aom_quantize_avx.c b/aom_dsp/x86/aom_quantize_avx.c
new file mode 100644
index 0000000..b2d6d4b
--- /dev/null
+++ b/aom_dsp/x86/aom_quantize_avx.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+                                               tran_low_t *dqcoeff) {
+  const __m128i low = _mm_mullo_epi16(qcoeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);
+
+  const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
+                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m256i big_zero = _mm256_setzero_si256();
+  int index;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i eob = zero, eob0;
+
+  (void)scan;
+
+  *eob_ptr = 0;
+
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_test_all_zeros(all_zero, all_zero)) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+
+    if (n_coeffs == 16) return;
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    // Reinsert signs
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr);
+    store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+    eob =
+        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < n_coeffs; index += 16) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_test_all_zeros(all_zero, all_zero)) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+      continue;
+    }
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void aom_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m256i big_zero = _mm256_setzero_si256();
+  int index;
+  const int log_scale = 1;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i eob = zero, eob0;
+
+  (void)scan;
+
+  // Setup global values.
+  // The 32x32 halves zbin and round.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  // Shift with rounding.
+  zbin = _mm_add_epi16(zbin, one);
+  zbin = _mm_srli_epi16(zbin, 1);
+  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+  // it is a strict "greater" comparison.
+  zbin = _mm_sub_epi16(zbin, one);
+
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  round = _mm_add_epi16(round, one);
+  round = _mm_srli_epi16(round, 1);
+
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_test_all_zeros(all_zero, all_zero)) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+    // Reinsert signs.
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs.
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr);
+    store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+                                          &log_scale);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+                                          dqcoeff_ptr + 8, &log_scale);
+
+    eob =
+        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < n_coeffs; index += 16) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_test_all_zeros(all_zero, all_zero)) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+      continue;
+    }
+
+    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+                                          dqcoeff_ptr + index, &log_scale);
+    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+                                          dqcoeff_ptr + index + 8, &log_scale);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index f64b821..18e8a2a 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -15,6 +15,9 @@
 
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/emmintrin_compat.h"
@@ -742,6 +745,348 @@
   }
 }
 
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
+    const __m128i *const s, const int16_t *const filter) {
+  __m128i f[4];
+  shuffle_filter_ssse3(filter, f);
+  return convolve8_8_ssse3(s, f);
+}
+
+static void filter_horiz_w8_ssse3(const uint8_t *const src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const x_filter) {
+  __m128i s[8], ss[4], temp;
+
+  load_8bit_8x8(src, src_stride, s);
+  // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+  // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+  // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+  // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+  transpose_16bit_4x8(s, ss);
+  temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
+  // shrink to 8 bit each 16 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 8 bytes convolve result
+  _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void transpose8x8_to_dst(const uint8_t *const src,
+                                const ptrdiff_t src_stride, uint8_t *const dst,
+                                const ptrdiff_t dst_stride) {
+  __m128i s[8];
+
+  load_8bit_8x8(src, src_stride, s);
+  transpose_8bit_8x8(s, s);
+  store_8bit_8x8(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src,
+                                    const ptrdiff_t src_stride, uint8_t *dst,
+                                    const ptrdiff_t dst_stride,
+                                    const InterpKernel *const x_filters,
+                                    const int x0_q4, const int x_step_q4,
+                                    const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = h + (8 - (h & 0x7));
+
+  do {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 8) {
+      // process 8 src_x steps
+      for (z = 0; z < 8; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+        } else {
+          int i;
+          for (i = 0; i < 8; ++i) {
+            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 8x8 filters values back to dst
+      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+    }
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
+
+static void filter_horiz_w4_ssse3(const uint8_t *const src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const filter) {
+  __m128i s[4], ss[2];
+  __m128i temp;
+
+  load_8bit_8x4(src, src_stride, s);
+  transpose_16bit_4x4(s, ss);
+  // 00 01 10 11 20 21 30 31
+  s[0] = ss[0];
+  // 02 03 12 13 22 23 32 33
+  s[1] = _mm_srli_si128(ss[0], 8);
+  // 04 05 14 15 24 25 34 35
+  s[2] = ss[1];
+  // 06 07 16 17 26 27 36 37
+  s[3] = _mm_srli_si128(ss[1], 8);
+
+  temp = shuffle_filter_convolve8_8_ssse3(s, filter);
+  // shrink to 8 bit each 16 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 4 bytes
+  *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *const src,
+                                const ptrdiff_t src_stride, uint8_t *const dst,
+                                const ptrdiff_t dst_stride) {
+  __m128i s[4];
+
+  load_8bit_4x4(src, src_stride, s);
+  s[0] = transpose_8bit_4x4(s);
+  s[1] = _mm_srli_si128(s[0], 4);
+  s[2] = _mm_srli_si128(s[0], 8);
+  s[3] = _mm_srli_si128(s[0], 12);
+  store_8bit_4x4(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src,
+                                    const ptrdiff_t src_stride, uint8_t *dst,
+                                    const ptrdiff_t dst_stride,
+                                    const InterpKernel *const x_filters,
+                                    const int x0_q4, const int x_step_q4,
+                                    const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; y += 4) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 4) {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+        } else {
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 4x4 filters values back to dst
+      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+    }
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+  }
+}
+
+static __m128i filter_vert_kernel(const __m128i *const s,
+                                  const int16_t *const filter) {
+  __m128i ss[4];
+  __m128i temp;
+
+  // 00 10 01 11 02 12 03 13
+  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+  // 20 30 21 31 22 32 23 33
+  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+  // 40 50 41 51 42 52 43 53
+  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+  // 60 70 61 71 62 72 63 73
+  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+  temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
+  // shrink to 8 bit each 16 bits
+  return _mm_packus_epi16(temp, temp);
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,
+                                 const ptrdiff_t src_stride, uint8_t *const dst,
+                                 const int16_t *const filter) {
+  __m128i s[8];
+  __m128i temp;
+
+  load_8bit_4x8(src, src_stride, s);
+  temp = filter_vert_kernel(s, filter);
+  // save only 4 bytes
+  *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+
+    y_q4 += y_step_q4;
+  }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *const src,
+                                 const ptrdiff_t src_stride, uint8_t *const dst,
+                                 const int16_t *const filter) {
+  __m128i s[8], temp;
+
+  load_8bit_8x8(src, src_stride, s);
+  temp = filter_vert_kernel(s, filter);
+  // save only 8 bytes convolve result
+  _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+    y_q4 += y_step_q4;
+  }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const filter, const int w) {
+  int i;
+  __m128i f[4];
+  shuffle_filter_ssse3(filter, f);
+
+  for (i = 0; i < w; i += 16) {
+    __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
+
+    loadu_8bit_16x8(src, src_stride, s);
+
+    // merge the result together
+    s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+    s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+    s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+    s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+    s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+    s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+    s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+    s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+    temp_lo = convolve8_8_ssse3(s_lo, f);
+    temp_hi = convolve8_8_ssse3(s_hi, f);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first convolve
+    // result and the second lane contain the second convolve result
+    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+    src += 16;
+    // save 16 bytes convolve result
+    _mm_store_si128((__m128i *)&dst[i], temp_hi);
+  }
+}
+
+static void scaledconvolve_vert_w16(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+                            w);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+    y_q4 += y_step_q4;
+  }
+}
+
+void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                         int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  if (w >= 8) {
+    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  } else {
+    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  }
+
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                            dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  }
+}
+
 filter8_1dfunction aom_filter_block1d16_v8_ssse3;
 filter8_1dfunction aom_filter_block1d16_h8_ssse3;
 filter8_1dfunction aom_filter_block1d8_v8_ssse3;
diff --git a/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/aom_dsp/x86/aom_subpixel_8t_sse2.asm
index c88fc9f..640c5b2 100644
--- a/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ b/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -190,7 +190,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d4_v8_sse2) PRIVATE
+globalsym(aom_filter_block1d4_v8_sse2)
 sym(aom_filter_block1d4_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -257,7 +257,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d8_v8_sse2) PRIVATE
+globalsym(aom_filter_block1d8_v8_sse2)
 sym(aom_filter_block1d8_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -316,7 +316,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d16_v8_sse2) PRIVATE
+globalsym(aom_filter_block1d16_v8_sse2)
 sym(aom_filter_block1d16_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -379,7 +379,7 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d4_h8_sse2) PRIVATE
+globalsym(aom_filter_block1d4_h8_sse2)
 sym(aom_filter_block1d4_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -453,7 +453,7 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d8_h8_sse2) PRIVATE
+globalsym(aom_filter_block1d8_h8_sse2)
 sym(aom_filter_block1d8_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -528,7 +528,7 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d16_h8_sse2) PRIVATE
+globalsym(aom_filter_block1d16_h8_sse2)
 sym(aom_filter_block1d16_h8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
index d0b4b28..90dd55a 100644
--- a/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ b/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -136,7 +136,7 @@
 
 SECTION .text
 
-global sym(aom_filter_block1d4_v2_sse2) PRIVATE
+globalsym(aom_filter_block1d4_v2_sse2)
 sym(aom_filter_block1d4_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -160,7 +160,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d8_v2_sse2) PRIVATE
+globalsym(aom_filter_block1d8_v2_sse2)
 sym(aom_filter_block1d8_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -186,7 +186,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d16_v2_sse2) PRIVATE
+globalsym(aom_filter_block1d16_v2_sse2)
 sym(aom_filter_block1d16_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -214,7 +214,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_h2_sse2) PRIVATE
+globalsym(aom_filter_block1d4_h2_sse2)
 sym(aom_filter_block1d4_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -239,7 +239,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d8_h2_sse2) PRIVATE
+globalsym(aom_filter_block1d8_h2_sse2)
 sym(aom_filter_block1d8_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -266,7 +266,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d16_h2_sse2) PRIVATE
+globalsym(aom_filter_block1d16_h2_sse2)
 sym(aom_filter_block1d16_h2_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
index 59edc49..253bc26 100644
--- a/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ b/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -110,7 +110,7 @@
 
 SECTION .text
 
-global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
+globalsym(aom_filter_block1d4_v2_ssse3)
 sym(aom_filter_block1d4_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -134,7 +134,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d8_v2_ssse3) PRIVATE
+globalsym(aom_filter_block1d8_v2_ssse3)
 sym(aom_filter_block1d8_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -160,7 +160,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d16_v2_ssse3) PRIVATE
+globalsym(aom_filter_block1d16_v2_ssse3)
 sym(aom_filter_block1d16_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -187,7 +187,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
+globalsym(aom_filter_block1d4_h2_ssse3)
 sym(aom_filter_block1d4_h2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -212,7 +212,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d8_h2_ssse3) PRIVATE
+globalsym(aom_filter_block1d8_h2_ssse3)
 sym(aom_filter_block1d8_h2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -239,7 +239,7 @@
     pop         rbp
     ret
 
-global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
+globalsym(aom_filter_block1d16_h2_ssse3)
 sym(aom_filter_block1d16_h2_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index d516de5..09a7e94 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -59,6 +59,166 @@
   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };
 
+#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP                                     \
+  for (i = 0; i < (im_h - 2); i += 2) {                                        \
+    __m256i data = _mm256_castsi128_si256(                                     \
+        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));           \
+    data = _mm256_inserti128_si256(                                            \
+        data,                                                                  \
+        _mm_loadu_si128(                                                       \
+            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),           \
+        1);                                                                    \
+    __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt);             \
+    res =                                                                      \
+        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);              \
+  }                                                                            \
+  __m256i data_1 = _mm256_castsi128_si256(                                     \
+      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));             \
+  __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt);             \
+  res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_4TAP                                      \
+  __m256i s[6];                                                               \
+  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
+  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
+  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
+  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
+                                                                              \
+  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
+  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
+  s[3] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
+  s[4] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
+                                                                              \
+  for (i = 0; i < h; i += 2) {                                                \
+    const int16_t *data = &im_block[i * im_stride];                           \
+    const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+    const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+    s[2] = _mm256_unpacklo_epi16(s4, s5);                                     \
+    s[5] = _mm256_unpackhi_epi16(s4, s5);                                     \
+                                                                              \
+    __m256i res_a = convolve_4tap(s, coeffs_v + 1);                           \
+    __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);                       \
+                                                                              \
+    res_a =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);  \
+    res_b =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);  \
+    const __m256i res_a_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_a, round_const_v), round_shift_v);               \
+    const __m256i res_b_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_b, round_const_v), round_shift_v);               \
+    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);   \
+    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);         \
+    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                     \
+    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                \
+                                                                              \
+    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                 \
+    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];    \
+    if (w - j > 4) {                                                          \
+      _mm_storel_epi64(p_0, res_0);                                           \
+      _mm_storel_epi64(p_1, res_1);                                           \
+    } else if (w == 4) {                                                      \
+      xx_storel_32(p_0, res_0);                                               \
+      xx_storel_32(p_1, res_1);                                               \
+    } else {                                                                  \
+      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                  \
+      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                  \
+    }                                                                         \
+                                                                              \
+    s[0] = s[1];                                                              \
+    s[1] = s[2];                                                              \
+    s[3] = s[4];                                                              \
+    s[4] = s[5];                                                              \
+  }
+
+#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP                                     \
+  for (i = 0; i < (im_h - 2); i += 2) {                                        \
+    __m256i data = _mm256_castsi128_si256(                                     \
+        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));           \
+    data = _mm256_inserti128_si256(                                            \
+        data,                                                                  \
+        _mm_loadu_si128(                                                       \
+            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),           \
+        1);                                                                    \
+                                                                               \
+    __m256i res = convolve_lowbd_x_6tap(data, coeffs_h, filt);                 \
+    res =                                                                      \
+        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);              \
+  }                                                                            \
+                                                                               \
+  __m256i data_1 = _mm256_castsi128_si256(                                     \
+      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));             \
+                                                                               \
+  __m256i res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);                 \
+                                                                               \
+  res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+                                                                               \
+  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_6TAP                                      \
+  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
+  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
+  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
+  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
+                                                                              \
+  __m256i s[8];                                                               \
+  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
+  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
+                                                                              \
+  s[3] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
+  s[4] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
+                                                                              \
+  for (i = 0; i < h; i += 2) {                                                \
+    const int16_t *data = &im_block[i * im_stride];                           \
+                                                                              \
+    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+                                                                              \
+    s[2] = _mm256_unpacklo_epi16(s6, s7);                                     \
+    s[5] = _mm256_unpackhi_epi16(s6, s7);                                     \
+                                                                              \
+    __m256i res_a = convolve_6tap(s, coeffs_v);                               \
+    __m256i res_b = convolve_6tap(s + 3, coeffs_v);                           \
+                                                                              \
+    res_a =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);  \
+    res_b =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);  \
+                                                                              \
+    const __m256i res_a_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_a, round_const_v), round_shift_v);               \
+    const __m256i res_b_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_b, round_const_v), round_shift_v);               \
+                                                                              \
+    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);   \
+    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);         \
+                                                                              \
+    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                     \
+    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                \
+                                                                              \
+    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                 \
+    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];    \
+    if (w - j > 4) {                                                          \
+      _mm_storel_epi64(p_0, res_0);                                           \
+      _mm_storel_epi64(p_1, res_1);                                           \
+    } else if (w == 4) {                                                      \
+      xx_storel_32(p_0, res_0);                                               \
+      xx_storel_32(p_1, res_1);                                               \
+    } else {                                                                  \
+      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                  \
+      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                  \
+    }                                                                         \
+                                                                              \
+    s[0] = s[1];                                                              \
+    s[1] = s[2];                                                              \
+                                                                              \
+    s[3] = s[4];                                                              \
+    s[4] = s[5];                                                              \
+  }
+
 #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP                                     \
   for (i = 0; i < (im_h - 2); i += 2) {                                        \
     __m256i data = _mm256_castsi128_si256(                                     \
@@ -295,6 +455,49 @@
   coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
 }
 
+static INLINE void prepare_coeffs_6t_lowbd(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // right shift all filter co-efficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result.
+  // Since all filter co-efficients are even, this change will not affect the
+  // end result
+  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                            _mm_set1_epi16((int16_t)0xffff)));
+
+  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+  // coeffs 1 2 1 2 1 2 1 2
+  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
+  // coeffs 3 4 3 4 3 4 3 4
+  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
+  // coeffs 5 6 5 6 5 6 5 6
+  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
+}
+
+static INLINE void prepare_coeffs_6t(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+
+  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
+  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 1 2 1 2 1 2 1 2
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 3 4 3 4 3 4 3 4
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 5 6 5 6 5 6 5 6
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+}
+
 static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                   const int subpel_q4,
                                   __m256i *const coeffs /* [4] */) {
@@ -328,6 +531,19 @@
   return res;
 }
 
+static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s,
+                                          const __m256i *const coeffs) {
+  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+
+  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  const __m256i res =
+      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
+
+  return res;
+}
+
 static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
                                           const __m256i *const coeffs) {
   const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
@@ -339,6 +555,17 @@
   return res;
 }
 
+static INLINE __m256i convolve_6tap(const __m256i *const s,
+                                    const __m256i *const coeffs) {
+  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+
+  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
+
+  return res;
+}
+
 static INLINE __m256i convolve(const __m256i *const s,
                                const __m256i *const coeffs) {
   const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
@@ -374,6 +601,18 @@
   return convolve_lowbd(s, coeffs);
 }
 
+static INLINE __m256i convolve_lowbd_x_6tap(const __m256i data,
+                                            const __m256i *const coeffs,
+                                            const __m256i *const filt) {
+  __m256i s[4];
+
+  s[0] = _mm256_shuffle_epi8(data, filt[0]);
+  s[1] = _mm256_shuffle_epi8(data, filt[1]);
+  s[2] = _mm256_shuffle_epi8(data, filt[2]);
+
+  return convolve_lowbd_6tap(s, coeffs);
+}
+
 static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data,
                                             const __m256i *const coeffs,
                                             const __m256i *const filt) {
diff --git a/aom_dsp/x86/convolve_sse2.h b/aom_dsp/x86/convolve_sse2.h
index 385c7c7..ab254c0 100644
--- a/aom_dsp/x86/convolve_sse2.h
+++ b/aom_dsp/x86/convolve_sse2.h
@@ -9,6 +9,12 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <tmmintrin.h>  // SSSE3
+
+#include "av1/common/resize.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
 #ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
 #define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
 
@@ -118,4 +124,37 @@
   return res_round;
 }
 
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+                                        __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+                                        const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
+}
+
 #endif  // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/aom_dsp/x86/highbd_convolve_avx2.c b/aom_dsp/x86/highbd_convolve_avx2.c
index b43a7d7..0af7d87 100644
--- a/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/aom_dsp/x86/highbd_convolve_avx2.c
@@ -28,105 +28,13 @@
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };
 
-void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
-                                   uint8_t *dst8, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int filter_x_stride,
-                                   const int16_t *filter_y, int filter_y_stride,
-                                   int width, int h, int bd) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
-    do {
-      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
-      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
-      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
-      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
-      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 16) {  // width = 32
-    do {
-      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
-      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 8) {  // width = 16
-    __m256i p0, p1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      src += src_stride;
-      p1 = _mm256_loadu_si256((const __m256i *)src);
-      src += src_stride;
-
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else if (width > 4) {  // width = 8
-    __m128i p0, p1;
-    do {
-      p0 = _mm_loadu_si128((const __m128i *)src);
-      src += src_stride;
-      p1 = _mm_loadu_si128((const __m128i *)src);
-      src += src_stride;
-
-      _mm_storeu_si128((__m128i *)dst, p0);
-      dst += dst_stride;
-      _mm_storeu_si128((__m128i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else {  // width = 4
-    __m128i p0, p1;
-    do {
-      p0 = _mm_loadl_epi64((const __m128i *)src);
-      src += src_stride;
-      p1 = _mm_loadl_epi64((const __m128i *)src);
-      src += src_stride;
-
-      _mm_storel_epi64((__m128i *)dst, p0);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  }
-}
-
 void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_qn, const int subpel_y_qn,
-                                   ConvolveParams *conv_params, int bd) {
+                                   const int subpel_y_qn, int bd) {
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
   __m256i s[8], coeffs_y[4];
 
@@ -263,14 +171,11 @@
 void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_qn, const int subpel_y_qn,
+                                   const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd) {
   int i, j;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_qn;
-  (void)filter_params_y;
 
   // Check that, even with 12-bit input, the intermediate values will fit
   // into an unsigned 16-bit intermediate array.
diff --git a/aom_dsp/x86/highbd_convolve_ssse3.c b/aom_dsp/x86/highbd_convolve_ssse3.c
index a79350f..491c647 100644
--- a/aom_dsp/x86/highbd_convolve_ssse3.c
+++ b/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -18,21 +18,11 @@
 
 void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
-                                    const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_qn,
-                                    const int subpel_y_qn,
-                                    ConvolveParams *conv_params, int bd) {
+                                    const int subpel_y_qn, int bd) {
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
   __m128i s[16], coeffs_y[4];
 
@@ -167,15 +157,11 @@
 void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
-                                    const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
-                                    const int subpel_y_qn,
                                     ConvolveParams *conv_params, int bd) {
   int i, j;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_qn;
-  (void)filter_params_y;
 
   // Check that, even with 12-bit input, the intermediate values will fit
   // into an unsigned 16-bit intermediate array.
diff --git a/aom_dsp/x86/highbd_sad4d_sse2.asm b/aom_dsp/x86/highbd_sad4d_sse2.asm
index e0d2252..7ae1ca1 100644
--- a/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ b/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -215,15 +215,28 @@
 ; void aom_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro HIGH_SADNXN4D 2
+; Macro Arguments:
+;   1: Width
+;   2: Height
+;   3: If 0, then normal sad, if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0  ; normal sad
 %if UNIX64
 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
 %else
 cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                               ref2, ref3, ref4
-%endif
+%endif  ; UNIX64
+%else  ; %3 == 2, downsample
+%if UNIX64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif  ; UNIX64
+%endif  ; sad/avg/skip
 
 ; set m1
   push                srcq
@@ -232,6 +245,10 @@
   pshufd                m1, m1, 0x0
   pop                 srcq
 
+%if %3 == 2  ; skip rows
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
+%endif  ; skip rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
   mov                ref2q, [ref1q+gprsize*1]
@@ -247,9 +264,15 @@
   shl                ref1q, 1
 
   HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 2  ;  Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
   HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
 %endrep
+%undef rep
   HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
   ; N.B. HIGH_PROCESS outputs dwords (32 bits)
   ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
@@ -268,6 +291,9 @@
   paddd                 m4, m0
   paddd                 m6, m1
   punpcklqdq            m4, m6
+%if %3 == 2  ; skip rows
+  pslld                 m4, 1
+%endif
   movifnidn             r4, r4mp
   movu                [r4], m4
   RET
@@ -294,3 +320,25 @@
 HIGH_SADNXN4D 32,  8
 HIGH_SADNXN4D 16, 64
 HIGH_SADNXN4D 64, 16
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16,  8, 2
+HIGH_SADNXN4D  8, 16, 2
+HIGH_SADNXN4D  8,  8, 2
+HIGH_SADNXN4D  4,  8, 2
+HIGH_SADNXN4D  4, 16, 2
+HIGH_SADNXN4D  8, 32, 2
+HIGH_SADNXN4D 32,  8, 2
+HIGH_SADNXN4D 16, 64, 2
+HIGH_SADNXN4D 64, 16, 2
+
+; Current code cannot handle the case when the height is downsampled to 2
+; HIGH_SADNXN4D 16,  4, 2
+; HIGH_SADNXN4D  8,  4, 2
+; HIGH_SADNXN4D  4,  4, 2
diff --git a/aom_dsp/x86/highbd_sad_sse2.asm b/aom_dsp/x86/highbd_sad_sse2.asm
index 09e64d5..58f1ac9 100644
--- a/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/aom_dsp/x86/highbd_sad_sse2.asm
@@ -15,6 +15,11 @@
 
 SECTION .text
 
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
 %macro HIGH_SAD_FN 4
 %if %4 == 0
 %if %3 == 5
@@ -23,7 +28,7 @@
 cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
-%else ; avg
+%elif %4 == 1 ; avg
 %if %3 == 5
 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
@@ -38,7 +43,18 @@
 %define n_rowsd dword r0m
 %endif ; x86-32/64
 %endif ; %3 == 5/7
-%endif ; avg/sad
+%else  ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2  ; double the stride if we are skipping rows
+  lea          src_strided, [src_strided*2]
+  lea          ref_strided, [ref_strided*2]
+%endif
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 %if %3 == 7
@@ -57,7 +73,11 @@
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD64XN 1-2 0
   HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -149,6 +169,9 @@
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -156,16 +179,23 @@
 INIT_XMM sse2
 HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
 HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
+HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
 
 ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD32XN 1-2 0
   HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -217,6 +247,9 @@
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -225,17 +258,25 @@
 HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
 HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
 HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN  8 ; highbd_sad_32x8_sse2
 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
-HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2
-HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
+HIGH_SAD32XN  8, 1 ; highbd_sad_32x8_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
+HIGH_SAD32XN  8, 2 ; highbd_sad_skip_32x8_sse2
 
 ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD16XN 1-2 0
   HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/4
+%else
   mov              n_rowsd, %1/2
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -287,27 +328,40 @@
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
+HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
 HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
 HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
 HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN  4 ; highbd_sad_16x4_sse2
+HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
 HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
-HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
-HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
-HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
+HIGH_SAD16XN  4, 1 ; highbd_sad_16x4_avg_sse2
+HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
+; Current code fails there are only 2 rows
+; HIGH_SAD16XN  4, 2 ; highbd_sad_skip_16x4_sse2
 
 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD8XN 1-2 0
   HIGH_SAD_FN 8, %1, 7, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -359,25 +413,37 @@
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
+HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
 HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
 HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
 HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
 HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
-HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
-HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
+HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN  8, 2 ; highbd_sad_skip_8x8_sse2
+; Current code fails there are only 2 rows
+; HIGH_SAD8XN  4, 2 ; highbd_sad8x4_avg_sse2
 
 ; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD4XN 1-2 0
   HIGH_SAD_FN 4, %1, 7, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -429,6 +495,9 @@
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -440,3 +509,7 @@
 HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
 HIGH_SAD4XN  8, 1 ; highbd_sad4x8_avg_sse2
 HIGH_SAD4XN  4, 1 ; highbd_sad4x4_avg_sse2
+HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
+HIGH_SAD4XN  8, 2 ; highbd_sad_skip_4x8_sse2
+; Current code fails there are only 2 rows
+; HIGH_SAD4XN  4, 2 ; highbd_sad_skip_4x4_sse2
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c
index 9b1b4c9..3d76c78 100644
--- a/aom_dsp/x86/highbd_variance_avx2.c
+++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -13,11 +13,611 @@
 #include <immintrin.h>  // AVX2
 
 #include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
 
 typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);
 
+static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
+    const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
+    int dst_stride, uint32_t *sse) {
+  const __m256i filter1 =
+      _mm256_set1_epi32((uint32_t)(bilinear_filters_2t[xoffset][1] << 16) |
+                        bilinear_filters_2t[xoffset][0]);
+  const __m256i filter2 =
+      _mm256_set1_epi32((uint32_t)(bilinear_filters_2t[yoffset][1] << 16) |
+                        bilinear_filters_2t[yoffset][0]);
+  const __m256i one = _mm256_set1_epi16(1);
+  const uint32_t bitshift = (uint32_t)0x40;
+  (void)pixel_step;
+  unsigned int i, j, prev = 0, curr = 2;
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+  uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8);
+  uint16_t *src_ptr_ref = src_ptr;
+  uint16_t *dst_ptr_ref = dst_ptr;
+  int64_t sum_long = 0;
+  uint64_t sse_long = 0;
+  unsigned int rshift = 0, inc = 1;
+  __m256i rbias = _mm256_set1_epi32(bitshift);
+  __m256i opointer[8];
+  unsigned int range;
+  if (xoffset == 0) {
+    if (yoffset == 0) {  // xoffset==0 && yoffset==0
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+        }
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+        for (i = 0; i < 16 / inc; ++i) {
+          __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+
+    } else if (yoffset == 4) {  // xoffset==0 && yoffset==4
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+
+          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+
+          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+
+    } else {  // xoffset==0 && yoffset==1,2,3,5,6,7
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+
+          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+
+          __m256i V_S_M1 =
+              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+          __m256i V_S_M2 =
+              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+          __m256i V_S_S1 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+          __m256i V_S_S2 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+    }
+  } else if (xoffset == 4) {
+    if (yoffset == 0) {  // xoffset==4 && yoffset==0
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+
+          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+
+          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+          __m256i V_S_M1 =
+              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+          __m256i V_S_M2 =
+              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+          __m256i V_S_S1 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+          __m256i V_S_S2 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+
+    } else if (yoffset == 4) {  // xoffset==4 && yoffset==4
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+
+          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+
+    } else {  // xoffset==4 && yoffset==1,2,3,5,6,7
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+
+          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+          __m256i V_S_M1 =
+              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+          __m256i V_S_M2 =
+              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+          __m256i V_S_S1 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+          __m256i V_S_S2 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+    }
+  } else if (yoffset == 0) {  // xoffset==1,2,3,5,6,7 && yoffset==0
+    range = output_width / 16;
+    if (output_height == 8) inc = 2;
+    if (output_height == 4) inc = 4;
+    for (j = 0; j < range * output_height * inc / 16; j++) {
+      if (j % (output_height * inc / 16) == 0) {
+        src_ptr = src_ptr_ref;
+        src_ptr_ref += 16;
+        dst_ptr = dst_ptr_ref;
+        dst_ptr_ref += 16;
+
+        curr = 0;
+      }
+
+      __m256i sum1 = _mm256_setzero_si256();
+      __m256i sse1 = _mm256_setzero_si256();
+
+      for (i = 0; i < 16 / inc; ++i) {
+        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+        __m256i V_V_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+        __m256i V_V_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+        dst_ptr += dst_stride;
+        __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST);
+        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+      }
+
+      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+      sum_long += _mm_extract_epi32(v_d, 0);
+      sse_long += _mm_extract_epi32(v_d, 1);
+    }
+
+    rshift = get_msb(output_height) + get_msb(output_width);
+
+  } else if (yoffset == 4) {  // xoffset==1,2,3,5,6,7 && yoffset==4
+
+    range = output_width / 16;
+    if (output_height == 8) inc = 2;
+    if (output_height == 4) inc = 4;
+    for (j = 0; j < range * output_height * inc / 16; j++) {
+      if (j % (output_height * inc / 16) == 0) {
+        src_ptr = src_ptr_ref;
+        src_ptr_ref += 16;
+        dst_ptr = dst_ptr_ref;
+        dst_ptr_ref += 16;
+
+        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+
+        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+        __m256i V_H_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+        __m256i V_H_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+        curr = 0;
+      }
+
+      __m256i sum1 = _mm256_setzero_si256();
+      __m256i sse1 = _mm256_setzero_si256();
+
+      for (i = 0; i < 16 / inc; ++i) {
+        prev = curr;
+        curr = (curr == 0) ? 1 : 0;
+        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+        __m256i V_V_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+        __m256i V_V_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+        __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]);
+
+        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+        dst_ptr += dst_stride;
+
+        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+      }
+
+      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+      sum_long += _mm_extract_epi32(v_d, 0);
+      sse_long += _mm_extract_epi32(v_d, 1);
+    }
+
+    rshift = get_msb(output_height) + get_msb(output_width);
+
+  } else {  // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
+    range = output_width / 16;
+    if (output_height == 8) inc = 2;
+    if (output_height == 4) inc = 4;
+    unsigned int nloop = 16 / inc;
+    for (j = 0; j < range * output_height * inc / 16; j++) {
+      if (j % (output_height * inc / 16) == 0) {
+        src_ptr = src_ptr_ref;
+        src_ptr_ref += 16;
+        dst_ptr = dst_ptr_ref;
+        dst_ptr_ref += 16;
+
+        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+
+        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+        __m256i V_H_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+        __m256i V_H_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+        curr = 0;
+      }
+
+      __m256i sum1 = _mm256_setzero_si256();
+      __m256i sse1 = _mm256_setzero_si256();
+
+      for (i = 0; i < nloop; ++i) {
+        prev = curr;
+        curr = !curr;
+        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+        __m256i V_V_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+        __m256i V_V_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+        __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+        __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+        __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+        __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+        __m256i V_S_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+        __m256i V_S_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+        __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+        dst_ptr += dst_stride;
+
+        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+      }
+
+      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+      sum_long += _mm_extract_epi32(v_d, 0);
+      sse_long += _mm_extract_epi32(v_d, 1);
+    }
+
+    rshift = get_msb(output_height) + get_msb(output_width);
+  }
+
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+  int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+
+  int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);
+
+  return (var > 0) ? var : 0;
+}
+
 void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
                                 const uint16_t *ref, int ref_stride,
                                 uint32_t *sse, int *sum) {
@@ -129,12 +729,172 @@
 VAR_FN(16, 32, 16, 9);
 VAR_FN(16, 16, 16, 8);
 VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
 VAR_FN(16, 4, 16, 6);
 VAR_FN(8, 32, 8, 8);
 VAR_FN(32, 8, 8, 8);
 VAR_FN(16, 64, 16, 10);
 VAR_FN(64, 16, 16, 10);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
 
 #undef VAR_FN
+
+#define SSE2_Height(H)                                                 \
+  uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2(               \
+      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr);
+
+SSE2_Height(8);
+SSE2_Height(16);
+#undef SSE2_Height
+
+#define HIGHBD_SUBPIX_VAR(W, H)                                              \
+  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
+    if (W == 8 && H == 16)                                                   \
+      return aom_highbd_10_sub_pixel_variance8x16_sse2(                      \
+          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
+    else if (W == 8 && H == 8)                                               \
+      return aom_highbd_10_sub_pixel_variance8x8_sse2(                       \
+          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
+    else                                                                     \
+      return aom_highbd_var_filter_block2d_bil_avx2(                         \
+          src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
+  }
+
+HIGHBD_SUBPIX_VAR(128, 128);
+HIGHBD_SUBPIX_VAR(128, 64);
+HIGHBD_SUBPIX_VAR(64, 128);
+HIGHBD_SUBPIX_VAR(64, 64);
+HIGHBD_SUBPIX_VAR(64, 32);
+HIGHBD_SUBPIX_VAR(32, 64);
+HIGHBD_SUBPIX_VAR(32, 32);
+HIGHBD_SUBPIX_VAR(32, 16);
+HIGHBD_SUBPIX_VAR(16, 32);
+HIGHBD_SUBPIX_VAR(16, 16);
+HIGHBD_SUBPIX_VAR(16, 8);
+HIGHBD_SUBPIX_VAR(8, 16);
+HIGHBD_SUBPIX_VAR(8, 8);
+#undef HIGHBD_SUBPIX_VAR
+
+uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
+  __m256i src0_8x16, src1_8x16, src_16x16;
+  __m256i dst0_8x16, dst1_8x16, dst_16x16;
+  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m256i sub_result;
+  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  for (int i = 0; i < h; i += 4) {
+    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride]));
+    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride]));
+    dst0_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+    dst1_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+    src0_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+    src1_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+    square_result = _mm256_add_epi64(
+        square_result,
+        _mm256_add_epi64(
+            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+  const __m128i sum_2x64 =
+      _mm_add_epi64(_mm256_castsi256_si128(square_result),
+                    _mm256_extracti128_si256(square_result, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  uint64_t sum = 0;
+  __m256i src0_8x16, src1_8x16, src_16x16;
+  __m256i dst0_8x16, dst1_8x16, dst_16x16;
+  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m256i sub_result;
+  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+  for (int i = 0; i < h; i += 2) {
+    dst0_8x16 =
+        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride]));
+    dst1_8x16 = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride]));
+    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+    src0_8x16 =
+        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+    src1_8x16 = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+    square_result = _mm256_add_epi64(
+        square_result,
+        _mm256_add_epi64(
+            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+
+  const __m128i sum_2x64 =
+      _mm_add_epi64(_mm256_castsi256_si128(square_result),
+                    _mm256_extracti128_si256(square_result, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  switch (w) {
+    case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+    case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+    default: assert(0 && "unsupported width"); return -1;
+  }
+}
diff --git a/aom_dsp/x86/highbd_variance_impl_sse2.asm b/aom_dsp/x86/highbd_variance_impl_sse2.asm
index 0d954e1..ec6c7e9 100644
--- a/aom_dsp/x86/highbd_variance_impl_sse2.asm
+++ b/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -25,7 +25,7 @@
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(aom_highbd_calc16x16var_sse2) PRIVATE
+globalsym(aom_highbd_calc16x16var_sse2)
 sym(aom_highbd_calc16x16var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -178,7 +178,7 @@
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(aom_highbd_calc8x8var_sse2) PRIVATE
+globalsym(aom_highbd_calc8x8var_sse2)
 sym(aom_highbd_calc8x8var_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index b7d15f9..d1bd7d4 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -840,3 +840,100 @@
     pred += 8;
   }
 }
+
+uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i reg0_4x16, reg1_4x16;
+  __m128i src_8x16;
+  __m128i dst_8x16;
+  __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m128i sub_result_8x16;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+  for (int i = 0; i < h; i += 2) {
+    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+    dst_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
+
+    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    src_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
+
+    sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
+
+    res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
+    res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
+
+    res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
+    res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
+
+    res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
+    res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
+    res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
+    res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
+
+    square_result = _mm_add_epi64(
+        square_result,
+        _mm_add_epi64(
+            _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+
+  const __m128i sum_1x64 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i src_8x16;
+  __m128i dst_8x16;
+  __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m128i sub_result_8x16;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+
+  for (int i = 0; i < h; i++) {
+    dst_8x16 = _mm_loadu_si128((__m128i *)&dst[i * dstride]);
+    src_8x16 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
+
+    sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
+
+    res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
+    res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
+
+    res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
+    res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
+
+    res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
+    res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
+    res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
+    res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
+
+    square_result = _mm_add_epi64(
+        square_result,
+        _mm_add_epi64(
+            _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+
+  const __m128i sum_1x64 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  switch (w) {
+    case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+    case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+    default: assert(0 && "unsupported width"); return -1;
+  }
+}
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 546ee74..58789c3 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -1989,7 +1989,7 @@
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x) >> upsample_above;
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -2115,7 +2115,7 @@
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x) >> upsample_above;
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -2237,7 +2237,7 @@
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x) >> upsample_above;
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -2387,7 +2387,7 @@
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x) >> upsample_above;
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index c8b02f5..b74f4bf 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -184,9 +184,12 @@
 DIST_WTD_SUBPIX_AVG_VAR(8, 4)
 DIST_WTD_SUBPIX_AVG_VAR(4, 8)
 DIST_WTD_SUBPIX_AVG_VAR(4, 4)
+
+#if !CONFIG_REALTIME_ONLY
 DIST_WTD_SUBPIX_AVG_VAR(4, 16)
 DIST_WTD_SUBPIX_AVG_VAR(16, 4)
 DIST_WTD_SUBPIX_AVG_VAR(8, 32)
 DIST_WTD_SUBPIX_AVG_VAR(32, 8)
 DIST_WTD_SUBPIX_AVG_VAR(16, 64)
 DIST_WTD_SUBPIX_AVG_VAR(64, 16)
+#endif
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
index fa93f0d..bfd86ee 100644
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -1052,12 +1052,14 @@
       mask += (mask_stride << 1);
       i += 2;
     } while (i < height);
-  } else {  // width == 32
-    assert(width == 32);
+  } else {
     do {
-      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
-      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
-      comp_pred += (width);
+      for (int x = 0; x < width; x += 32) {
+        comp_mask_pred_16_ssse3(src0 + x, src1 + x, mask + x, comp_pred);
+        comp_mask_pred_16_ssse3(src0 + x + 16, src1 + x + 16, mask + x + 16,
+                                comp_pred + 16);
+        comp_pred += 32;
+      }
       src0 += (stride0);
       src1 += (stride1);
       mask += (mask_stride);
diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h
index 6c82167..dacb613 100644
--- a/aom_dsp/x86/mem_sse2.h
+++ b/aom_dsp/x86/mem_sse2.h
@@ -13,11 +13,34 @@
 #define AOM_AOM_DSP_X86_MEM_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
+#include <string.h>
 
 #include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 
+static INLINE uint16_t loadu_uint16(const void *src) {
+  uint16_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE uint32_t loadu_uint32(const void *src) {
+  uint32_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE uint64_t loadu_uint64(const void *src) {
+  uint64_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
+}
+
 static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
   return _mm_castps_si128(
       _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
@@ -25,10 +48,10 @@
 
 static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
                                                   const int byte_stride) {
-  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+  return _mm_setr_epi32(loadu_uint32((int8_t *)src + 0 * byte_stride),
+                        loadu_uint32((int8_t *)src + 1 * byte_stride),
+                        loadu_uint32((int8_t *)src + 2 * byte_stride),
+                        loadu_uint32((int8_t *)src + 3 * byte_stride));
 }
 
 static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
@@ -39,4 +62,106 @@
   return dst;
 }
 
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+                                            uint8_t *const d,
+                                            const ptrdiff_t stride) {
+  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+                                  const ptrdiff_t stride) {
+  *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
+  *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
+  *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
+  *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
+}
+
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+                                       const ptrdiff_t stride) {
+  __m128i ss[4];
+
+  ss[0] = s;
+  ss[1] = _mm_srli_si128(s, 4);
+  ss[2] = _mm_srli_si128(s, 8);
+  ss[3] = _mm_srli_si128(s, 12);
+  store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
+  d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
+  d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
+  d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  load_8bit_4x4(s + 0 * stride, stride, &d[0]);
+  load_8bit_4x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  load_8bit_8x4(s + 0 * stride, stride, &d[0]);
+  load_8bit_8x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+                                  const ptrdiff_t stride, __m128i *const d) {
+  d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
+  d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
+  d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
+  d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
+  d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+                                  const ptrdiff_t stride) {
+  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
+  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
+  _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
+  _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
+  _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
+  _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
+  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
+}
+
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+                                    const ptrdiff_t stride) {
+  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
 #endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/aom_dsp/x86/quantize_avx_x86_64.asm b/aom_dsp/x86/quantize_avx_x86_64.asm
deleted file mode 100644
index d6e15c4..0000000
--- a/aom_dsp/x86/quantize_avx_x86_64.asm
+++ /dev/null
@@ -1,464 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-
-  vzeroupper
-
-%ifnidn %1, b_32x32
-
-  ; Special case for ncoeff == 16, as it is frequent and we can save on
-  ; not setting up a loop.
-  cmp                       ncoeffmp, 16
-  jne .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Special case of ncoeff == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.single:
-
-  movifnidn                   coeffq, coeffmp
-  movifnidn                    zbinq, zbinmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-
-  ; Get DC and first 15 AC coeffs - in this special case, that is all.
-  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
-  mova                            m9, [coeffq]
-  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
-  mova                           m10, [coeffq+32]
-  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
-
-  mov                             r0, eobmp                ; Output pointer
-  mov                             r1, qcoeffmp             ; Output pointer
-  mov                             r2, dqcoeffmp            ; Output pointer
-
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  pcmpeqw                         m4, m4                   ; All word lanes -1
-  paddw                           m0, m4                   ; m0 = zbin - 1
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, we just write zeros
-  ; to the outputs and we are done.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .single_nonzero
-
-  mova                       [r1   ], ymm5
-  mova                       [r1+32], ymm5
-  mova                       [r2   ], ymm5
-  mova                       [r2+32], ymm5
-  mov                           [r0], word 0
-
-  vzeroupper
-  RET
-
-.single_nonzero:
-
-  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
-  movifnidn                       r3, roundmp
-  movifnidn                       r4, quantmp
-  mov                             r6, dequantmp
-  mov                             r5, shiftmp
-  mova                            m1, [r3]              ; m1 = round
-  mova                            m2, [r4]              ; m2 = quant
-  mova                            m3, [r6]              ; m3 = dequant
-  mova                            m4, [r5]              ; m4 = shift
-
-  mov                             r3, iscanmp
-
-  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                  [qcoeffq   ], m11
-  mova                  [qcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                  [qcoeffq+32], m11
-  mova                  [qcoeffq+48], m6
-
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                 [dqcoeffq   ], m11
-  mova                 [dqcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                 [dqcoeffq+32], m11
-  mova                 [dqcoeffq+48], m6
-
-  mova                            m6, [iscanq]            ; m6 = scan[i]
-  mova                           m11, [iscanq+16]         ; m11 = scan[i]
-
-  pcmpeqw                         m8,  m8,  m5            ; m8 = c[i] == 0
-  pcmpeqw                        m13, m13,  m5            ; m13 = c[i] == 0
-  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
-  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
-  pandn                           m8,  m8,  m6            ; m8 = max(eob)
-  pandn                          m13, m13, m11            ; m13 = max(eob)
-  pmaxsw                          m8,  m8, m13
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                         [eobq], ax
-
-  vzeroupper
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of ncoeff != 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.generic:
-
-%endif ; %ifnidn %1, b_32x32
-
-DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  ; Actual quantization loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  movifnidn                 dequantq, dequantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-  mova                            m3, [dequantq]           ; m3 = dequant
-  pcmpeqw                         m4, m4                   ; All lanes -1
-%ifidn %1, b_32x32
-  psubw                           m0, m4
-  psubw                           m1, m4
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  paddw                           m0, m4                   ; m0 = m0 + 1
-
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]            ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-  pxor                            m5, m5              ; m5 = dedicated zero
-
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
-
-
-  lea                         coeffq, [  coeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  ; coeff stored as 32bit numbers & require 16bit numbers
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .first_nonzero
-
-  mova        [qcoeffq+ncoeffq*4   ], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4   ], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-  add                        ncoeffq, mmsize
-
-  punpckhqdq                      m1, m1
-  punpckhqdq                      m2, m2
-  punpckhqdq                      m3, m3
-  punpckhqdq                      m4, m4
-  pxor                            m8, m8
-
-  jmp .ac_only_loop
-
-.first_nonzero:
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  %ifidn %1, b_32x32
-  pmullw                          m5, m8, m4               ; store the lower 16 bits of m8*qsh
-  %endif
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  %ifidn %1, b_32x32
-  psllw                           m8, 1
-  psrlw                           m5, 15
-  por                             m8, m5
-  %endif
-  punpckhqdq                      m4, m4
-  %ifidn %1, b_32x32
-  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
-  %endif
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  %ifidn %1, b_32x32
-  psllw                          m13, 1
-  psrlw                           m5, 15
-  por                            m13, m5
-  pxor                            m5, m5                   ; reset m5 to zero register
-  %endif
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-
-  pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                           m8, m6                    ; m8 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-
-.ac_only_loop:
-
-  ; pack coeff from 32bit to 16bit array
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip this itertion.
-  ; And just write zeros as the result would be.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .rest_nonzero
-
-  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-
-.rest_nonzero:
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  %ifidn %1, b_32x32
-  pmullw                          m5, m14, m4              ; store the lower 16 bits of m14*qsh
-  %endif
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  %ifidn %1, b_32x32
-  psllw                          m14, 1
-  psrlw                           m5, 15
-  por                            m14, m5
-  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
-  %endif
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  %ifidn %1, b_32x32
-  psllw                          m13, 1
-  psrlw                           m5, 15
-  por                            m13, m5
-  pxor                            m5, m5                   ; reset m5 to zero register
-  %endif
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-
-  pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                          m14, m6                    ; m14 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-%endmacro
-
-INIT_XMM avx
-QUANTIZE_FN b, 9
-QUANTIZE_FN b_32x32, 9
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c
index 0771252..0e0b904 100644
--- a/aom_dsp/x86/sad4d_avx2.c
+++ b/aom_dsp/x86/sad4d_avx2.c
@@ -104,3 +104,28 @@
 
 sadMxN_avx2(128, 64);
 sadMxN_avx2(128, 128);
+
+#define sad_skip_MxN_avx2(m, n)                                             \
+  void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+                                        const uint8_t *const ref[4],        \
+                                        int ref_stride, uint32_t res[4]) {  \
+    aom_sadMxNx4d_avx2(m, ((n) >> 1), src, 2 * src_stride, ref,             \
+                       2 * ref_stride, res);                                \
+    res[0] <<= 1;                                                           \
+    res[1] <<= 1;                                                           \
+    res[2] <<= 1;                                                           \
+    res[3] <<= 1;                                                           \
+  }
+
+sad_skip_MxN_avx2(32, 8);
+sad_skip_MxN_avx2(32, 16);
+sad_skip_MxN_avx2(32, 32);
+sad_skip_MxN_avx2(32, 64);
+
+sad_skip_MxN_avx2(64, 16);
+sad_skip_MxN_avx2(64, 32);
+sad_skip_MxN_avx2(64, 64);
+sad_skip_MxN_avx2(64, 128);
+
+sad_skip_MxN_avx2(128, 64);
+sad_skip_MxN_avx2(128, 128);
diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm
index a904374..343a14d 100644
--- a/aom_dsp/x86/sad4d_sse2.asm
+++ b/aom_dsp/x86/sad4d_sse2.asm
@@ -312,9 +312,21 @@
 ; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2-3 0
-%if %3 == 0
+; Macro Arguments:
+;   1: Width
+;   2: Height
+;   3: If 0, then normal sad, else avg
+;   4: If 0, then normal sad, else skip rows
+%macro SADNXN4D 2-4 0,0
+%if %4 == 1  ; skip rows
+%if UNIX64
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+%elif %3 == 0  ; normal sad
 %if UNIX64
 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
@@ -323,7 +335,6 @@
                               ref2, ref3, ref4
 %endif
 %else ; avg
-
 %if UNIX64
 cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \
                                   second_pred, res, ref2, ref3, ref4
@@ -336,6 +347,10 @@
 %endif
 
   %define mflag ((1 - UNIX64) & %3)
+%if %4 == 1
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
+%endif
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 
@@ -345,9 +360,15 @@
   mov                ref1q, [ref1q+gprsize*0]
 
   PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2
-%rep (%2-4)/2
+%if %4 == 1  ; downsample number of rows by 2
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
   PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
 %endrep
+%undef num_rep
   PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
 
 %if %3 == 0
@@ -368,12 +389,19 @@
   punpcklqdq            m4, m6
   punpckhqdq            m5, m7
   paddd                 m4, m5
+%if %4 == 1
+  pslld                 m4, 1
+%endif
   movifnidn             resultq, resultmp
   movu                [resultq], m4
   RET
 %else
   pshufd            m6, m6, 0x08
   pshufd            m7, m7, 0x08
+%if %4 == 1
+  pslld                 m6, 1
+  pslld                 m7, 1
+%endif
   movifnidn             resultq, resultmp
   movq              [resultq+0], m6
   movq              [resultq+8], m7
@@ -383,46 +411,76 @@
 
 INIT_XMM sse2
 SADNXN4D 128, 128
-SADNXN4D 128, 64
-SADNXN4D 64,  128
-SADNXN4D 64, 64
-SADNXN4D 64, 32
-SADNXN4D 32, 64
-SADNXN4D 32, 32
-SADNXN4D 32, 16
-SADNXN4D 16, 32
-SADNXN4D 16, 16
-SADNXN4D 16,  8
-SADNXN4D  8, 16
-SADNXN4D  8,  8
-SADNXN4D  8,  4
-SADNXN4D  4,  8
-SADNXN4D  4,  4
-SADNXN4D  4, 16
-SADNXN4D 16,  4
-SADNXN4D  8, 32
-SADNXN4D 32,  8
-SADNXN4D 16, 64
-SADNXN4D 64, 16
+SADNXN4D 128,  64
+SADNXN4D  64, 128
+SADNXN4D  64,  64
+SADNXN4D  64,  32
+SADNXN4D  32,  64
+SADNXN4D  32,  32
+SADNXN4D  32,  16
+SADNXN4D  16,  32
+SADNXN4D  16,  16
+SADNXN4D  16,   8
+SADNXN4D   8,  16
+SADNXN4D   8,   8
+SADNXN4D   8,   4
+SADNXN4D   4,   8
+SADNXN4D   4,   4
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D   4,  16
+SADNXN4D  16,   4
+SADNXN4D   8,  32
+SADNXN4D  32,   8
+SADNXN4D  16,  64
+SADNXN4D  64,  16
+%endif
 SADNXN4D 128, 128, 1
-SADNXN4D 128, 64, 1
-SADNXN4D 64,  128, 1
-SADNXN4D 64, 64, 1
-SADNXN4D 64, 32, 1
-SADNXN4D 32, 64, 1
-SADNXN4D 32, 32, 1
-SADNXN4D 32, 16, 1
-SADNXN4D 16, 32, 1
-SADNXN4D 16, 16, 1
-SADNXN4D 16,  8, 1
-SADNXN4D  8, 16, 1
-SADNXN4D  8,  8, 1
-SADNXN4D  8,  4, 1
-SADNXN4D  4,  8, 1
-SADNXN4D  4,  4, 1
-SADNXN4D  4, 16, 1
-SADNXN4D 16,  4, 1
-SADNXN4D  8, 32, 1
-SADNXN4D 32,  8, 1
-SADNXN4D 16, 64, 1
-SADNXN4D 64, 16, 1
+SADNXN4D 128,  64, 1
+SADNXN4D  64, 128, 1
+SADNXN4D  64,  64, 1
+SADNXN4D  64,  32, 1
+SADNXN4D  32,  64, 1
+SADNXN4D  32,  32, 1
+SADNXN4D  32,  16, 1
+SADNXN4D  16,  32, 1
+SADNXN4D  16,  16, 1
+SADNXN4D  16,   8, 1
+SADNXN4D   8,  16, 1
+SADNXN4D   8,   8, 1
+SADNXN4D   8,   4, 1
+SADNXN4D   4,   8, 1
+SADNXN4D   4,   4, 1
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D   4,  16, 1
+SADNXN4D  16,   4, 1
+SADNXN4D   8,  32, 1
+SADNXN4D  32,   8, 1
+SADNXN4D  16,  64, 1
+SADNXN4D  64,  16, 1
+%endif
+SADNXN4D 128, 128, 0, 1
+SADNXN4D 128,  64, 0, 1
+SADNXN4D  64, 128, 0, 1
+SADNXN4D  64,  64, 0, 1
+SADNXN4D  64,  32, 0, 1
+SADNXN4D  32,  64, 0, 1
+SADNXN4D  32,  32, 0, 1
+SADNXN4D  32,  16, 0, 1
+SADNXN4D  16,  32, 0, 1
+SADNXN4D  16,  16, 0, 1
+SADNXN4D  16,   8, 0, 1
+SADNXN4D   8,  16, 0, 1
+SADNXN4D   8,   8, 0, 1
+SADNXN4D   4,   8, 0, 1
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D   4,  16, 0, 1
+SADNXN4D   8,  32, 0, 1
+SADNXN4D  32,   8, 0, 1
+SADNXN4D  16,  64, 0, 1
+SADNXN4D  64,  16, 0, 1
+%endif
+
+; Different assembly is needed when the height gets subsampled to 2
+; SADNXN4D 16,  4, 0, 1
+; SADNXN4D  8,  4, 0, 1
+; SADNXN4D  4,  4, 0, 1
diff --git a/aom_dsp/x86/sad_avx2.c b/aom_dsp/x86/sad_avx2.c
index a50dba6..9ab9812 100644
--- a/aom_dsp/x86/sad_avx2.c
+++ b/aom_dsp/x86/sad_avx2.c
@@ -14,76 +14,106 @@
 
 #include "aom_ports/mem.h"
 
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i, res;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  for (i = 0; i < h; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  res = _mm_cvtsi128_si32(sum_sad128);
+  _mm256_zeroupper();
+  return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i, res;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  int ref2_stride = ref_stride << 1;
+  int src2_stride = src_stride << 1;
+  int max = h >> 1;
+  for (i = 0; i < max; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref2_stride;
+    src_ptr += src2_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  res = _mm_cvtsi128_si32(sum_sad128);
+  _mm256_zeroupper();
+  return res;
+}
+
 #define FSAD64_H(h)                                                           \
   unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                     const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
+    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS64_H(h)                                                          \
+  unsigned int aom_sad_skip_64x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
   }
 
 #define FSAD32_H(h)                                                           \
   unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                     const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
+    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
   }
 
-#define FSAD64  \
-  FSAD64_H(64); \
-  FSAD64_H(32);
+#define FSADS32_H(h)                                                          \
+  unsigned int aom_sad_skip_32x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
 
-#define FSAD32  \
-  FSAD32_H(64); \
-  FSAD32_H(32); \
-  FSAD32_H(16);
+#define FSAD64   \
+  FSAD64_H(64);  \
+  FSAD64_H(32);  \
+  FSADS64_H(64); \
+  FSADS64_H(32);
+
+#define FSAD32   \
+  FSAD32_H(64);  \
+  FSAD32_H(32);  \
+  FSAD32_H(16);  \
+  FSADS32_H(64); \
+  FSADS32_H(32); \
+  FSADS32_H(16);
 
 /* clang-format off */
 FSAD64
diff --git a/aom_dsp/x86/sad_highbd_avx2.c b/aom_dsp/x86/sad_highbd_avx2.c
index 2cff2e6..7d0626a 100644
--- a/aom_dsp/x86/sad_highbd_avx2.c
+++ b/aom_dsp/x86/sad_highbd_avx2.c
@@ -259,6 +259,14 @@
     return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \
   }
 
+#define highbd_sad_skip_MxN_avx2(m, n)                                       \
+  unsigned int aom_highbd_sad_skip_##m##x##n##_avx2(                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride) {                                                      \
+    return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \
+                                          2 * ref_stride);                   \
+  }
+
 highbd_sadMxN_avx2(16, 4);
 highbd_sadMxN_avx2(16, 8);
 highbd_sadMxN_avx2(16, 16);
@@ -278,6 +286,24 @@
 highbd_sadMxN_avx2(128, 64);
 highbd_sadMxN_avx2(128, 128);
 
+highbd_sad_skip_MxN_avx2(16, 8);
+highbd_sad_skip_MxN_avx2(16, 16);
+highbd_sad_skip_MxN_avx2(16, 32);
+highbd_sad_skip_MxN_avx2(16, 64);
+
+highbd_sad_skip_MxN_avx2(32, 8);
+highbd_sad_skip_MxN_avx2(32, 16);
+highbd_sad_skip_MxN_avx2(32, 32);
+highbd_sad_skip_MxN_avx2(32, 64);
+
+highbd_sad_skip_MxN_avx2(64, 16);
+highbd_sad_skip_MxN_avx2(64, 32);
+highbd_sad_skip_MxN_avx2(64, 64);
+highbd_sad_skip_MxN_avx2(64, 128);
+
+highbd_sad_skip_MxN_avx2(128, 64);
+highbd_sad_skip_MxN_avx2(128, 128);
+
 unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          const uint8_t *second_pred) {
@@ -678,6 +704,17 @@
     aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \
                                   sad_array);                                \
   }
+#define highbd_sad_skip_MxNx4d_avx2(m, n)                                   \
+  void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                             \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
+      int ref_stride, uint32_t *sad_array) {                                \
+    aom_highbd_sad##m##xNx4d_avx2((n / 2), src, 2 * src_stride, ref_array,  \
+                                  2 * ref_stride, sad_array);               \
+    sad_array[0] <<= 1;                                                     \
+    sad_array[1] <<= 1;                                                     \
+    sad_array[2] <<= 1;                                                     \
+    sad_array[3] <<= 1;                                                     \
+  }
 
 highbd_sadMxNx4d_avx2(16, 4);
 highbd_sadMxNx4d_avx2(16, 8);
@@ -697,3 +734,21 @@
 
 highbd_sadMxNx4d_avx2(128, 64);
 highbd_sadMxNx4d_avx2(128, 128);
+
+highbd_sad_skip_MxNx4d_avx2(16, 8);
+highbd_sad_skip_MxNx4d_avx2(16, 16);
+highbd_sad_skip_MxNx4d_avx2(16, 32);
+highbd_sad_skip_MxNx4d_avx2(16, 64);
+
+highbd_sad_skip_MxNx4d_avx2(32, 8);
+highbd_sad_skip_MxNx4d_avx2(32, 16);
+highbd_sad_skip_MxNx4d_avx2(32, 32);
+highbd_sad_skip_MxNx4d_avx2(32, 64);
+
+highbd_sad_skip_MxNx4d_avx2(64, 16);
+highbd_sad_skip_MxNx4d_avx2(64, 32);
+highbd_sad_skip_MxNx4d_avx2(64, 64);
+highbd_sad_skip_MxNx4d_avx2(64, 128);
+
+highbd_sad_skip_MxNx4d_avx2(128, 64);
+highbd_sad_skip_MxNx4d_avx2(128, 128);
diff --git a/aom_dsp/x86/sad_impl_avx2.c b/aom_dsp/x86/sad_impl_avx2.c
index f77a585..2afae4b 100644
--- a/aom_dsp/x86/sad_impl_avx2.c
+++ b/aom_dsp/x86/sad_impl_avx2.c
@@ -84,6 +84,30 @@
   return sum;
 }
 
+unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const uint32_t half_width = 64;
+  uint32_t sum = sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+  src_ptr += half_width;
+  ref_ptr += half_width;
+  sum += sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+  return 2 * sum;
+}
+
+unsigned int aom_sad_skip_64x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const uint32_t sum =
+      sad64x64(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride);
+  return 2 * sum;
+}
+
+unsigned int aom_sad_skip_128x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride) {
+  const uint32_t sum =
+      aom_sad128x64_avx2(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride);
+  return 2 * sum;
+}
+
 static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      const int h, const uint8_t *second_pred,
diff --git a/aom_dsp/x86/sad_sse2.asm b/aom_dsp/x86/sad_sse2.asm
index 3251b76..de9845a 100644
--- a/aom_dsp/x86/sad_sse2.asm
+++ b/aom_dsp/x86/sad_sse2.asm
@@ -15,15 +15,29 @@
 
 SECTION .text
 
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
 %macro SAD_FN 4
-%if %4 == 0
+%if %4 == 0 ; normal sad
 %if %3 == 5
 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
-%else ; avg
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
 %if %3 == 5
 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
@@ -38,7 +52,11 @@
 %define n_rowsd dword r0m
 %endif ; x86-32/64
 %endif ; %3 == 5/7
-%endif ; avg/sad
+%endif ; sad/avg/skip
+%if %4 == 2; skip rows so double the stride
+lea           src_strided, [src_strided*2]
+lea           ref_strided, [ref_strided*2]
+%endif ; %4 skip
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 %if %3 == 7
@@ -51,7 +69,11 @@
 ;                                  uint8_t *ref, int ref_stride);
 %macro SAD128XN 1-2 0
   SAD_FN 128, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -104,6 +126,9 @@
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -111,15 +136,21 @@
 INIT_XMM sse2
 SAD128XN 128     ; sad128x128_sse2
 SAD128XN 128, 1  ; sad128x128_avg_sse2
+SAD128XN 128, 2  ; sad128x128_skip_sse2
 SAD128XN 64      ; sad128x64_sse2
 SAD128XN 64, 1   ; sad128x64_avg_sse2
+SAD128XN 64, 2   ; sad128x64_skip_sse2
 
 
 ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD64XN 1-2 0
   SAD_FN 64, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -148,25 +179,36 @@
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
 SAD64XN 128     ; sad64x128_sse2
+SAD64XN  64     ; sad64x64_sse2
+SAD64XN  32     ; sad64x32_sse2
+SAD64XN  16     ; sad64x16_sse2
 SAD64XN 128, 1  ; sad64x128_avg_sse2
-SAD64XN 64 ; sad64x64_sse2
-SAD64XN 32 ; sad64x32_sse2
-SAD64XN 64, 1 ; sad64x64_avg_sse2
-SAD64XN 32, 1 ; sad64x32_avg_sse2
-SAD64XN 16 ; sad64x16_sse2
-SAD64XN 16, 1 ; sad64x16_avg_sse2
+SAD64XN  64, 1  ; sad64x64_avg_sse2
+SAD64XN  32, 1  ; sad64x32_avg_sse2
+SAD64XN  16, 1  ; sad64x16_avg_sse2
+SAD64XN 128, 2  ; sad64x128_skip_sse2
+SAD64XN  64, 2  ; sad64x64_skip_sse2
+SAD64XN  32, 2  ; sad64x32_skip_sse2
+SAD64XN  16, 2  ; sad64x16_skip_sse2
 
 ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD32XN 1-2 0
   SAD_FN 32, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/4
+%else
   mov              n_rowsd, %1/2
+%endif
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -195,25 +237,36 @@
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
-SAD32XN 64 ; sad32x64_sse2
-SAD32XN 32 ; sad32x32_sse2
-SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64    ; sad32x64_sse2
+SAD32XN 32    ; sad32x32_sse2
+SAD32XN 16    ; sad32x16_sse2
+SAD32XN  8    ; sad_32x8_sse2
 SAD32XN 64, 1 ; sad32x64_avg_sse2
 SAD32XN 32, 1 ; sad32x32_avg_sse2
 SAD32XN 16, 1 ; sad32x16_avg_sse2
-SAD32XN 8 ; sad_32x8_sse2
-SAD32XN 8, 1 ; sad_32x8_avg_sse2
+SAD32XN  8, 1 ; sad_32x8_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
+SAD32XN  8, 2 ; sad_32x8_skip_sse2
 
 ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro SAD16XN 1-2 0
   SAD_FN 16, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -243,27 +296,38 @@
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
-SAD16XN 32 ; sad16x32_sse2
-SAD16XN 16 ; sad16x16_sse2
-SAD16XN  8 ; sad16x8_sse2
+SAD16XN 64    ; sad_16x64_sse2
+SAD16XN 32    ; sad16x32_sse2
+SAD16XN 16    ; sad16x16_sse2
+SAD16XN  8    ; sad16x8_sse2
+SAD16XN  4    ; sad_16x4_sse2
+SAD16XN 64, 1 ; sad_16x64_avg_sse2
 SAD16XN 32, 1 ; sad16x32_avg_sse2
 SAD16XN 16, 1 ; sad16x16_avg_sse2
 SAD16XN  8, 1 ; sad16x8_avg_sse2
-SAD16XN 4 ; sad_16x4_sse2
-SAD16XN 4, 1 ; sad_16x4_avg_sse2
-SAD16XN 64 ; sad_16x64_sse2
-SAD16XN 64, 1 ; sad_16x64_avg_sse2
+SAD16XN  4, 1 ; sad_16x4_avg_sse2
+SAD16XN 64, 2 ; sad_16x64_skip_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN  8, 2 ; sad16x8_skip_sse2
 
 ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD8XN 1-2 0
   SAD_FN 8, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -291,25 +355,35 @@
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
-SAD8XN 16 ; sad8x16_sse2
-SAD8XN  8 ; sad8x8_sse2
-SAD8XN  4 ; sad8x4_sse2
+SAD8XN 32    ; sad_8x32_sse2
+SAD8XN 16    ; sad8x16_sse2
+SAD8XN  8    ; sad8x8_sse2
+SAD8XN  4    ; sad8x4_sse2
+SAD8XN 32, 1 ; sad_8x32_avg_sse2
 SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
-SAD8XN 32 ; sad_8x32_sse2
-SAD8XN 32, 1 ; sad_8x32_avg_sse2
+SAD8XN 32, 2 ; sad_8x32_skip_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN  8, 2 ; sad8x8_skip_sse2
 
 ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD4XN 1-2 0
   SAD_FN 4, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -340,14 +414,19 @@
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
+SAD4XN 16 ; sad_4x16_sse2
 SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
+SAD4XN 16, 1 ; sad_4x16_avg_sse2
 SAD4XN  8, 1 ; sad4x8_avg_sse
 SAD4XN  4, 1 ; sad4x4_avg_sse
-SAD4XN 16 ; sad_4x16_sse2
-SAD4XN 16, 1 ; sad_4x16_avg_sse2
+SAD4XN 16, 2 ; sad_4x16_skip_sse2
+SAD4XN  8, 2 ; sad4x8_skip_sse
diff --git a/aom_dsp/x86/ssim_sse2_x86_64.asm b/aom_dsp/x86/ssim_sse2_x86_64.asm
index 6d9b5a1..49bc655 100644
--- a/aom_dsp/x86/ssim_sse2_x86_64.asm
+++ b/aom_dsp/x86/ssim_sse2_x86_64.asm
@@ -67,7 +67,7 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(aom_ssim_parms_16x16_sse2) PRIVATE
+globalsym(aom_ssim_parms_16x16_sse2)
 sym(aom_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -157,7 +157,7 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(aom_ssim_parms_8x8_sse2) PRIVATE
+globalsym(aom_ssim_parms_8x8_sse2)
 sym(aom_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/aom_dsp/x86/sum_squares_avx2.c b/aom_dsp/x86/sum_squares_avx2.c
index 97d78b6..0d63db2 100644
--- a/aom_dsp/x86/sum_squares_avx2.c
+++ b/aom_dsp/x86/sum_squares_avx2.c
@@ -78,6 +78,84 @@
   }
 }
 
+static uint64_t aom_sum_sse_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                            int width, int height, int *sum) {
+  uint64_t result;
+  const __m256i zero_reg = _mm256_setzero_si256();
+  const __m256i one_reg = _mm256_set1_epi16(1);
+
+  __m256i v_sse_total = zero_reg;
+  __m256i v_sum_total = zero_reg;
+
+  for (int col = 0; col < height; col += 4) {
+    __m256i v_sse_row = zero_reg;
+    for (int row = 0; row < width; row += 16) {
+      const int16_t *tempsrc = src + row;
+      const __m256i v_val_0_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+      const __m256i v_val_1_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+      const __m256i v_val_2_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+      const __m256i v_val_3_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+      const __m256i v_sum_01 = _mm256_add_epi16(v_val_0_w, v_val_1_w);
+      const __m256i v_sum_23 = _mm256_add_epi16(v_val_2_w, v_val_3_w);
+      __m256i v_sum_0123 = _mm256_add_epi16(v_sum_01, v_sum_23);
+      v_sum_0123 = _mm256_madd_epi16(v_sum_0123, one_reg);
+      v_sum_total = _mm256_add_epi32(v_sum_total, v_sum_0123);
+
+      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+      const __m256i v_sq_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m256i v_sq_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m256i v_sq_0123_d = _mm256_add_epi32(v_sq_01_d, v_sq_23_d);
+      v_sse_row = _mm256_add_epi32(v_sse_row, v_sq_0123_d);
+    }
+    const __m256i v_sse_row_low = _mm256_unpacklo_epi32(v_sse_row, zero_reg);
+    const __m256i v_sse_row_hi = _mm256_unpackhi_epi32(v_sse_row, zero_reg);
+    v_sse_row = _mm256_add_epi64(v_sse_row_low, v_sse_row_hi);
+    v_sse_total = _mm256_add_epi64(v_sse_total, v_sse_row);
+    src += 4 * stride;
+  }
+
+  const __m128i v_sum_total_low = _mm256_castsi256_si128(v_sum_total);
+  const __m128i v_sum_total_hi = _mm256_extracti128_si256(v_sum_total, 1);
+  __m128i sum_128bit = _mm_add_epi32(v_sum_total_hi, v_sum_total_low);
+  sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 8));
+  sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 4));
+  *sum += _mm_cvtsi128_si32(sum_128bit);
+
+  __m128i v_sse_total_lo = _mm256_castsi256_si128(v_sse_total);
+  __m128i v_sse_total_hi = _mm256_extracti128_si256(v_sse_total, 1);
+  __m128i sse_128bit = _mm_add_epi64(v_sse_total_lo, v_sse_total_hi);
+
+  sse_128bit =
+      _mm_add_epi64(sse_128bit, _mm_unpackhi_epi64(sse_128bit, sse_128bit));
+
+  xx_storel_64(&result, sse_128bit);
+
+  return result;
+}
+
+uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width,
+                                 int height, int *sum) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return aom_sum_sse_2d_i16_nxn_avx2(src, src_stride, width, height, sum);
+  } else {
+    return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+  }
+}
+
 // Accumulate sum of 16-bit elements in the vector
 static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) {
   __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c
index 85b301a..0bdeee9 100644
--- a/aom_dsp/x86/sum_squares_sse2.c
+++ b/aom_dsp/x86/sum_squares_sse2.c
@@ -53,6 +53,27 @@
   return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
 }
 
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum) {
+  const __m128i one_reg = _mm_set1_epi16(1);
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+
+  __m128i v_sum_0123_d = _mm_add_epi16(v_val_01_w, v_val_23_w);
+  v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+  v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 8));
+  v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 4));
+  *sum = _mm_cvtsi128_si32(v_sum_0123_d);
+
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+  __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+  v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 8));
+  v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 4));
+  return (uint64_t)_mm_cvtsi128_si32(v_sq_0123_d);
+}
+
 uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
                                          int height) {
   int r = 0;
@@ -70,6 +91,20 @@
   return xx_cvtsi128_si64(v_acc_64);
 }
 
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+                                     int *sum) {
+  int r = 0;
+  uint64_t sse = 0;
+  do {
+    int curr_sum = 0;
+    sse += aom_sum_sse_2d_i16_4x4_sse2(src, stride, &curr_sum);
+    *sum += curr_sum;
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  return sse;
+}
+
 #ifdef __GNUC__
 // This prevents GCC/Clang from inlining this function into
 // aom_sum_squares_2d_i16_sse2, which in turn saves some stack
@@ -120,6 +155,69 @@
   return xx_cvtsi128_si64(v_acc_q);
 }
 
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_sse_2d_i16_nxn_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+                            int height, int *sum) {
+  int r = 0;
+  uint64_t result;
+  const __m128i zero_reg = _mm_setzero_si128();
+  const __m128i one_reg = _mm_set1_epi16(1);
+
+  __m128i v_sse_total = zero_reg;
+  __m128i v_sum_total = zero_reg;
+
+  do {
+    int c = 0;
+    __m128i v_sse_row = zero_reg;
+    do {
+      const int16_t *b = src + c;
+
+      __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+      __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+      __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+      __m128i v_val_3_w = xx_load_128(b + 3 * stride);
+
+      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+      const __m128i v_sq_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m128i v_sq_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+      v_sse_row = _mm_add_epi32(v_sse_row, v_sq_0123_d);
+
+      const __m128i v_sum_01 = _mm_add_epi16(v_val_0_w, v_val_1_w);
+      const __m128i v_sum_23 = _mm_add_epi16(v_val_2_w, v_val_3_w);
+      __m128i v_sum_0123_d = _mm_add_epi16(v_sum_01, v_sum_23);
+      v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+      v_sum_total = _mm_add_epi32(v_sum_total, v_sum_0123_d);
+
+      c += 8;
+    } while (c < width);
+
+    const __m128i v_sse_row_low = _mm_unpacklo_epi32(v_sse_row, zero_reg);
+    const __m128i v_sse_row_hi = _mm_unpackhi_epi32(v_sse_row, zero_reg);
+    v_sse_row = _mm_add_epi64(v_sse_row_low, v_sse_row_hi);
+    v_sse_total = _mm_add_epi64(v_sse_total, v_sse_row);
+    src += 4 * stride;
+    r += 4;
+  } while (r < height);
+
+  v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 8));
+  v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 4));
+  *sum += _mm_cvtsi128_si32(v_sum_total);
+
+  v_sse_total = _mm_add_epi64(v_sse_total, _mm_srli_si128(v_sse_total, 8));
+  xx_storel_64(&result, v_sse_total);
+  return result;
+}
+
 uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
                                      int height) {
   // 4 elements per row only requires half an XMM register, so this
@@ -137,6 +235,20 @@
   }
 }
 
+uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
+                                 int height, int *sum) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+    // Generic case
+    return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+  } else {
+    return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+  }
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // 1D version
 //////////////////////////////////////////////////////////////////////////////
diff --git a/aom_dsp/x86/sum_squares_sse2.h b/aom_dsp/x86/sum_squares_sse2.h
index 491e31c..5ed3f2c 100644
--- a/aom_dsp/x86/sum_squares_sse2.h
+++ b/aom_dsp/x86/sum_squares_sse2.h
@@ -19,4 +19,10 @@
                                          int height);
 uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
 
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum);
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+                                     int *sum);
+uint64_t aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+                                     int height, int *sum);
+
 #endif  // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
index 2e99bee..d538015 100644
--- a/aom_dsp/x86/synonyms.h
+++ b/aom_dsp/x86/synonyms.h
@@ -100,6 +100,12 @@
   return _mm_srli_epi32(v_tmp_d, bits);
 }
 
+static INLINE __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
 // This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
 static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
   const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index c4919ba..f69a1c9 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -14,6 +14,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
 
 static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
   return _mm_add_epi16(_mm256_castsi256_si128(val),
@@ -185,20 +186,23 @@
     return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
   }
 
-AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
 AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
 AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
 AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
 
-AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
 AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
 AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
 AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
 
-AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
 AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
 
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
+#endif
+
 #define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
   unsigned int aom_variance##bw##x##bh##_avx2(                                \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@@ -280,11 +284,13 @@
 AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
 AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
 AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
-AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6);
 AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5);
 AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4);
 AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3);
+#if !CONFIG_REALTIME_ONLY
+AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6);
 AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2);
+#endif
 
 #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
   unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
@@ -394,25 +400,20 @@
       comp_pred += (16 << 2);
       i += 4;
     } while (i < height);
-  } else {  // for width == 32
+  } else {
     do {
-      const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
-      const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
-      const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
+      for (int x = 0; x < width; x += 32) {
+        const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0 + x));
+        const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1 + x));
+        const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask + x));
 
-      const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
-      const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
-      const __m256i aB =
-          _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
-
-      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
-      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
-      comp_pred += (32 << 1);
-
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      i += 2;
+        comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+        comp_pred += 32;
+      }
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      i++;
     } while (i < height);
   }
 }
@@ -498,29 +499,148 @@
       comp_pred += width;
       i += 1;
     } while (i < height);
-  } else if (width == 32) {
+  } else {
     do {
-      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
-      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
-      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
-      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
+      for (int x = 0; x < width; x += 32) {
+        const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0 + x));
+        const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + x + 16));
+        const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1 + x));
+        const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + x + 16));
 
-      const __m256i m01_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
-      const __m256i m23_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
+        const __m256i m01_16 =
+            _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + x)));
+        const __m256i m23_16 = _mm256_cvtepu8_epi16(
+            _mm_loadu_si128((const __m128i *)(mask + x + 16)));
 
-      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
-      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+        const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+        const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
 
-      _mm256_storeu_si256((__m256i *)comp_pred, comp);
-      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+        _mm256_storeu_si256((__m256i *)comp_pred, comp);
+        _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
 
+        comp_pred += 32;
+      }
       src0 += stride0;
       src1 += stride1;
       mask += mask_stride;
-      comp_pred += width;
       i += 1;
     } while (i < height);
   }
 }
+
+uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8;
+  __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16;
+  __m256i src0_8x16, src1_8x16, dst_16x16, src_16x16;
+  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m256i sub_result;
+  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  for (int i = 0; i < h; i += 4) {
+    dst0_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
+    dst1_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
+    dst2_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 2) * dstride]));
+    dst3_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 3) * dstride]));
+    dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
+                                  _mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
+    dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);
+
+    src0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    src1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    src2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+    src3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+    src0_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(src0_4x16, src1_4x16));
+    src1_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16));
+    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);  // 32bit store
+    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);  // 32bit store
+
+    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+    square_result = _mm256_add_epi64(
+        square_result,
+        _mm256_add_epi64(
+            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+  const __m128i sum_2x64 =
+      _mm_add_epi64(_mm256_castsi256_si128(square_result),
+                    _mm256_extracti128_si256(square_result, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst0_8x8, dst1_8x8, dst3_16x8;
+  __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16;
+  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m256i sub_result;
+  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+  for (int i = 0; i < h; i += 2) {
+    dst0_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+    dst1_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+    dst3_16x8 = _mm_unpacklo_epi64(dst0_8x8, dst1_8x8);
+    dst_16x16 = _mm256_cvtepu8_epi16(dst3_16x8);
+
+    src0_8x16 =
+        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+    src1_8x16 = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+    square_result = _mm256_add_epi64(
+        square_result,
+        _mm256_add_epi64(
+            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+
+  const __m128i sum_2x64 =
+      _mm_add_epi64(_mm256_castsi256_si128(square_result),
+                    _mm256_extracti128_si256(square_result, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  switch (w) {
+    case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
+    case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
+    default: assert(0 && "unsupported width"); return -1;
+  }
+}
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 4e2b5a1..e372a4b 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -17,6 +17,7 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
@@ -42,8 +43,8 @@
 }
 
 static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+  const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
   return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
 }
 
@@ -267,18 +268,21 @@
 AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
 AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
 AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
 
-AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
 AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
 AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
 AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
 
 AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
 AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
 AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
 
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
+#endif
+
 #define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh)                                   \
   unsigned int aom_variance##bw##x##bh##_sse2(                                \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@@ -302,7 +306,6 @@
 
 AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )
 
-AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
 AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
 AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
 AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )
@@ -310,6 +313,10 @@
 AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
 AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
 
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
+#endif
+
 unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
@@ -383,6 +390,7 @@
     return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
   }
 
+#if !CONFIG_REALTIME_ONLY
 #define FNS(opt)                                     \
   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
@@ -406,6 +414,25 @@
   FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
   FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
   FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt)                                     \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));    \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));     \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));      \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));      \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));      \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));
+#endif
 
 FNS(sse2);
 FNS(ssse3);
@@ -462,6 +489,7 @@
     return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
   }
 
+#if !CONFIG_REALTIME_ONLY
 #define FNS(opt)                                     \
   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
@@ -485,6 +513,25 @@
   FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
   FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
   FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt)                                     \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));   \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));    \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));     \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));     \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));     \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));
+#endif
 
 FNS(sse2);
 FNS(ssse3);
@@ -727,31 +774,131 @@
       comp_pred += width;
       i += 1;
     } while (i < height);
-  } else if (width == 32) {
+  } else {
     do {
-      for (int j = 0; j < 2; j++) {
-        const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
-        const __m128i s2 =
-            _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
-        const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
-        const __m128i s3 =
-            _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
+      for (int x = 0; x < width; x += 32) {
+        for (int j = 0; j < 2; j++) {
+          const __m128i s0 =
+              _mm_loadu_si128((const __m128i *)(src0 + x + j * 16));
+          const __m128i s2 =
+              _mm_loadu_si128((const __m128i *)(src0 + x + 8 + j * 16));
+          const __m128i s1 =
+              _mm_loadu_si128((const __m128i *)(src1 + x + j * 16));
+          const __m128i s3 =
+              _mm_loadu_si128((const __m128i *)(src1 + x + 8 + j * 16));
 
-        const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
-        const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
-        const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+          const __m128i m_8 =
+              _mm_loadu_si128((const __m128i *)(mask + x + j * 16));
+          const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+          const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
 
-        const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
-        const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+          const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+          const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
 
-        _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
-        _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+          _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+          _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+        }
+        comp_pred += 32;
       }
       src0 += stride0;
       src1 += stride1;
       mask += mask_stride;
-      comp_pred += width;
       i += 1;
     } while (i < height);
   }
 }
+
+uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst0_8x8, dst1_8x8, dst_16x8;
+  __m128i src0_16x4, src1_16x4, src_16x8;
+  __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
+  __m128i sub_result_16x8;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+  for (int i = 0; i < h; i += 2) {
+    dst0_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
+    dst1_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
+    dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
+
+    src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    src1_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    src_16x8 = _mm_unpacklo_epi64(src0_16x4, src1_16x4);
+
+    sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+    res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
+    res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
+
+    res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
+    res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
+
+    res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
+    res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
+    res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
+    res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
+
+    square_result = _mm_add_epi64(
+        square_result,
+        _mm_add_epi64(
+            _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
+            res3_64x4));
+  }
+  const __m128i sum_1x64 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst_8x8, dst_16x8;
+  __m128i src_16x8;
+  __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
+  __m128i sub_result_16x8;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+
+  for (int i = 0; i < h; i++) {
+    dst_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+    dst_16x8 = _mm_unpacklo_epi8(dst_8x8, zeros);
+
+    src_16x8 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
+
+    sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+    res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
+    res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
+
+    res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
+    res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
+
+    res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
+    res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
+    res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
+    res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
+
+    square_result = _mm_add_epi64(
+        square_result,
+        _mm_add_epi64(
+            _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
+            res3_64x4));
+  }
+  const __m128i sum_1x64 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  switch (w) {
+    case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
+    case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
+    default: assert(0 && "unsupported width"); return -1;
+  }
+}
diff --git a/aom_ports/emms.asm b/aom_ports/emms.asm
index 90776ba..038635d 100644
--- a/aom_ports/emms.asm
+++ b/aom_ports/emms.asm
@@ -15,14 +15,14 @@
 %include "aom_ports/x86_abi_support.asm"
 
 section .text
-global sym(aom_reset_mmx_state) PRIVATE
+globalsym(aom_reset_mmx_state)
 sym(aom_reset_mmx_state):
     emms
     ret
 
 
 %if LIBAOM_YASM_WIN64
-global sym(aom_winx64_fldcw) PRIVATE
+globalsym(aom_winx64_fldcw)
 sym(aom_winx64_fldcw):
     sub   rsp, 8
     mov   [rsp], rcx ; win x64 specific
@@ -31,7 +31,7 @@
     ret
 
 
-global sym(aom_winx64_fstcw) PRIVATE
+globalsym(aom_winx64_fstcw)
 sym(aom_winx64_fstcw):
     sub   rsp, 8
     fstcw [rsp]
diff --git a/aom_ports/x86_abi_support.asm b/aom_ports/x86_abi_support.asm
index 6448990..f1a65f5 100644
--- a/aom_ports/x86_abi_support.asm
+++ b/aom_ports/x86_abi_support.asm
@@ -92,34 +92,51 @@
 %define LIBAOM_YASM_WIN64 0
 %endif
 
+; Declare groups of platforms
+%ifidn   __OUTPUT_FORMAT__,elf32
+  %define LIBAOM_ELF 1
+%elifidn   __OUTPUT_FORMAT__,elfx32
+  %define LIBAOM_ELF 1
+%elifidn   __OUTPUT_FORMAT__,elf64
+  %define LIBAOM_ELF 1
+%else
+  %define LIBAOM_ELF 0
+%endif
+
+%ifidn __OUTPUT_FORMAT__,macho32
+  %define LIBAOM_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+  %define LIBAOM_MACHO 1
+%else
+  %define LIBAOM_MACHO 0
+%endif
+
 ; sym()
 ; Return the proper symbol name for the target ABI.
 ;
 ; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols
 ; with C linkage be prefixed with an underscore.
 ;
-%ifidn   __OUTPUT_FORMAT__,elf32
-%define sym(x) x
-%elifidn __OUTPUT_FORMAT__,elf64
-%define sym(x) x
-%elifidn __OUTPUT_FORMAT__,elfx32
-%define sym(x) x
-%elif LIBAOM_YASM_WIN64
-%define sym(x) x
+%if LIBAOM_ELF || LIBAOM_YASM_WIN64
+  %define sym(x) x
 %else
-%define sym(x) _ %+ x
+  ; Mach-O / COFF
+  %define sym(x) _ %+ x
 %endif
 
-;  PRIVATE
-;  Macro for the attribute to hide a global symbol for the target ABI.
-;  This is only active if CHROMIUM is defined.
+; globalsym()
+; Return a global declaration with the proper decoration for the target ABI.
 ;
-;  Chromium doesn't like exported global symbols due to symbol clashing with
-;  plugins among other things.
+; When CHROMIUM is defined, include attributes to hide the symbol from the
+; global namespace.
 ;
-;  Requires Chromium's patched copy of yasm:
-;    http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
-;    http://www.tortall.net/projects/yasm/ticket/236
+; Chromium doesn't like exported global symbols due to symbol clashing with
+; plugins among other things.
+;
+; Requires Chromium's patched copy of yasm:
+;   http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
+;   http://www.tortall.net/projects/yasm/ticket/236
+; or nasm > 2.14.
 ;
 %ifdef CHROMIUM
   %ifdef __NASM_VER__
@@ -129,19 +146,16 @@
     %endif
   %endif
 
-  %ifidn   __OUTPUT_FORMAT__,elf32
-    %define PRIVATE :hidden
-  %elifidn __OUTPUT_FORMAT__,elf64
-    %define PRIVATE :hidden
-  %elifidn __OUTPUT_FORMAT__,elfx32
-    %define PRIVATE :hidden
-  %elif LIBAOM_YASM_WIN64
-    %define PRIVATE
+  %if LIBAOM_ELF
+    %define globalsym(x) global sym(x) %+ :function hidden
+  %elif LIBAOM_MACHO
+    %define globalsym(x) global sym(x) %+ :private_extern
   %else
-    %define PRIVATE :private_extern
+    ; COFF / PE32+
+    %define globalsym(x) global sym(x)
   %endif
 %else
-  %define PRIVATE
+  %define globalsym(x) global sym(x)
 %endif
 
 ; arg()
diff --git a/aom_scale/generic/yv12config.c b/aom_scale/generic/yv12config.c
index 1f80d7b..9b9242c 100644
--- a/aom_scale/generic/yv12config.c
+++ b/aom_scale/generic/yv12config.c
@@ -251,6 +251,7 @@
 int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                       const aom_metadata_array_t *arr) {
   if (!ybf || !arr || !arr->metadata_array) return -1;
+  if (ybf->metadata == arr) return 0;
   aom_remove_metadata_from_frame_buffer(ybf);
   ybf->metadata = aom_img_metadata_array_alloc(arr->sz);
   if (!ybf->metadata) return -1;
diff --git a/aom_scale/generic/yv12extend.c b/aom_scale/generic/yv12extend.c
index 834a59d..3d0f4a7 100644
--- a/aom_scale/generic/yv12extend.c
+++ b/aom_scale/generic/yv12extend.c
@@ -22,6 +22,7 @@
 static void extend_plane(uint8_t *const src, int src_stride, int width,
                          int height, int extend_top, int extend_left,
                          int extend_bottom, int extend_right) {
+  assert(src != NULL);
   int i;
   const int linesize = extend_left + extend_right + width;
 
@@ -220,13 +221,8 @@
 // Note: The frames are assumed to be identical in size.
 void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc,
                            YV12_BUFFER_CONFIG *dst_bc, const int num_planes) {
-#if 0
-  /* These assertions are valid in the codec, but the libaom-tester uses
-   * this code slightly differently.
-   */
   assert(src_bc->y_width == dst_bc->y_width);
   assert(src_bc->y_height == dst_bc->y_height);
-#endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
   assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h
index 3642bb7..ea92c92 100644
--- a/aom_scale/yv12config.h
+++ b/aom_scale/yv12config.h
@@ -23,13 +23,20 @@
 #include "aom/aom_integer.h"
 #include "aom/internal/aom_image_internal.h"
 
+/*!\cond */
+
 #define AOMINNERBORDERINPIXELS 160
 #define AOM_INTERP_EXTEND 4
 #define AOM_BORDER_IN_PIXELS 288
 #define AOM_ENC_NO_SCALE_BORDER 160
 #define AOM_DEC_BORDER_IN_PIXELS 64
 
+/*!\endcond */
+/*!
+ * \brief YV12 frame buffer data structure
+ */
 typedef struct yv12_buffer_config {
+  /*!\cond */
   union {
     struct {
       int y_width;
@@ -106,8 +113,11 @@
   int corrupted;
   int flags;
   aom_metadata_array_t *metadata;
+  /*!\endcond */
 } YV12_BUFFER_CONFIG;
 
+/*!\cond */
+
 #define YV12_FLAG_HIGHBITDEPTH 8
 
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
@@ -129,6 +139,7 @@
 
 int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
+/*!\endcond */
 /*!\brief Removes metadata from YUV_BUFFER_CONFIG struct.
  *
  * Frees metadata in frame buffer.
@@ -140,10 +151,12 @@
 
 /*!\brief Copy metadata to YUV_BUFFER_CONFIG struct.
  *
- * Copies metadata in frame buffer.
+ * Copies metadata to frame buffer.
  * Frame buffer will clear any previous metadata and will reallocate the
  * metadata array to the new metadata size. Then, it will copy the new metadata
  * array into it.
+ * If arr metadata pointer points to the same address as current metadata in the
+ * frame buffer, function will do nothing and return 0.
  * Returns 0 on success or -1 on failure.
  *
  * \param[in]    ybf       Frame buffer struct pointer
diff --git a/aom_util/debug_util.c b/aom_util/debug_util.c
index 5762e69..3e9c314 100644
--- a/aom_util/debug_util.c
+++ b/aom_util/debug_util.c
@@ -10,6 +10,7 @@
  */
 
 #include <assert.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include "aom_util/debug_util.h"
@@ -22,7 +23,7 @@
   frame_idx_w = frame_idx;
 }
 
-int aom_bitstream_queue_get_frame_writee(void) { return frame_idx_w; }
+int aom_bitstream_queue_get_frame_write(void) { return frame_idx_w; }
 
 void aom_bitstream_queue_set_frame_read(int frame_idx) {
   frame_idx_r = frame_idx;
@@ -68,6 +69,19 @@
 }
 
 void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) {
+  // If you observe a CDF error:
+  // - Set 'debug_cdf_mismatch' to true
+  // - Set target_frame_idx_r and target_queue_r to where CDF error was reported
+  // - Set a breakpoint in debugger at the 'fprintf' below.
+  const bool debug_cdf_mismatch = false;
+  if (debug_cdf_mismatch) {
+    int target_frame_idx_r = 1;
+    int target_queue_r = 18005;
+    if (frame_idx_w == target_frame_idx_r && queue_w == target_queue_r) {
+      fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+              frame_idx_w, queue_w);
+    }
+  }
   if (!skip_w) {
     result_queue[queue_w] = result;
     nsymbs_queue[queue_w] = nsymbs;
diff --git a/aomedia_logo_200.png b/aomedia_logo_200.png
new file mode 100644
index 0000000..4a3b9fc
--- /dev/null
+++ b/aomedia_logo_200.png
Binary files differ
diff --git a/apps/aomdec.c b/apps/aomdec.c
index 2591d41..60805aa 100644
--- a/apps/aomdec.c
+++ b/apps/aomdec.c
@@ -82,6 +82,8 @@
     ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
 static const arg_def_t threadsarg =
     ARG_DEF("t", "threads", 1, "Max threads to use");
+static const arg_def_t rowmtarg =
+    ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading, default: 0");
 static const arg_def_t verbosearg =
     ARG_DEF("v", "verbose", 0, "Show version string");
 static const arg_def_t scalearg =
@@ -106,11 +108,13 @@
     ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application");
 
 static const arg_def_t *all_args[] = {
-  &help,       &codecarg,   &use_yv12,      &use_i420,      &flipuvarg,
-  &rawvideo,   &noblitarg,  &progressarg,   &limitarg,      &skiparg,
-  &summaryarg, &outputfile, &threadsarg,    &verbosearg,    &scalearg,
-  &fb_arg,     &md5arg,     &framestatsarg, &continuearg,   &outbitdeptharg,
-  &isannexb,   &oppointarg, &outallarg,     &skipfilmgrain, NULL
+  &help,           &codecarg, &use_yv12,      &use_i420,
+  &flipuvarg,      &rawvideo, &noblitarg,     &progressarg,
+  &limitarg,       &skiparg,  &summaryarg,    &outputfile,
+  &threadsarg,     &rowmtarg, &verbosearg,    &scalearg,
+  &fb_arg,         &md5arg,   &framestatsarg, &continuearg,
+  &outbitdeptharg, &isannexb, &oppointarg,    &outallarg,
+  &skipfilmgrain,  NULL
 };
 
 #if CONFIG_LIBYUV
@@ -166,9 +170,9 @@
   fprintf(fout, "\nIncluded decoders:\n\n");
 
   for (int i = 0; i < get_aom_decoder_count(); ++i) {
-    const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-    fprintf(fout, "    %-6s - %s\n", decoder->name,
-            aom_codec_iface_name(decoder->codec_interface()));
+    aom_codec_iface_t *decoder = get_aom_decoder_by_index(i);
+    fprintf(fout, "    %-6s - %s\n", get_short_name_by_aom_decoder(decoder),
+            aom_codec_iface_name(decoder));
   }
 }
 
@@ -254,11 +258,10 @@
 
     if (mem_get_le32(buf) < 256 * 1024 * 1024) {
       for (i = 0; i < get_aom_decoder_count(); ++i) {
-        const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-        if (!aom_codec_peek_stream_info(decoder->codec_interface(), buf + 4,
-                                        32 - 4, &si)) {
+        aom_codec_iface_t *decoder = get_aom_decoder_by_index(i);
+        if (!aom_codec_peek_stream_info(decoder, buf + 4, 32 - 4, &si)) {
           is_raw = 1;
-          input->fourcc = decoder->fourcc;
+          input->fourcc = get_fourcc_by_aom_decoder(decoder);
           input->width = si.w;
           input->height = si.h;
           input->framerate.numerator = 30;
@@ -436,8 +439,6 @@
   int stop_after = 0, summary = 0, quiet = 1;
   int arg_skip = 0;
   int keep_going = 0;
-  const AvxInterface *interface = NULL;
-  const AvxInterface *fourcc_interface = NULL;
   uint64_t dx_time = 0;
   struct arg arg;
   char **argv, **argi, **argj;
@@ -456,6 +457,7 @@
   int operating_point = 0;
   int output_all_layers = 0;
   int skip_film_grain = 0;
+  int enable_row_mt = 0;
   aom_image_t *scaled_img = NULL;
   aom_image_t *img_shifted = NULL;
   int frame_avail, got_data, flush_decoder = 0;
@@ -490,6 +492,7 @@
   exec_name = argv_[0];
   argv = argv_dup(argc - 1, argv_ + 1);
 
+  aom_codec_iface_t *interface = NULL;
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
@@ -498,7 +501,7 @@
       show_help(stdout, 0);
       exit(EXIT_SUCCESS);
     } else if (arg_match(&arg, &codecarg, argi)) {
-      interface = get_aom_decoder_by_name(arg.val);
+      interface = get_aom_decoder_by_short_name(arg.val);
       if (!interface)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
     } else if (arg_match(&arg, &looparg, argi)) {
@@ -551,6 +554,8 @@
             cfg.threads);
       }
 #endif
+    } else if (arg_match(&arg, &rowmtarg, argi)) {
+      enable_row_mt = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &verbosearg, argi)) {
       quiet = 0;
     } else if (arg_match(&arg, &scalearg, argi)) {
@@ -600,6 +605,7 @@
     fprintf(stderr,
             "Not dumping raw video to your terminal. Use '-o -' to "
             "override.\n");
+    free(argv);
     return EXIT_FAILURE;
   }
 #endif
@@ -658,21 +664,22 @@
 #endif
   }
 
-  fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
+  aom_codec_iface_t *fourcc_interface =
+      get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
 
   if (is_ivf && !fourcc_interface)
     fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc);
 
   if (interface && fourcc_interface && interface != fourcc_interface)
-    warn("Header indicates codec: %s\n", fourcc_interface->name);
+    warn("Header indicates codec: %s\n",
+         aom_codec_iface_name(fourcc_interface));
   else
     interface = fourcc_interface;
 
   if (!interface) interface = get_aom_decoder_by_index(0);
 
   dec_flags = 0;
-  if (aom_codec_dec_init(&decoder, interface->codec_interface(), &cfg,
-                         dec_flags)) {
+  if (aom_codec_dec_init(&decoder, interface, &cfg, dec_flags)) {
     fprintf(stderr, "Failed to initialize decoder: %s\n",
             aom_codec_error(&decoder));
     goto fail2;
@@ -706,6 +713,12 @@
     goto fail;
   }
 
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_ROW_MT, enable_row_mt)) {
+    fprintf(stderr, "Failed to set row multithreading mode: %s\n",
+            aom_codec_error(&decoder));
+    goto fail;
+  }
+
   if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
   while (arg_skip) {
     if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break;
diff --git a/apps/aomenc.c b/apps/aomenc.c
index bb57726..19a4524 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -156,8 +156,12 @@
     ARG_DEF("q", "quiet", 0, "Do not print encode progress");
 static const arg_def_t verbosearg =
     ARG_DEF("v", "verbose", 0, "Show encoder parameters");
-static const arg_def_t psnrarg =
-    ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line");
+static const arg_def_t psnrarg = ARG_DEF(
+    NULL, "psnr", -1,
+    "Show PSNR in status line"
+    "(0: Disable PSNR status line display, 1: PSNR calculated using input "
+    "bit-depth (default), 2: PSNR calculated using stream bit-depth), "
+    "takes default option when arguments are not specified");
 static const arg_def_t use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use");
 
 static const struct arg_enum_list test_decode_enum[] = {
@@ -263,6 +267,8 @@
     ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)");
 static const arg_def_t full_still_picture_hdr = ARG_DEF(
     NULL, "full-still-picture-hdr", 0, "Use full header for still picture");
+static const arg_def_t use_16bit_internal =
+    ARG_DEF(NULL, "use-16bit-internal", 0, "Force use of 16-bit pipeline");
 
 static const arg_def_t *global_args[] = { &use_yv12,
                                           &use_i420,
@@ -286,6 +292,7 @@
                                           &large_scale_tile,
                                           &monochrome,
                                           &full_still_picture_hdr,
+                                          &use_16bit_internal,
                                           NULL };
 
 static const arg_def_t dropframe_thresh =
@@ -331,6 +338,12 @@
     ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)");
 static const arg_def_t buf_optimal_sz =
     ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)");
+static const arg_def_t bias_pct =
+    ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
+static const arg_def_t minsection_pct =
+    ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
+static const arg_def_t maxsection_pct =
+    ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
 static const arg_def_t *rc_args[] = { &dropframe_thresh,
                                       &resize_mode,
                                       &resize_denominator,
@@ -349,16 +362,11 @@
                                       &buf_sz,
                                       &buf_initial_sz,
                                       &buf_optimal_sz,
+                                      &bias_pct,
+                                      &minsection_pct,
+                                      &maxsection_pct,
                                       NULL };
 
-static const arg_def_t bias_pct =
-    ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
-static const arg_def_t minsection_pct =
-    ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
-static const arg_def_t maxsection_pct =
-    ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
-static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct,
-                                              &maxsection_pct, NULL };
 static const arg_def_t fwd_kf_enabled =
     ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes");
 static const arg_def_t kf_min_dist =
@@ -378,7 +386,7 @@
 static const arg_def_t noise_sens =
     ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)");
 static const arg_def_t sharpness =
-    ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
+    ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7), default is 0");
 static const arg_def_t static_thresh =
     ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold");
 static const arg_def_t auto_altref =
@@ -393,6 +401,7 @@
   { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING },
   { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING },
   { "vmaf", AOM_TUNE_VMAF_MAX_GAIN },
+  { "vmaf_neg", AOM_TUNE_VMAF_NEG_MAX_GAIN },
   { NULL, 0 }
 };
 static const arg_def_t tune_metric =
@@ -405,7 +414,7 @@
 #if CONFIG_AV1_ENCODER
 static const arg_def_t cpu_used_av1 =
     ARG_DEF(NULL, "cpu-used", 1,
-            "Speed setting (0..6 in good mode, 6..8 in realtime mode)");
+            "Speed setting (0..6 in good mode, 6..9 in realtime mode)");
 static const arg_def_t rowmtarg =
     ARG_DEF(NULL, "row-mt", 1,
             "Enable row based multi-threading (0: off, 1: on (default))");
@@ -420,8 +429,10 @@
             "This is required for deltaq mode.");
 static const arg_def_t enable_keyframe_filtering =
     ARG_DEF(NULL, "enable-keyframe-filtering", 1,
-            "Apply temporal filtering on key frame "
-            "(0: false, 1: true (default)");
+            "Apply temporal filtering on key frame"
+            "(0: no filter, 1: filter without overlay (default), "
+            "2: filter with overlay - experimental, may break random access in "
+            "players.)");
 static const arg_def_t tile_width =
     ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)");
 static const arg_def_t tile_height =
@@ -448,13 +459,13 @@
             "Enable 1:4 and 4:1 partitions "
             "(0: false, 1: true (default))");
 static const arg_def_t min_partition_size =
-    ARG_DEF(NULL, "min-partition-size", 4,
+    ARG_DEF(NULL, "min-partition-size", 1,
             "Set min partition size "
             "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128). "
             "On frame with 4k+ resolutions or higher speed settings, the min "
             "partition size will have a minimum of 8.");
 static const arg_def_t max_partition_size =
-    ARG_DEF(NULL, "max-partition-size", 128,
+    ARG_DEF(NULL, "max-partition-size", 1,
             "Set max partition size "
             "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)");
 static const arg_def_t enable_dual_filter =
@@ -482,6 +493,9 @@
             "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, "
             "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, "
             "H_ADST, V_FLIPADST, H_FLIPADST");
+static const arg_def_t enable_rect_tx =
+    ARG_DEF(NULL, "enable-rect-tx", 1,
+            "Enable rectangular transform (0: false, 1: true (default))");
 static const arg_def_t enable_dist_wtd_comp =
     ARG_DEF(NULL, "enable-dist-wtd-comp", 1,
             "Enable distance-weighted compound "
@@ -581,11 +595,11 @@
 static const arg_def_t coeff_cost_upd_freq =
     ARG_DEF(NULL, "coeff-cost-upd-freq", 1,
             "Update freq for coeff costs"
-            "0: SB, 1: SB Row per Tile, 2: Tile");
+            "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off");
 static const arg_def_t mode_cost_upd_freq =
     ARG_DEF(NULL, "mode-cost-upd-freq", 1,
             "Update freq for mode costs"
-            "0: SB, 1: SB Row per Tile, 2: Tile");
+            "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off");
 static const arg_def_t mv_cost_upd_freq =
     ARG_DEF(NULL, "mv-cost-upd-freq", 1,
             "Update freq for mv costs"
@@ -814,6 +828,11 @@
             "specified (default), offsets are adaptively chosen by the "
             "encoder.");
 
+static const arg_def_t vbr_corpus_complexity_lap = ARG_DEF(
+    NULL, "vbr-corpus-complexity-lap", 1,
+    "Set average corpus complexity per mb for single pass VBR using lap. "
+    "(0..10000), default is 0");
+
 static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                        &auto_altref,
                                        &sharpness,
@@ -844,6 +863,7 @@
                                        &enable_order_hint,
                                        &enable_tx64,
                                        &enable_flip_idtx,
+                                       &enable_rect_tx,
                                        &enable_dist_wtd_comp,
                                        &enable_masked_comp,
                                        &enable_onesided_comp,
@@ -909,6 +929,7 @@
                                        &target_seq_level_idx,
                                        &set_tier_mask,
                                        &set_min_cr,
+                                       &vbr_corpus_complexity_lap,
                                        &bitdeptharg,
                                        &inbitdeptharg,
                                        &input_chroma_subsampling_x,
@@ -950,6 +971,7 @@
                                         AV1E_SET_ENABLE_ORDER_HINT,
                                         AV1E_SET_ENABLE_TX64,
                                         AV1E_SET_ENABLE_FLIP_IDTX,
+                                        AV1E_SET_ENABLE_RECT_TX,
                                         AV1E_SET_ENABLE_DIST_WTD_COMP,
                                         AV1E_SET_ENABLE_MASKED_COMP,
                                         AV1E_SET_ENABLE_ONESIDED_COMP,
@@ -1015,6 +1037,7 @@
                                         AV1E_SET_TARGET_SEQ_LEVEL_IDX,
                                         AV1E_SET_TIER_MASK,
                                         AV1E_SET_MIN_CR,
+                                        AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP,
 #if CONFIG_TUNE_VMAF
                                         AV1E_SET_VMAF_MODEL_PATH,
 #endif
@@ -1038,8 +1061,6 @@
   arg_show_usage(fout, global_args);
   fprintf(fout, "\nRate Control Options:\n");
   arg_show_usage(fout, rc_args);
-  fprintf(fout, "\nTwopass Rate Control Options:\n");
-  arg_show_usage(fout, rc_twopass_args);
   fprintf(fout, "\nKeyframe Placement Options:\n");
   arg_show_usage(fout, kf_args);
 #if CONFIG_AV1_ENCODER
@@ -1054,10 +1075,10 @@
 
   const int num_encoder = get_aom_encoder_count();
   for (int i = 0; i < num_encoder; ++i) {
-    const AvxInterface *const encoder = get_aom_encoder_by_index(i);
+    aom_codec_iface_t *encoder = get_aom_encoder_by_index(i);
     const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
-    fprintf(fout, "    %-6s - %s %s\n", encoder->name,
-            aom_codec_iface_name(encoder->codec_interface()), defstr);
+    fprintf(fout, "    %-6s - %s %s\n", get_short_name_by_aom_encoder(encoder),
+            aom_codec_iface_name(encoder), defstr);
   }
   fprintf(fout, "\n        ");
   fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
@@ -1104,10 +1125,10 @@
   FILE *file;
   struct rate_hist *rate_hist;
   struct WebmOutputContext webm_ctx;
-  uint64_t psnr_sse_total;
-  uint64_t psnr_samples_total;
-  double psnr_totals[4];
-  int psnr_count;
+  uint64_t psnr_sse_total[2];
+  uint64_t psnr_samples_total[2];
+  double psnr_totals[2][4];
+  int psnr_count[2];
   int counts[64];
   aom_codec_ctx_t encoder;
   unsigned int frames_out;
@@ -1157,6 +1178,7 @@
   global->passes = 0;
   global->color_type = I420;
   global->csp = AOM_CSP_UNKNOWN;
+  global->show_psnr = 0;
 
   int cfg_included = 0;
   init_config(&global->encoder_config);
@@ -1165,16 +1187,15 @@
     arg.argv_step = 1;
 
     if (arg_match(&arg, &use_cfg, argi)) {
-      if (cfg_included) continue;
-      parse_cfg(arg.val, &global->encoder_config);
-      cfg_included = 1;
-      continue;
-    }
-    if (arg_match(&arg, &help, argi)) {
+      if (!cfg_included) {
+        parse_cfg(arg.val, &global->encoder_config);
+        cfg_included = 1;
+      }
+    } else if (arg_match(&arg, &help, argi)) {
       show_help(stdout, 0);
       exit(EXIT_SUCCESS);
     } else if (arg_match(&arg, &codecarg, argi)) {
-      global->codec = get_aom_encoder_by_name(arg.val);
+      global->codec = get_aom_encoder_by_short_name(arg.val);
       if (!global->codec)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
     } else if (arg_match(&arg, &passes, argi)) {
@@ -1213,11 +1234,14 @@
       global->limit = arg_parse_uint(&arg);
     else if (arg_match(&arg, &skip, argi))
       global->skip_frames = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &psnrarg, argi))
-      global->show_psnr = 1;
-    else if (arg_match(&arg, &recontest, argi))
+    else if (arg_match(&arg, &psnrarg, argi)) {
+      if (arg.val)
+        global->show_psnr = arg_parse_int(&arg);
+      else
+        global->show_psnr = 1;
+    } else if (arg_match(&arg, &recontest, argi)) {
       global->test_decode = arg_parse_enum_or_int(&arg);
-    else if (arg_match(&arg, &framerate, argi)) {
+    } else if (arg_match(&arg, &framerate, argi)) {
       global->framerate = arg_parse_rational(&arg);
       validate_positive_rational(arg.name, &global->framerate);
       global->have_framerate = 1;
@@ -1248,11 +1272,12 @@
 #if CONFIG_AV1_ENCODER
     // Make default AV1 passes = 2 until there is a better quality 1-pass
     // encoder
-    if (global->codec != NULL && global->codec->name != NULL)
-      global->passes = (strcmp(global->codec->name, "av1") == 0 &&
-                        global->usage != AOM_USAGE_REALTIME)
-                           ? 2
-                           : 1;
+    if (global->codec != NULL)
+      global->passes =
+          (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0 &&
+           global->usage != AOM_USAGE_REALTIME)
+              ? 2
+              : 1;
 #else
     global->passes = 1;
 #endif
@@ -1333,8 +1358,8 @@
     aom_codec_err_t res;
 
     /* Populate encoder configuration */
-    res = aom_codec_enc_config_default(global->codec->codec_interface(),
-                                       &stream->config.cfg, global->usage);
+    res = aom_codec_enc_config_default(global->codec, &stream->config.cfg,
+                                       global->usage);
     if (res) fatal("Failed to get config: %s\n", aom_codec_err_to_string(res));
 
     /* Change the default timebase to a high enough value so that the
@@ -1422,7 +1447,7 @@
   // Handle codec specific options
   if (0) {
 #if CONFIG_AV1_ENCODER
-  } else if (strcmp(global->codec->name, "av1") == 0) {
+  } else if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) {
     // TODO(jingning): Reuse AV1 specific encoder configuration parameters.
     // Consider to expand this set for AV1 encoder control.
     ctrl_args = av1_args;
@@ -1504,19 +1529,20 @@
       config->cfg.g_error_resilient = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &lag_in_frames, argi)) {
       config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
-      if (global->usage == AOM_USAGE_REALTIME &&
-          config->cfg.rc_end_usage == AOM_CBR &&
-          config->cfg.g_lag_in_frames != 0) {
-        warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name);
-        config->cfg.g_lag_in_frames = 0;
-      }
     } else if (arg_match(&arg, &large_scale_tile, argi)) {
       config->cfg.large_scale_tile = arg_parse_uint(&arg);
-      if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder();
+      if (config->cfg.large_scale_tile) {
+        global->codec = get_aom_encoder_by_short_name("av1");
+      }
     } else if (arg_match(&arg, &monochrome, argi)) {
       config->cfg.monochrome = 1;
     } else if (arg_match(&arg, &full_still_picture_hdr, argi)) {
       config->cfg.full_still_picture_hdr = 1;
+    } else if (arg_match(&arg, &use_16bit_internal, argi)) {
+      config->use_16bit_internal = CONFIG_AV1_HIGHBITDEPTH;
+      if (!config->use_16bit_internal) {
+        warn("%s option ignored with CONFIG_AV1_HIGHBITDEPTH=0.\n", arg.name);
+      }
     } else if (arg_match(&arg, &dropframe_thresh, argi)) {
       config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &resize_mode, argi)) {
@@ -1615,13 +1641,18 @@
           if (ctrl_args_map) {
             set_config_arg_ctrls(config, ctrl_args_map[i], &arg);
           }
+          break;
         }
       }
       if (!match) argj++;
     }
   }
-  config->use_16bit_internal =
-      config->cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING;
+  config->use_16bit_internal |= config->cfg.g_bit_depth > AOM_BITS_8;
+
+  if (global->usage == AOM_USAGE_REALTIME && config->cfg.g_lag_in_frames != 0) {
+    warn("non-zero lag-in-frames option ignored in realtime mode.\n");
+    config->cfg.g_lag_in_frames = 0;
+  }
   return eos_mark_found;
 }
 
@@ -1714,8 +1745,7 @@
   fprintf(stderr, "    %-28s = %d\n", #field, stream->config.cfg.field)
 
   if (stream->index == 0) {
-    fprintf(stderr, "Codec: %s\n",
-            aom_codec_iface_name(global->codec->codec_interface()));
+    fprintf(stderr, "Codec: %s\n", aom_codec_iface_name(global->codec));
     fprintf(stderr, "Source file: %s File Type: %s Format: %s\n",
             input->filename, file_type_to_string(input->file_type),
             image_format_to_string(input->fmt));
@@ -1769,45 +1799,48 @@
 #define SHOW_PARAMS(field)                    \
   fprintf(stderr, "    %-28s = %d\n", #field, \
           stream->config.cfg.encoder_cfg.field)
-  SHOW_PARAMS(super_block_size);
-  SHOW_PARAMS(max_partition_size);
-  SHOW_PARAMS(min_partition_size);
-  SHOW_PARAMS(disable_ab_partition_type);
-  SHOW_PARAMS(disable_rect_partition_type);
-  SHOW_PARAMS(disable_1to4_partition_type);
-  SHOW_PARAMS(disable_flip_idtx);
-  SHOW_PARAMS(disable_cdef);
-  SHOW_PARAMS(disable_lr);
-  SHOW_PARAMS(disable_obmc);
-  SHOW_PARAMS(disable_warp_motion);
-  SHOW_PARAMS(disable_global_motion);
-  SHOW_PARAMS(disable_dist_wtd_comp);
-  SHOW_PARAMS(disable_diff_wtd_comp);
-  SHOW_PARAMS(disable_inter_intra_comp);
-  SHOW_PARAMS(disable_masked_comp);
-  SHOW_PARAMS(disable_one_sided_comp);
-  SHOW_PARAMS(disable_palette);
-  SHOW_PARAMS(disable_intrabc);
-  SHOW_PARAMS(disable_cfl);
-  SHOW_PARAMS(disable_smooth_intra);
-  SHOW_PARAMS(disable_filter_intra);
-  SHOW_PARAMS(disable_dual_filter);
-  SHOW_PARAMS(disable_intra_angle_delta);
-  SHOW_PARAMS(disable_intra_edge_filter);
-  SHOW_PARAMS(disable_tx_64x64);
-  SHOW_PARAMS(disable_smooth_inter_intra);
-  SHOW_PARAMS(disable_inter_inter_wedge);
-  SHOW_PARAMS(disable_inter_intra_wedge);
-  SHOW_PARAMS(disable_paeth_intra);
-  SHOW_PARAMS(disable_trellis_quant);
-  SHOW_PARAMS(disable_ref_frame_mv);
-  SHOW_PARAMS(reduced_reference_set);
-  SHOW_PARAMS(reduced_tx_type_set);
+  if (global->encoder_config.init_by_cfg_file) {
+    SHOW_PARAMS(super_block_size);
+    SHOW_PARAMS(max_partition_size);
+    SHOW_PARAMS(min_partition_size);
+    SHOW_PARAMS(disable_ab_partition_type);
+    SHOW_PARAMS(disable_rect_partition_type);
+    SHOW_PARAMS(disable_1to4_partition_type);
+    SHOW_PARAMS(disable_flip_idtx);
+    SHOW_PARAMS(disable_cdef);
+    SHOW_PARAMS(disable_lr);
+    SHOW_PARAMS(disable_obmc);
+    SHOW_PARAMS(disable_warp_motion);
+    SHOW_PARAMS(disable_global_motion);
+    SHOW_PARAMS(disable_dist_wtd_comp);
+    SHOW_PARAMS(disable_diff_wtd_comp);
+    SHOW_PARAMS(disable_inter_intra_comp);
+    SHOW_PARAMS(disable_masked_comp);
+    SHOW_PARAMS(disable_one_sided_comp);
+    SHOW_PARAMS(disable_palette);
+    SHOW_PARAMS(disable_intrabc);
+    SHOW_PARAMS(disable_cfl);
+    SHOW_PARAMS(disable_smooth_intra);
+    SHOW_PARAMS(disable_filter_intra);
+    SHOW_PARAMS(disable_dual_filter);
+    SHOW_PARAMS(disable_intra_angle_delta);
+    SHOW_PARAMS(disable_intra_edge_filter);
+    SHOW_PARAMS(disable_tx_64x64);
+    SHOW_PARAMS(disable_smooth_inter_intra);
+    SHOW_PARAMS(disable_inter_inter_wedge);
+    SHOW_PARAMS(disable_inter_intra_wedge);
+    SHOW_PARAMS(disable_paeth_intra);
+    SHOW_PARAMS(disable_trellis_quant);
+    SHOW_PARAMS(disable_ref_frame_mv);
+    SHOW_PARAMS(reduced_reference_set);
+    SHOW_PARAMS(reduced_tx_type_set);
+  }
 }
 
 static void open_output_file(struct stream_state *stream,
                              struct AvxEncoderConfig *global,
-                             const struct AvxRational *pixel_aspect_ratio) {
+                             const struct AvxRational *pixel_aspect_ratio,
+                             const char *encoder_settings) {
   const char *fn = stream->config.out_fn;
   const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg;
 
@@ -1824,17 +1857,20 @@
   if (stream->config.write_webm) {
     stream->webm_ctx.stream = stream->file;
     if (write_webm_file_header(&stream->webm_ctx, &stream->encoder, cfg,
-                               stream->config.stereo_fmt, global->codec->fourcc,
-                               pixel_aspect_ratio) != 0) {
+                               stream->config.stereo_fmt,
+                               get_fourcc_by_aom_encoder(global->codec),
+                               pixel_aspect_ratio, encoder_settings) != 0) {
       fatal("WebM writer initialization failed.");
     }
   }
 #else
   (void)pixel_aspect_ratio;
+  (void)encoder_settings;
 #endif
 
   if (!stream->config.write_webm && stream->config.write_ivf) {
-    ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0);
+    ivf_write_file_header(stream->file, cfg,
+                          get_fourcc_by_aom_encoder(global->codec), 0);
   }
 }
 
@@ -1888,12 +1924,12 @@
   int i;
   int flags = 0;
 
-  flags |= global->show_psnr ? AOM_CODEC_USE_PSNR : 0;
+  flags |= (global->show_psnr >= 1) ? AOM_CODEC_USE_PSNR : 0;
   flags |= stream->config.use_16bit_internal ? AOM_CODEC_USE_HIGHBITDEPTH : 0;
 
   /* Construct Encoder Context */
-  aom_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
-                     &stream->config.cfg, flags);
+  aom_codec_enc_init(&stream->encoder, global->codec, &stream->config.cfg,
+                     flags);
   ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder");
 
   for (i = 0; i < stream->config.arg_ctrl_cnt; i++) {
@@ -1919,11 +1955,12 @@
 
 #if CONFIG_AV1_DECODER
   if (global->test_decode != TEST_DECODE_OFF) {
-    const AvxInterface *decoder = get_aom_decoder_by_name(global->codec->name);
-    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
-    aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0);
+    aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(
+        get_short_name_by_aom_encoder(global->codec));
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !stream->config.use_16bit_internal };
+    aom_codec_dec_init(&stream->decoder, decoder, &cfg, 0);
 
-    if (strcmp(global->codec->name, "av1") == 0) {
+    if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) {
       AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_TILE_MODE,
                                     stream->config.cfg.large_scale_tile);
       ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode");
@@ -2113,17 +2150,31 @@
         break;
       case AOM_CODEC_PSNR_PKT:
 
-        if (global->show_psnr) {
+        if (global->show_psnr >= 1) {
           int i;
 
-          stream->psnr_sse_total += pkt->data.psnr.sse[0];
-          stream->psnr_samples_total += pkt->data.psnr.samples[0];
+          stream->psnr_sse_total[0] += pkt->data.psnr.sse[0];
+          stream->psnr_samples_total[0] += pkt->data.psnr.samples[0];
           for (i = 0; i < 4; i++) {
             if (!global->quiet)
               fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]);
-            stream->psnr_totals[i] += pkt->data.psnr.psnr[i];
+            stream->psnr_totals[0][i] += pkt->data.psnr.psnr[i];
           }
-          stream->psnr_count++;
+          stream->psnr_count[0]++;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+          if (stream->config.cfg.g_input_bit_depth <
+              (unsigned int)stream->config.cfg.g_bit_depth) {
+            stream->psnr_sse_total[1] += pkt->data.psnr.sse_hbd[0];
+            stream->psnr_samples_total[1] += pkt->data.psnr.samples_hbd[0];
+            for (i = 0; i < 4; i++) {
+              if (!global->quiet)
+                fprintf(stderr, "%.3f ", pkt->data.psnr.psnr_hbd[i]);
+              stream->psnr_totals[1][i] += pkt->data.psnr.psnr_hbd[i];
+            }
+            stream->psnr_count[1]++;
+          }
+#endif
         }
 
         break;
@@ -2136,15 +2187,15 @@
   int i;
   double ovpsnr;
 
-  if (!stream->psnr_count) return;
+  if (!stream->psnr_count[0]) return;
 
   fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
-  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak,
-                       (double)stream->psnr_sse_total);
+  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total[0], peak,
+                       (double)stream->psnr_sse_total[0]);
   fprintf(stderr, " %.3f", ovpsnr);
 
   for (i = 0; i < 4; i++) {
-    fprintf(stderr, " %.3f", stream->psnr_totals[i] / stream->psnr_count);
+    fprintf(stderr, " %.3f", stream->psnr_totals[0][i] / stream->psnr_count[0]);
   }
   if (bps > 0) {
     fprintf(stderr, " %7" PRId64 " bps", bps);
@@ -2153,6 +2204,30 @@
   fprintf(stderr, "\n");
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+static void show_psnr_hbd(struct stream_state *stream, double peak,
+                          int64_t bps) {
+  int i;
+  double ovpsnr;
+  // Compute PSNR based on stream bit depth
+  if (!stream->psnr_count[1]) return;
+
+  fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
+  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total[1], peak,
+                       (double)stream->psnr_sse_total[1]);
+  fprintf(stderr, " %.3f", ovpsnr);
+
+  for (i = 0; i < 4; i++) {
+    fprintf(stderr, " %.3f", stream->psnr_totals[1][i] / stream->psnr_count[1]);
+  }
+  if (bps > 0) {
+    fprintf(stderr, " %7" PRId64 " bps", bps);
+  }
+  fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000);
+  fprintf(stderr, "\n");
+}
+#endif
+
 static float usec_to_fps(uint64_t usec, unsigned int frames) {
   return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
 }
@@ -2236,7 +2311,7 @@
   aom_image_t raw;
   aom_image_t raw_shift;
   int allocated_raw_shift = 0;
-  int use_16bit_internal = 0;
+  int do_16bit_internal = 0;
   int input_shift = 0;
   int frame_avail, got_data;
 
@@ -2311,7 +2386,8 @@
   }
 
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
-  if (global.codec->fourcc == AV1_FOURCC) input.only_i420 = 0;
+  if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC)
+    input.only_i420 = 0;
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
     int frames_in = 0, seen_frames = 0;
@@ -2435,6 +2511,11 @@
                   stream->config.cfg.g_input_bit_depth);
         }
       }
+#if !CONFIG_AV1_HIGHBITDEPTH
+      if (stream->config.cfg.g_bit_depth > 8) {
+        fatal("Unsupported bit-depth with CONFIG_AV1_HIGHBITDEPTH=0\n");
+      }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
       if (stream->config.cfg.g_bit_depth > 10) {
         switch (stream->config.cfg.g_profile) {
           case 0:
@@ -2454,6 +2535,20 @@
                 "match input format.\n",
                 stream->config.cfg.g_profile);
       }
+      if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth ==
+                                      stream->config.cfg.g_bit_depth)) {
+        fprintf(stderr,
+                "Warning: --psnr==2 and --psnr==1 will provide same "
+                "results when input bit-depth == stream bit-depth, "
+                "falling back to default psnr value\n");
+        global.show_psnr = 1;
+      }
+      if (global.show_psnr < 0 || global.show_psnr > 2) {
+        fprintf(stderr,
+                "Warning: --psnr can take only 0,1,2 as values,"
+                "falling back to default psnr value\n");
+        global.show_psnr = 1;
+      }
       /* Set limit */
       stream->config.cfg.g_limit = global.limit;
     }
@@ -2521,17 +2616,37 @@
     FOREACH_STREAM(stream, streams) { setup_pass(stream, &global, pass); }
     FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); }
     FOREACH_STREAM(stream, streams) {
-      open_output_file(stream, &global, &input.pixel_aspect_ratio);
+      char *encoder_settings = NULL;
+#if CONFIG_WEBM_IO
+      // Test frameworks may compare outputs from different versions, but only
+      // wish to check for bitstream changes. The encoder-settings tag, however,
+      // can vary if the version is updated, even if no encoder algorithm
+      // changes were made. To work around this issue, do not output
+      // the encoder-settings tag when --debug is enabled (which is the flag
+      // that test frameworks should use, when they want deterministic output
+      // from the container format).
+      if (stream->config.write_webm && !stream->webm_ctx.debug) {
+        encoder_settings = extract_encoder_settings(
+            aom_codec_version_str(), argv_, argc, input.filename);
+        if (encoder_settings == NULL) {
+          fprintf(
+              stderr,
+              "Warning: unable to extract encoder settings. Continuing...\n");
+        }
+      }
+#endif
+      open_output_file(stream, &global, &input.pixel_aspect_ratio,
+                       encoder_settings);
+      free(encoder_settings);
     }
 
-    if (strcmp(global.codec->name, "av1") == 0 ||
-        strcmp(global.codec->name, "av1") == 0) {
+    if (strcmp(get_short_name_by_aom_encoder(global.codec), "av1") == 0) {
       // Check to see if at least one stream uses 16 bit internal.
       // Currently assume that the bit_depths for all streams using
       // highbitdepth are the same.
       FOREACH_STREAM(stream, streams) {
         if (stream->config.use_16bit_internal) {
-          use_16bit_internal = 1;
+          do_16bit_internal = 1;
         }
         input_shift = (int)stream->config.cfg.g_bit_depth -
                       stream->config.cfg.g_input_bit_depth;
@@ -2574,8 +2689,8 @@
 
       if (frames_in > global.skip_frames) {
         aom_image_t *frame_to_encode;
-        if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
-          assert(use_16bit_internal);
+        if (input_shift || (do_16bit_internal && input.bit_depth == 8)) {
+          assert(do_16bit_internal);
           // Input bit depth and stream bit depth do not match, so up
           // shift frame to stream bit depth
           if (!allocated_raw_shift) {
@@ -2589,7 +2704,7 @@
           frame_to_encode = &raw;
         }
         aom_usec_timer_start(&timer);
-        if (use_16bit_internal) {
+        if (do_16bit_internal) {
           assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
           FOREACH_STREAM(stream, streams) {
             if (stream->config.use_16bit_internal)
@@ -2674,16 +2789,27 @@
       }
     }
 
-    if (global.show_psnr) {
-      if (global.codec->fourcc == AV1_FOURCC) {
+    if (global.show_psnr >= 1) {
+      if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC) {
         FOREACH_STREAM(stream, streams) {
           int64_t bps = 0;
-          if (stream->psnr_count && seen_frames && global.framerate.den) {
-            bps = (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num /
-                  global.framerate.den / seen_frames;
+          if (global.show_psnr == 1) {
+            if (stream->psnr_count[0] && seen_frames && global.framerate.den) {
+              bps = (int64_t)stream->nbytes * 8 *
+                    (int64_t)global.framerate.num / global.framerate.den /
+                    seen_frames;
+            }
+            show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1,
+                      bps);
           }
-          show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1,
-                    bps);
+          if (global.show_psnr == 2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+            if (stream->config.cfg.g_input_bit_depth <
+                (unsigned int)stream->config.cfg.g_bit_depth)
+              show_psnr_hbd(stream, (1 << stream->config.cfg.g_bit_depth) - 1,
+                            bps);
+#endif
+          }
         }
       } else {
         FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); }
@@ -2702,7 +2828,7 @@
       FOREACH_STREAM(stream, streams) { res |= stream->mismatch_seen; }
     }
     FOREACH_STREAM(stream, streams) {
-      close_output_file(stream, global.codec->fourcc);
+      close_output_file(stream, get_fourcc_by_aom_encoder(global.codec));
     }
 
     FOREACH_STREAM(stream, streams) {
diff --git a/apps/aomenc.h b/apps/aomenc.h
index a38258b..8d4c25c 100644
--- a/apps/aomenc.h
+++ b/apps/aomenc.h
@@ -31,11 +31,9 @@
   YV12,  // 4:2:0 with uv flipped, only 8-bit depth
 } ColorInputType;
 
-struct AvxInterface;
-
 /* Configuration elements common to all streams. */
 struct AvxEncoderConfig {
-  const struct AvxInterface *codec;
+  aom_codec_iface_t *codec;
   int passes;
   int pass;
   unsigned int usage;
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 2ab3496..9187d20 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -88,6 +88,13 @@
             "${AOM_ROOT}/av1/common/warped_motion.c"
             "${AOM_ROOT}/av1/common/warped_motion.h")
 
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/restoration.c"
+                   "${AOM_ROOT}/av1/common/restoration.h"
+                   "${AOM_ROOT}/av1/common/warped_motion.c"
+                   "${AOM_ROOT}/av1/common/warped_motion.h")
+endif()
+
 if(CONFIG_LPF_MASK)
   list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/loopfiltermask.c")
 endif()
@@ -121,8 +128,6 @@
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c"
-            "${AOM_ROOT}/av1/encoder/av1_multi_thread.c"
-            "${AOM_ROOT}/av1/encoder/av1_multi_thread.h"
             "${AOM_ROOT}/av1/encoder/av1_quantize.c"
             "${AOM_ROOT}/av1/encoder/av1_quantize.h"
             "${AOM_ROOT}/av1/encoder/bitstream.c"
@@ -142,6 +147,8 @@
             "${AOM_ROOT}/av1/encoder/cost.h"
             "${AOM_ROOT}/av1/encoder/encodeframe.c"
             "${AOM_ROOT}/av1/encoder/encodeframe.h"
+            "${AOM_ROOT}/av1/encoder/encodeframe_utils.c"
+            "${AOM_ROOT}/av1/encoder/encodeframe_utils.h"
             "${AOM_ROOT}/av1/encoder/encodemb.c"
             "${AOM_ROOT}/av1/encoder/encodemb.h"
             "${AOM_ROOT}/av1/encoder/encodemv.c"
@@ -150,6 +157,9 @@
             "${AOM_ROOT}/av1/encoder/encode_strategy.h"
             "${AOM_ROOT}/av1/encoder/encoder.c"
             "${AOM_ROOT}/av1/encoder/encoder.h"
+            "${AOM_ROOT}/av1/encoder/encoder_alloc.h"
+            "${AOM_ROOT}/av1/encoder/encoder_utils.c"
+            "${AOM_ROOT}/av1/encoder/encoder_utils.h"
             "${AOM_ROOT}/av1/encoder/encodetxb.c"
             "${AOM_ROOT}/av1/encoder/encodetxb.h"
             "${AOM_ROOT}/av1/encoder/ethread.c"
@@ -160,6 +170,8 @@
             "${AOM_ROOT}/av1/encoder/firstpass.h"
             "${AOM_ROOT}/av1/encoder/global_motion.c"
             "${AOM_ROOT}/av1/encoder/global_motion.h"
+            "${AOM_ROOT}/av1/encoder/global_motion_facade.c"
+            "${AOM_ROOT}/av1/encoder/global_motion_facade.h"
             "${AOM_ROOT}/av1/encoder/gop_structure.c"
             "${AOM_ROOT}/av1/encoder/gop_structure.h"
             "${AOM_ROOT}/av1/encoder/grain_test_vectors.h"
@@ -186,11 +198,14 @@
             "${AOM_ROOT}/av1/encoder/mv_prec.h"
             "${AOM_ROOT}/av1/encoder/palette.c"
             "${AOM_ROOT}/av1/encoder/palette.h"
+            "${AOM_ROOT}/av1/encoder/partition_search.h"
+            "${AOM_ROOT}/av1/encoder/partition_search.c"
             "${AOM_ROOT}/av1/encoder/partition_strategy.h"
             "${AOM_ROOT}/av1/encoder/partition_strategy.c"
             "${AOM_ROOT}/av1/encoder/pass2_strategy.h"
             "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
             "${AOM_ROOT}/av1/encoder/pickcdef.c"
+            "${AOM_ROOT}/av1/encoder/pickcdef.h"
             "${AOM_ROOT}/av1/encoder/picklpf.c"
             "${AOM_ROOT}/av1/encoder/picklpf.h"
             "${AOM_ROOT}/av1/encoder/pickrst.c"
@@ -199,6 +214,7 @@
             "${AOM_ROOT}/av1/encoder/ransac.h"
             "${AOM_ROOT}/av1/encoder/ratectrl.c"
             "${AOM_ROOT}/av1/encoder/ratectrl.h"
+            "${AOM_ROOT}/av1/encoder/rc_utils.h"
             "${AOM_ROOT}/av1/encoder/rd.c"
             "${AOM_ROOT}/av1/encoder/rd.h"
             "${AOM_ROOT}/av1/encoder/rdopt.c"
@@ -212,6 +228,8 @@
             "${AOM_ROOT}/av1/encoder/segmentation.h"
             "${AOM_ROOT}/av1/encoder/speed_features.c"
             "${AOM_ROOT}/av1/encoder/speed_features.h"
+            "${AOM_ROOT}/av1/encoder/superres_scale.c"
+            "${AOM_ROOT}/av1/encoder/superres_scale.h"
             "${AOM_ROOT}/av1/encoder/svc_layercontext.c"
             "${AOM_ROOT}/av1/encoder/svc_layercontext.h"
             "${AOM_ROOT}/av1/encoder/temporal_filter.c"
@@ -224,9 +242,12 @@
             "${AOM_ROOT}/av1/encoder/tx_search.h"
             "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
             "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
+            "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h"
             "${AOM_ROOT}/av1/encoder/wedge_utils.c"
             "${AOM_ROOT}/av1/encoder/var_based_part.c"
             "${AOM_ROOT}/av1/encoder/var_based_part.h"
+            "${AOM_ROOT}/av1/encoder/av1_noise_estimate.c"
+            "${AOM_ROOT}/av1/encoder/av1_noise_estimate.h"
             "${AOM_ROOT}/third_party/fastfeat/fast.c"
             "${AOM_ROOT}/third_party/fastfeat/fast.h"
             "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
@@ -241,17 +262,26 @@
               "${AOM_ROOT}/av1/encoder/tune_vmaf.h")
 endif()
 
+if(CONFIG_OPTICAL_FLOW_API)
+  list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/optical_flow.c"
+              "${AOM_ROOT}/av1/encoder/optical_flow.h")
+endif()
+
 list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
             "${AOM_ROOT}/av1/common/cdef_block_sse2.c"
             "${AOM_ROOT}/av1/common/x86/cfl_sse2.c"
             "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
             "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
             "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
             "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
 
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE2
+                   "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
+endif()
+
 if(NOT CONFIG_AV1_HIGHBITDEPTH)
   list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE2
                    "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c")
@@ -265,7 +295,8 @@
             "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
             "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
-            "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
+            "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c"
+            "${AOM_ROOT}/av1/common/x86/resize_ssse3.c")
 
 if(NOT CONFIG_AV1_HIGHBITDEPTH)
   list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSSE3
@@ -294,6 +325,13 @@
                    "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c")
 endif()
 
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE4_1
+                   "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c"
+                   "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c"
+                   "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c")
+endif()
+
 list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
             "${AOM_ROOT}/av1/common/cdef_block_avx2.c"
             "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c"
@@ -305,6 +343,7 @@
             "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
             "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c"
             "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
             "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
             "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
@@ -313,9 +352,17 @@
 
 if(NOT CONFIG_AV1_HIGHBITDEPTH)
   list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_AVX2
+                   "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c"
                    "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c")
 endif()
 
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_AVX2
+                   "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c"
+                   "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
+                   "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c")
+endif()
+
 list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
             "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm")
 
@@ -326,12 +373,14 @@
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
 if(NOT CONFIG_AV1_HIGHBITDEPTH)
   list(
     REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2
-                "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c")
+                "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+                "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c")
 endif()
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c")
@@ -347,10 +396,13 @@
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
 
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE4_1
+                   "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
+endif()
+
 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
@@ -363,18 +415,33 @@
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/av1_k_means_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
 
 if(NOT CONFIG_AV1_HIGHBITDEPTH)
   list(
     REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2
-                "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c")
+                "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
+                "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")
+endif()
+
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2
+                   "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
 endif()
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
             "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c")
+            "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/picksrt_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
             "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
@@ -392,13 +459,22 @@
             "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
             "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
             "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
+            "${AOM_ROOT}/av1/common/arm/reconintra_neon.c"
+            "${AOM_ROOT}/av1/common/arm/resize_neon.c"
             "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
             "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
             "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
             "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
+            "${AOM_ROOT}/av1/common/arm/highbd_inv_txfm_neon.c"
             "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
             "${AOM_ROOT}/av1/common/cdef_block_neon.c")
 
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_NEON
+                   "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
+                   "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c")
+endif()
+
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
             "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
 
@@ -424,18 +500,22 @@
                    "${AOM_ROOT}/av1/encoder/cnn.h"
                    "${AOM_ROOT}/av1/encoder/firstpass.c"
                    "${AOM_ROOT}/av1/encoder/firstpass.h"
+                   "${AOM_ROOT}/av1/encoder/global_motion.c"
+                   "${AOM_ROOT}/av1/encoder/global_motion.h"
+                   "${AOM_ROOT}/av1/encoder/global_motion_facade.c"
+                   "${AOM_ROOT}/av1/encoder/global_motion_facade.h"
                    "${AOM_ROOT}/av1/encoder/gop_structure.c"
                    "${AOM_ROOT}/av1/encoder/gop_structure.h"
                    "${AOM_ROOT}/av1/encoder/misc_model_weights.h"
                    "${AOM_ROOT}/av1/encoder/partition_cnn_weights.h"
                    "${AOM_ROOT}/av1/encoder/partition_model_weights.h"
                    "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
+                   "${AOM_ROOT}/av1/encoder/picklpf.h"
+                   "${AOM_ROOT}/av1/encoder/pickrst.c"
                    "${AOM_ROOT}/av1/encoder/temporal_filter.c"
                    "${AOM_ROOT}/av1/encoder/temporal_filter.h"
-                   "${AOM_ROOT}/av1/encoder/temporal_filter_constants.h"
                    "${AOM_ROOT}/av1/encoder/tpl_model.c"
-                   "${AOM_ROOT}/av1/encoder/tpl_model.h"
-                   "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c")
+                   "${AOM_ROOT}/av1/encoder/tpl_model.h")
 endif()
 
 # Setup AV1 common/decoder/encoder targets. The libaom target must exist before
@@ -555,10 +635,12 @@
                                     "AOM_AV1_COMMON_INTRIN_NEON")
     endif()
 
-    if(AOM_AV1_ENCODER_INTRIN_NEON)
-      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
-                                    "aom_av1_encoder"
-                                    "AOM_AV1_ENCODER_INTRIN_NEON")
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_AV1_ENCODER_INTRIN_NEON)
+        add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                      "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_NEON")
+      endif()
     endif()
   endif()
 
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 676eaa0..d3d7a04 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -24,6 +24,7 @@
 #include "av1/av1_iface_common.h"
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
 #include "av1/encoder/firstpass.h"
 
 #define MAG_SIZE (4)
@@ -102,8 +103,9 @@
   int enable_order_hint;         // enable order hint for sequence
   int enable_tx64;               // enable 64-pt transform usage for sequence
   int enable_flip_idtx;          // enable flip and identity transform types
-  int enable_dist_wtd_comp;      // enable dist wtd compound for sequence
-  int max_reference_frames;      // maximum number of references per frame
+  int enable_rect_tx;        // enable rectangular transform usage for sequence
+  int enable_dist_wtd_comp;  // enable dist wtd compound for sequence
+  int max_reference_frames;  // maximum number of references per frame
   int enable_reduced_reference_set;  // enable reduced set of references
   int enable_ref_frame_mvs;          // sequence level
   int allow_ref_frame_mvs;           // frame level
@@ -138,6 +140,7 @@
   int use_inter_dct_only;
   int use_intra_default_tx_only;
   int quant_b_adapt;
+  unsigned int vbr_corpus_complexity_lap;
   AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
   // Bit mask to specify which tier each of the 32 possible operating points
   // conforms to.
@@ -196,10 +199,10 @@
   NO_AQ,                        // aq_mode
   DELTA_Q_OBJECTIVE,            // deltaq_mode
   0,                            // delta lf mode
-  0,                            // frame_periodic_delta_q
+  0,                            // frame_periodic_boost
   AOM_BITS_8,                   // Bit depth
   AOM_CONTENT_DEFAULT,          // content
-  AOM_CICP_CP_UNSPECIFIED,      // CICP color space
+  AOM_CICP_CP_UNSPECIFIED,      // CICP color primaries
   AOM_CICP_TC_UNSPECIFIED,      // CICP transfer characteristics
   AOM_CICP_MC_UNSPECIFIED,      // CICP matrix coefficients
   AOM_CSP_UNKNOWN,              // chroma sample position
@@ -223,6 +226,7 @@
   1,                            // frame order hint
   1,                            // enable 64-pt transform usage
   1,                            // enable flip and identity transform
+  1,                            // enable rectangular transform usage
   1,                            // dist-wtd compound
   7,                            // max_reference_frames
   0,                            // enable_reduced_reference_set
@@ -258,6 +262,7 @@
   0,  // use_inter_dct_only
   0,  // use_intra_default_tx_only
   0,  // quant_b_adapt
+  0,  // vbr_corpus_complexity_lap
   {
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
@@ -308,7 +313,7 @@
 };
 
 static INLINE int gcd(int64_t a, int b) {
-  int remainder;  // remainder
+  int remainder;
   while (b > 0) {
     remainder = (int)(a % b);
     a = b;
@@ -406,7 +411,7 @@
               SCALE_NUMERATOR << 1);
   RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR,
               SCALE_NUMERATOR << 1);
-  RANGE_CHECK_HI(cfg, rc_superres_mode, SUPERRES_MODES - 1);
+  RANGE_CHECK_HI(cfg, rc_superres_mode, AOM_SUPERRES_AUTO);
   RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR,
               SCALE_NUMERATOR << 1);
   RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR,
@@ -415,20 +420,12 @@
   RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63);
   RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2);
 
-  // AV1 does not support a lower bound on the keyframe interval in
-  // automatic keyframe placement mode.
-  if (cfg->kf_mode != AOM_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist &&
-      cfg->kf_min_dist > 0)
-    ERROR(
-        "kf_min_dist not supported in auto mode, use 0 "
-        "or kf_max_dist instead.");
-
   RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2);
   RANGE_CHECK_HI(extra_cfg, sb_multipass_unit_test, 1);
   RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1);
   RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1);
   RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
-  RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
+  RANGE_CHECK(extra_cfg, cpu_used, 0, 9);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
               AOM_SUPERBLOCK_SIZE_DYNAMIC);
@@ -510,18 +507,36 @@
               AOM_CICP_MC_ICTCP);
   RANGE_CHECK(extra_cfg, color_range, 0, 1);
 
+  /* Average corpus complexity is supported only in the case of single pass
+   * VBR*/
+  if (cfg->g_pass == AOM_RC_ONE_PASS && cfg->rc_end_usage == AOM_VBR)
+    RANGE_CHECK_HI(extra_cfg, vbr_corpus_complexity_lap,
+                   MAX_VBR_CORPUS_COMPLEXITY);
+  else if (extra_cfg->vbr_corpus_complexity_lap != 0)
+    ERROR(
+        "VBR corpus complexity is supported only in the case of single pass "
+        "VBR mode.");
+
 #if !CONFIG_TUNE_VMAF
-  if (extra_cfg->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
-      extra_cfg->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-      extra_cfg->tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+  if (extra_cfg->tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+      extra_cfg->tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
     ERROR(
         "This error may be related to the wrong configuration options: try to "
         "set -DCONFIG_TUNE_VMAF=1 at the time CMake is run.");
   }
 #endif
 
+#if !CONFIG_USE_VMAF_RC
+  if (extra_cfg->tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    ERROR(
+        "This error may be related to the wrong configuration options: try to "
+        "set -DCONFIG_TUNE_VMAF=1 and -DCONFIG_USE_VMAF_RC=1 at the time CMake"
+        " is run.");
+  }
+#endif
+
 #if CONFIG_TUNE_VMAF
-  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_VMAF_MAX_GAIN);
+  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_VMAF_NEG_MAX_GAIN);
 #else
   RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM);
 #endif
@@ -538,19 +553,14 @@
       ERROR("Only --enable_chroma_deltaq=0 can be used with --lossless=1.");
   }
 
-  if (cfg->rc_resize_mode != RESIZE_NONE &&
-      extra_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
-    ERROR("--aq_mode=3 is only supported for --resize-mode=0.");
-  }
-
   RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7);
   RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1);
   RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
   RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
 
   RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3);
-  RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2);
-  RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2);
+  RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 3);
+  RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 3);
   RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3);
 
   RANGE_CHECK(extra_cfg, min_partition_size, 4, 128);
@@ -597,10 +607,6 @@
   if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h)
     ERROR("Image size must match encoder init configuration size");
 
-  if (img->fmt != AOM_IMG_FMT_I420 && !ctx->extra_cfg.enable_tx64) {
-    ERROR("TX64 can only be disabled on I420 images.");
-  }
-
   return AOM_CODEC_OK;
 }
 
@@ -620,12 +626,12 @@
 }
 
 // Set appropriate options to disable frame super-resolution.
-static void disable_superres(AV1EncoderConfig *const oxcf) {
-  oxcf->superres_mode = SUPERRES_NONE;
-  oxcf->superres_scale_denominator = SCALE_NUMERATOR;
-  oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
-  oxcf->superres_qthresh = 255;
-  oxcf->superres_kf_qthresh = 255;
+static void disable_superres(SuperResCfg *const superres_cfg) {
+  superres_cfg->superres_mode = AOM_SUPERRES_NONE;
+  superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+  superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+  superres_cfg->superres_qthresh = 255;
+  superres_cfg->superres_kf_qthresh = 255;
 }
 
 static void update_default_encoder_config(const cfg_options_t *cfg,
@@ -695,32 +701,87 @@
     update_default_encoder_config(&cfg->encoder_cfg, extra_cfg);
   }
 
+  TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+  FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+
+  TileConfig *const tile_cfg = &oxcf->tile_cfg;
+
+  ResizeCfg *const resize_cfg = &oxcf->resize_cfg;
+
+  GFConfig *const gf_cfg = &oxcf->gf_cfg;
+
+  PartitionCfg *const part_cfg = &oxcf->part_cfg;
+
+  IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg;
+
+  TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg;
+
+  CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg;
+
+  SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
+
+  KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+
+  DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+
+  RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+
+  ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+  InputCfg *const input_cfg = &oxcf->input_cfg;
+
+  AlgoCfg *const algo_cfg = &oxcf->algo_cfg;
+
+  ToolCfg *const tool_cfg = &oxcf->tool_cfg;
+
   const int is_vbr = cfg->rc_end_usage == AOM_VBR;
   oxcf->profile = cfg->g_profile;
-  oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
   oxcf->max_threads = (int)cfg->g_threads;
   oxcf->mode = (cfg->g_usage == AOM_USAGE_REALTIME) ? REALTIME : GOOD;
-  oxcf->width = cfg->g_w;
-  oxcf->height = cfg->g_h;
-  oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
-  oxcf->forced_max_frame_height = cfg->g_forced_max_frame_height;
-  oxcf->bit_depth = cfg->g_bit_depth;
-  oxcf->input_bit_depth = cfg->g_input_bit_depth;
+
+  // Set frame-dimension related configuration.
+  frm_dim_cfg->width = cfg->g_w;
+  frm_dim_cfg->height = cfg->g_h;
+  frm_dim_cfg->forced_max_frame_width = cfg->g_forced_max_frame_width;
+  frm_dim_cfg->forced_max_frame_height = cfg->g_forced_max_frame_height;
+  frm_dim_cfg->render_width = extra_cfg->render_width;
+  frm_dim_cfg->render_height = extra_cfg->render_height;
+
+  // Set input video related configuration.
+  input_cfg->input_bit_depth = cfg->g_input_bit_depth;
   // guess a frame rate if out of whack, use 30
-  oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  input_cfg->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  if (cfg->g_pass == AOM_RC_LAST_PASS) {
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+    input_cfg->limit = n_packets - 1;
+  } else {
+    input_cfg->limit = cfg->g_limit;
+  }
+  input_cfg->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
+  input_cfg->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
+  if (input_cfg->init_framerate > 180) {
+    input_cfg->init_framerate = 30;
+    dec_model_cfg->timing_info_present = 0;
+  }
+
+  // Set Decoder model configuration.
   if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL ||
       extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
-    oxcf->timing_info_present = 1;
-    oxcf->timing_info.num_units_in_display_tick = cfg->g_timebase.num;
-    oxcf->timing_info.time_scale = cfg->g_timebase.den;
-    oxcf->timing_info.num_ticks_per_picture = 1;
+    dec_model_cfg->timing_info_present = 1;
+    dec_model_cfg->timing_info.num_units_in_display_tick = cfg->g_timebase.num;
+    dec_model_cfg->timing_info.time_scale = cfg->g_timebase.den;
+    dec_model_cfg->timing_info.num_ticks_per_picture = 1;
   } else {
-    oxcf->timing_info_present = 0;
+    dec_model_cfg->timing_info_present = 0;
   }
   if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) {
-    oxcf->timing_info.equal_picture_interval = 1;
-    oxcf->decoder_model_info_present_flag = 0;
-    oxcf->display_model_info_present_flag = 1;
+    dec_model_cfg->timing_info.equal_picture_interval = 1;
+    dec_model_cfg->decoder_model_info_present_flag = 0;
+    dec_model_cfg->display_model_info_present_flag = 1;
   } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
     //    if( extra_cfg->arnr_strength > 0 )
     //    {
@@ -732,17 +793,11 @@
     //      printf("Only --superres-mode=0 can currently be used with
     //      --timing-info=model."); return AOM_CODEC_INVALID_PARAM;
     //    }
-    oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num;
-    oxcf->timing_info.equal_picture_interval = 0;
-    oxcf->decoder_model_info_present_flag = 1;
-    oxcf->buffer_removal_time_present = 1;
-    oxcf->display_model_info_present_flag = 1;
+    dec_model_cfg->num_units_in_decoding_tick = cfg->g_timebase.num;
+    dec_model_cfg->timing_info.equal_picture_interval = 0;
+    dec_model_cfg->decoder_model_info_present_flag = 1;
+    dec_model_cfg->display_model_info_present_flag = 1;
   }
-  if (oxcf->init_framerate > 180) {
-    oxcf->init_framerate = 30;
-    oxcf->timing_info_present = 0;
-  }
-  oxcf->encoder_cfg = &cfg->encoder_cfg;
 
   switch (cfg->g_pass) {
     case AOM_RC_ONE_PASS: oxcf->pass = 0; break;
@@ -750,285 +805,303 @@
     case AOM_RC_LAST_PASS: oxcf->pass = 2; break;
   }
 
-  oxcf->lag_in_frames = clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS);
-  oxcf->rc_mode = cfg->rc_end_usage;
-
-  // Convert target bandwidth from Kbit/s to Bit/s
-  oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
-  oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
-  oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
-  oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
-
-  oxcf->best_allowed_q =
+  // Set Rate Control configuration.
+  rc_cfg->max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+  rc_cfg->max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+  rc_cfg->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
+  rc_cfg->mode = cfg->rc_end_usage;
+  rc_cfg->min_cr = extra_cfg->min_cr;
+  rc_cfg->best_allowed_q =
       extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer);
-  oxcf->worst_allowed_q =
+  rc_cfg->worst_allowed_q =
       extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer);
-  oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
-  oxcf->fixed_q = -1;
+  rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
+  rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct;
+  rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct;
+  rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
+  rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+  rc_cfg->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+  // Convert target bandwidth from Kbit/s to Bit/s
+  rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+  rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh;
+  rc_cfg->vbr_corpus_complexity_lap = extra_cfg->vbr_corpus_complexity_lap;
+  rc_cfg->vbrbias = cfg->rc_2pass_vbr_bias_pct;
+  rc_cfg->vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
+  rc_cfg->vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
 
-  oxcf->enable_cdef = extra_cfg->enable_cdef;
-  oxcf->enable_restoration =
+  // Set Toolset related configuration.
+  tool_cfg->bit_depth = cfg->g_bit_depth;
+  tool_cfg->enable_cdef = extra_cfg->enable_cdef;
+  tool_cfg->enable_restoration =
       (cfg->g_usage == AOM_USAGE_REALTIME) ? 0 : extra_cfg->enable_restoration;
-  oxcf->force_video_mode = extra_cfg->force_video_mode;
-  oxcf->enable_obmc = extra_cfg->enable_obmc;
-  oxcf->enable_overlay = extra_cfg->enable_overlay;
-  oxcf->enable_palette = extra_cfg->enable_palette;
-  oxcf->enable_intrabc = extra_cfg->enable_intrabc;
-  oxcf->enable_angle_delta = extra_cfg->enable_angle_delta;
-  oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant;
-  oxcf->allow_ref_frame_mvs = extra_cfg->enable_ref_frame_mvs;
-  oxcf->using_qm = extra_cfg->enable_qm;
-  oxcf->qm_y = extra_cfg->qm_y;
-  oxcf->qm_u = extra_cfg->qm_u;
-  oxcf->qm_v = extra_cfg->qm_v;
-  oxcf->qm_minlevel = extra_cfg->qm_min;
-  oxcf->qm_maxlevel = extra_cfg->qm_max;
-  oxcf->reduced_tx_type_set = extra_cfg->reduced_tx_type_set;
-  oxcf->use_intra_dct_only = extra_cfg->use_intra_dct_only;
-  oxcf->use_inter_dct_only = extra_cfg->use_inter_dct_only;
-  oxcf->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only;
-  oxcf->quant_b_adapt = extra_cfg->quant_b_adapt;
-  oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
-  oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
-  oxcf->mv_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq;
-  oxcf->num_tile_groups = extra_cfg->num_tg;
-  // In large-scale tile encoding mode, num_tile_groups is always 1.
-  if (cfg->large_scale_tile) oxcf->num_tile_groups = 1;
-  oxcf->mtu = extra_cfg->mtu_size;
-
+  tool_cfg->force_video_mode = extra_cfg->force_video_mode;
+  tool_cfg->enable_palette = extra_cfg->enable_palette;
   // FIXME(debargha): Should this be:
-  // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs &
-  //                             extra_cfg->enable_order_hint ?
+  // tool_cfg->enable_ref_frame_mvs  = extra_cfg->allow_ref_frame_mvs &
+  //                                         extra_cfg->enable_order_hint ?
   // Disallow using temporal MVs while large_scale_tile = 1.
-  oxcf->allow_ref_frame_mvs =
+  tool_cfg->enable_ref_frame_mvs =
       extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile;
-  oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
-  oxcf->over_shoot_pct = cfg->rc_overshoot_pct;
+  tool_cfg->superblock_size = extra_cfg->superblock_size;
+  tool_cfg->enable_monochrome = cfg->monochrome;
+  tool_cfg->full_still_picture_hdr = cfg->full_still_picture_hdr;
+  tool_cfg->enable_dual_filter = extra_cfg->enable_dual_filter;
+  tool_cfg->enable_order_hint = extra_cfg->enable_order_hint;
+  tool_cfg->enable_interintra_comp = extra_cfg->enable_interintra_comp;
+  tool_cfg->ref_frame_mvs_present =
+      extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
+  tool_cfg->enable_global_motion = extra_cfg->enable_global_motion;
+  tool_cfg->error_resilient_mode =
+      cfg->g_error_resilient | extra_cfg->error_resilient_mode;
+  tool_cfg->frame_parallel_decoding_mode =
+      extra_cfg->frame_parallel_decoding_mode;
 
-  oxcf->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode;
-  oxcf->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator;
-  oxcf->resize_kf_scale_denominator = (uint8_t)cfg->rc_resize_kf_denominator;
-  if (oxcf->resize_mode == RESIZE_FIXED &&
-      oxcf->resize_scale_denominator == SCALE_NUMERATOR &&
-      oxcf->resize_kf_scale_denominator == SCALE_NUMERATOR)
-    oxcf->resize_mode = RESIZE_NONE;
-
-  if (extra_cfg->lossless || cfg->large_scale_tile) {
-    disable_superres(oxcf);
-  } else {
-    oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode;
-    oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator;
-    oxcf->superres_kf_scale_denominator =
-        (uint8_t)cfg->rc_superres_kf_denominator;
-    oxcf->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
-    oxcf->superres_kf_qthresh =
-        av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
-    if (oxcf->superres_mode == SUPERRES_FIXED &&
-        oxcf->superres_scale_denominator == SCALE_NUMERATOR &&
-        oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) {
-      disable_superres(oxcf);
-    }
-    if (oxcf->superres_mode == SUPERRES_QTHRESH &&
-        oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) {
-      disable_superres(oxcf);
+  // Set Quantization related configuration.
+  q_cfg->using_qm = extra_cfg->enable_qm;
+  q_cfg->qm_minlevel = extra_cfg->qm_min;
+  q_cfg->qm_maxlevel = extra_cfg->qm_max;
+  q_cfg->quant_b_adapt = extra_cfg->quant_b_adapt;
+  q_cfg->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq;
+  q_cfg->aq_mode = extra_cfg->aq_mode;
+  q_cfg->deltaq_mode = extra_cfg->deltaq_mode;
+  q_cfg->use_fixed_qp_offsets =
+      cfg->use_fixed_qp_offsets && (rc_cfg->mode == AOM_Q);
+  for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
+    if (q_cfg->use_fixed_qp_offsets) {
+      if (cfg->fixed_qp_offsets[i] >= 0) {  // user-provided qp offset
+        q_cfg->fixed_qp_offsets[i] = convert_qp_offset(
+            rc_cfg->cq_level, cfg->fixed_qp_offsets[i], tool_cfg->bit_depth);
+      } else {  // auto-selected qp offset
+        q_cfg->fixed_qp_offsets[i] =
+            get_modeled_qp_offset(rc_cfg->cq_level, i, tool_cfg->bit_depth);
+      }
+    } else {
+      q_cfg->fixed_qp_offsets[i] = -1.0;
     }
   }
 
-  oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
-  oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
-  oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+  tool_cfg->enable_deltalf_mode =
+      (q_cfg->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode;
 
-  oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh;
+  // Set cost update frequency configuration.
+  oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
+  oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
+  oxcf->cost_upd_freq.mv = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq;
 
-  oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct;
-  oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
-  oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
+  // Set frame resize mode configuration.
+  resize_cfg->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode;
+  resize_cfg->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator;
+  resize_cfg->resize_kf_scale_denominator =
+      (uint8_t)cfg->rc_resize_kf_denominator;
+  if (resize_cfg->resize_mode == RESIZE_FIXED &&
+      resize_cfg->resize_scale_denominator == SCALE_NUMERATOR &&
+      resize_cfg->resize_kf_scale_denominator == SCALE_NUMERATOR)
+    resize_cfg->resize_mode = RESIZE_NONE;
 
-  oxcf->auto_key =
+  // Set encoder algorithm related configuration.
+  algo_cfg->enable_overlay = extra_cfg->enable_overlay;
+  algo_cfg->disable_trellis_quant = extra_cfg->disable_trellis_quant;
+  algo_cfg->sharpness = extra_cfg->sharpness;
+  algo_cfg->arnr_max_frames = extra_cfg->arnr_max_frames;
+  algo_cfg->arnr_strength = extra_cfg->arnr_strength;
+  algo_cfg->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
+  // TODO(any): Fix and Enable TPL for resize-mode > 0
+  algo_cfg->enable_tpl_model =
+      resize_cfg->resize_mode ? 0 : extra_cfg->enable_tpl_model;
+
+  // Set two-pass stats configuration.
+  oxcf->twopass_stats_in = cfg->rc_twopass_stats_in;
+
+  // Set Key frame configuration.
+  kf_cfg->fwd_kf_enabled = cfg->fwd_kf_enabled;
+  kf_cfg->auto_key =
       cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
+  kf_cfg->key_freq_min = cfg->kf_min_dist;
+  kf_cfg->key_freq_max = cfg->kf_max_dist;
+  kf_cfg->sframe_dist = cfg->sframe_dist;
+  kf_cfg->sframe_mode = cfg->sframe_mode;
+  kf_cfg->enable_sframe = extra_cfg->s_frame_mode;
+  kf_cfg->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
+  kf_cfg->enable_intrabc = extra_cfg->enable_intrabc;
 
-  oxcf->key_freq = cfg->kf_max_dist;
-  oxcf->sframe_dist = cfg->sframe_dist;
-  oxcf->sframe_mode = cfg->sframe_mode;
-  oxcf->sframe_enabled = cfg->sframe_dist != 0;
   oxcf->speed = extra_cfg->cpu_used;
-  oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
-  oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
-  oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
-  oxcf->sharpness = extra_cfg->sharpness;
 
-  oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
+  // Set Color related configuration.
+  color_cfg->color_primaries = extra_cfg->color_primaries;
+  color_cfg->transfer_characteristics = extra_cfg->transfer_characteristics;
+  color_cfg->matrix_coefficients = extra_cfg->matrix_coefficients;
+  color_cfg->color_range = extra_cfg->color_range;
+  color_cfg->chroma_sample_position = extra_cfg->chroma_sample_position;
 
-  oxcf->color_primaries = extra_cfg->color_primaries;
-  oxcf->transfer_characteristics = extra_cfg->transfer_characteristics;
-  oxcf->matrix_coefficients = extra_cfg->matrix_coefficients;
-  oxcf->chroma_sample_position = extra_cfg->chroma_sample_position;
+  // Set Group of frames configuration.
+  gf_cfg->lag_in_frames = clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS);
+  gf_cfg->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
+  gf_cfg->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
+  gf_cfg->min_gf_interval = extra_cfg->min_gf_interval;
+  gf_cfg->max_gf_interval = extra_cfg->max_gf_interval;
+  gf_cfg->gf_min_pyr_height = extra_cfg->gf_min_pyr_height;
+  gf_cfg->gf_max_pyr_height = extra_cfg->gf_max_pyr_height;
 
-  oxcf->color_range = extra_cfg->color_range;
-  oxcf->render_width = extra_cfg->render_width;
-  oxcf->render_height = extra_cfg->render_height;
-  oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
-  oxcf->arnr_strength = extra_cfg->arnr_strength;
-  oxcf->min_gf_interval = extra_cfg->min_gf_interval;
-  oxcf->max_gf_interval = extra_cfg->max_gf_interval;
-  oxcf->gf_min_pyr_height = extra_cfg->gf_min_pyr_height;
-  oxcf->gf_max_pyr_height = extra_cfg->gf_max_pyr_height;
-
-  oxcf->tuning = extra_cfg->tuning;
-  oxcf->vmaf_model_path = extra_cfg->vmaf_model_path;
-  oxcf->content = extra_cfg->content;
-  oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
-  oxcf->superblock_size = extra_cfg->superblock_size;
+  // Set tune related configuration.
+  tune_cfg->tuning = extra_cfg->tuning;
+  tune_cfg->vmaf_model_path = extra_cfg->vmaf_model_path;
+  tune_cfg->content = extra_cfg->content;
   if (cfg->large_scale_tile) {
-    oxcf->film_grain_test_vector = 0;
-    oxcf->film_grain_table_filename = NULL;
+    tune_cfg->film_grain_test_vector = 0;
+    tune_cfg->film_grain_table_filename = NULL;
   } else {
-    oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector;
-    oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename;
+    tune_cfg->film_grain_test_vector = extra_cfg->film_grain_test_vector;
+    tune_cfg->film_grain_table_filename = extra_cfg->film_grain_table_filename;
   }
 #if CONFIG_DENOISE
   oxcf->noise_level = extra_cfg->noise_level;
   oxcf->noise_block_size = extra_cfg->noise_block_size;
 #endif
-  oxcf->large_scale_tile = cfg->large_scale_tile;
-  oxcf->single_tile_decoding =
-      (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
-  if (oxcf->large_scale_tile) {
+
+  // Set Tile related configuration.
+  tile_cfg->num_tile_groups = extra_cfg->num_tg;
+  // In large-scale tile encoding mode, num_tile_groups is always 1.
+  if (cfg->large_scale_tile) tile_cfg->num_tile_groups = 1;
+  tile_cfg->mtu = extra_cfg->mtu_size;
+  tile_cfg->enable_large_scale_tile = cfg->large_scale_tile;
+  tile_cfg->enable_single_tile_decoding =
+      (tile_cfg->enable_large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
+  tile_cfg->tile_columns = extra_cfg->tile_columns;
+  tile_cfg->tile_rows = extra_cfg->tile_rows;
+  tile_cfg->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
+  tile_cfg->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
+  for (int i = 0; i < tile_cfg->tile_width_count; i++) {
+    tile_cfg->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1);
+  }
+  for (int i = 0; i < tile_cfg->tile_height_count; i++) {
+    tile_cfg->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1);
+  }
+  tile_cfg->enable_ext_tile_debug = extra_cfg->ext_tile_debug;
+
+  if (tile_cfg->enable_large_scale_tile) {
     // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or
-    // AOM_SUPERBLOCK_SIZE_128X128 while oxcf->large_scale_tile = 1. If
-    // superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to
+    // AOM_SUPERBLOCK_SIZE_128X128 while tile_cfg->enable_large_scale_tile = 1.
+    // If superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to
     // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile).
     if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 &&
         extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128)
-      oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+      tool_cfg->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
   }
 
+  // Set reference frame related configuration.
+  oxcf->ref_frm_cfg.max_reference_frames = extra_cfg->max_reference_frames;
+  oxcf->ref_frm_cfg.enable_reduced_reference_set =
+      extra_cfg->enable_reduced_reference_set;
+  oxcf->ref_frm_cfg.enable_onesided_comp = extra_cfg->enable_onesided_comp;
+
   oxcf->row_mt = extra_cfg->row_mt;
 
-  oxcf->tile_columns = extra_cfg->tile_columns;
-  oxcf->tile_rows = extra_cfg->tile_rows;
-
-  oxcf->monochrome = cfg->monochrome;
-  oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr;
-  oxcf->enable_dual_filter = extra_cfg->enable_dual_filter;
-  oxcf->enable_rect_partitions = extra_cfg->enable_rect_partitions;
-  oxcf->enable_ab_partitions = extra_cfg->enable_ab_partitions;
-  oxcf->enable_1to4_partitions = extra_cfg->enable_1to4_partitions;
-  oxcf->min_partition_size = extra_cfg->min_partition_size;
-  oxcf->max_partition_size = extra_cfg->max_partition_size;
-  oxcf->enable_intra_edge_filter = extra_cfg->enable_intra_edge_filter;
-  oxcf->enable_tx64 = extra_cfg->enable_tx64;
-  oxcf->enable_flip_idtx = extra_cfg->enable_flip_idtx;
-  oxcf->enable_order_hint = extra_cfg->enable_order_hint;
-  oxcf->enable_dist_wtd_comp =
-      extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint;
-  oxcf->max_reference_frames = extra_cfg->max_reference_frames;
-  oxcf->enable_reduced_reference_set = extra_cfg->enable_reduced_reference_set;
-  oxcf->enable_masked_comp = extra_cfg->enable_masked_comp;
-  oxcf->enable_onesided_comp = extra_cfg->enable_onesided_comp;
-  oxcf->enable_diff_wtd_comp =
-      extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp;
-  oxcf->enable_interinter_wedge =
-      extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge;
-  oxcf->enable_interintra_comp = extra_cfg->enable_interintra_comp;
-  oxcf->enable_smooth_interintra =
-      extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra;
-  oxcf->enable_interintra_wedge =
-      extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge;
-  oxcf->enable_ref_frame_mvs =
-      extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
-
-  oxcf->enable_global_motion = extra_cfg->enable_global_motion;
-  oxcf->enable_warped_motion = extra_cfg->enable_warped_motion;
-  oxcf->allow_warped_motion =
+  // Set motion mode related configuration.
+  oxcf->motion_mode_cfg.enable_obmc = extra_cfg->enable_obmc;
+  oxcf->motion_mode_cfg.enable_warped_motion = extra_cfg->enable_warped_motion;
+  oxcf->motion_mode_cfg.allow_warped_motion =
       (cfg->g_usage == AOM_USAGE_REALTIME)
-          ? 0
+          ? false
           : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
-  oxcf->enable_filter_intra = extra_cfg->enable_filter_intra;
-  oxcf->enable_smooth_intra = extra_cfg->enable_smooth_intra;
-  oxcf->enable_paeth_intra = extra_cfg->enable_paeth_intra;
-  oxcf->enable_cfl_intra = extra_cfg->enable_cfl_intra;
 
-  oxcf->enable_superres =
-      (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres;
-  if (!oxcf->enable_superres) {
-    disable_superres(oxcf);
-  }
+  // Set partition related configuration.
+  part_cfg->enable_rect_partitions = extra_cfg->enable_rect_partitions;
+  part_cfg->enable_ab_partitions = extra_cfg->enable_ab_partitions;
+  part_cfg->enable_1to4_partitions = extra_cfg->enable_1to4_partitions;
+  part_cfg->min_partition_size = extra_cfg->min_partition_size;
+  part_cfg->max_partition_size = extra_cfg->max_partition_size;
 
-  oxcf->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
-  oxcf->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
-  for (int i = 0; i < oxcf->tile_width_count; i++) {
-    oxcf->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1);
-  }
-  for (int i = 0; i < oxcf->tile_height_count; i++) {
-    oxcf->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1);
-  }
-  oxcf->error_resilient_mode =
-      cfg->g_error_resilient | extra_cfg->error_resilient_mode;
-  oxcf->s_frame_mode = extra_cfg->s_frame_mode;
-  oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
-  if (cfg->g_pass == AOM_RC_LAST_PASS) {
-    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
-    oxcf->limit = n_packets - 1;
+  // Set intra mode configuration.
+  intra_mode_cfg->enable_angle_delta = extra_cfg->enable_angle_delta;
+  intra_mode_cfg->enable_intra_edge_filter =
+      extra_cfg->enable_intra_edge_filter;
+  intra_mode_cfg->enable_filter_intra = extra_cfg->enable_filter_intra;
+  intra_mode_cfg->enable_smooth_intra = extra_cfg->enable_smooth_intra;
+  intra_mode_cfg->enable_paeth_intra = extra_cfg->enable_paeth_intra;
+  intra_mode_cfg->enable_cfl_intra = extra_cfg->enable_cfl_intra;
+
+  // Set transform size/type configuration.
+  txfm_cfg->enable_tx64 = extra_cfg->enable_tx64;
+  txfm_cfg->enable_flip_idtx = extra_cfg->enable_flip_idtx;
+  txfm_cfg->enable_rect_tx = extra_cfg->enable_rect_tx;
+  txfm_cfg->reduced_tx_type_set = extra_cfg->reduced_tx_type_set;
+  txfm_cfg->use_intra_dct_only = extra_cfg->use_intra_dct_only;
+  txfm_cfg->use_inter_dct_only = extra_cfg->use_inter_dct_only;
+  txfm_cfg->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only;
+
+  // Set compound type configuration.
+  comp_type_cfg->enable_dist_wtd_comp =
+      extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint;
+  comp_type_cfg->enable_masked_comp = extra_cfg->enable_masked_comp;
+  comp_type_cfg->enable_diff_wtd_comp =
+      extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp;
+  comp_type_cfg->enable_interinter_wedge =
+      extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge;
+  comp_type_cfg->enable_smooth_interintra =
+      extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra;
+  comp_type_cfg->enable_interintra_wedge =
+      extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge;
+
+  // Set Super-resolution mode configuration.
+  if (extra_cfg->lossless || cfg->large_scale_tile) {
+    disable_superres(superres_cfg);
   } else {
-    oxcf->limit = cfg->g_limit;
+    superres_cfg->superres_mode = cfg->rc_superres_mode;
+    superres_cfg->superres_scale_denominator =
+        (uint8_t)cfg->rc_superres_denominator;
+    superres_cfg->superres_kf_scale_denominator =
+        (uint8_t)cfg->rc_superres_kf_denominator;
+    superres_cfg->superres_qthresh =
+        av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
+    superres_cfg->superres_kf_qthresh =
+        av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
+    if (superres_cfg->superres_mode == AOM_SUPERRES_FIXED &&
+        superres_cfg->superres_scale_denominator == SCALE_NUMERATOR &&
+        superres_cfg->superres_kf_scale_denominator == SCALE_NUMERATOR) {
+      disable_superres(superres_cfg);
+    }
+    if (superres_cfg->superres_mode == AOM_SUPERRES_QTHRESH &&
+        superres_cfg->superres_qthresh == 255 &&
+        superres_cfg->superres_kf_qthresh == 255) {
+      disable_superres(superres_cfg);
+    }
   }
 
-  if (oxcf->limit == 1) {
+  superres_cfg->enable_superres =
+      (superres_cfg->superres_mode != AOM_SUPERRES_NONE) &&
+      extra_cfg->enable_superres;
+  if (!superres_cfg->enable_superres) {
+    disable_superres(superres_cfg);
+  }
+
+  if (input_cfg->limit == 1) {
     // still picture mode, display model and timing is meaningless
-    oxcf->display_model_info_present_flag = 0;
-    oxcf->timing_info_present = 0;
+    dec_model_cfg->display_model_info_present_flag = 0;
+    dec_model_cfg->timing_info_present = 0;
   }
 
-  oxcf->enable_tpl_model = extra_cfg->enable_tpl_model;
-  oxcf->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
-
-  oxcf->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq;
-  oxcf->aq_mode = extra_cfg->aq_mode;
-  oxcf->deltaq_mode = extra_cfg->deltaq_mode;
-
-  oxcf->deltalf_mode =
-      (oxcf->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode;
-
   oxcf->save_as_annexb = cfg->save_as_annexb;
 
-  oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
-  oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
-  oxcf->sb_multipass_unit_test = extra_cfg->sb_multipass_unit_test;
-  oxcf->ext_tile_debug = extra_cfg->ext_tile_debug;
+  // Set unit test related configuration.
+  oxcf->unit_test_cfg.motion_vector_unit_test =
+      extra_cfg->motion_vector_unit_test;
+  oxcf->unit_test_cfg.sb_multipass_unit_test =
+      extra_cfg->sb_multipass_unit_test;
 
-  oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
-  oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
-  oxcf->border_in_pixels = (oxcf->resize_mode || oxcf->superres_mode)
-                               ? AOM_BORDER_IN_PIXELS
-                               : AOM_ENC_NO_SCALE_BORDER;
+  oxcf->border_in_pixels =
+      (resize_cfg->resize_mode || superres_cfg->superres_mode)
+          ? AOM_BORDER_IN_PIXELS
+          : AOM_ENC_NO_SCALE_BORDER;
   memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx,
          sizeof(oxcf->target_seq_level_idx));
   oxcf->tier_mask = extra_cfg->tier_mask;
 
-  oxcf->use_fixed_qp_offsets =
-      cfg->use_fixed_qp_offsets && (oxcf->rc_mode == AOM_Q);
-  for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
-    if (oxcf->use_fixed_qp_offsets) {
-      if (cfg->fixed_qp_offsets[i] >= 0) {  // user-provided qp offset
-        oxcf->fixed_qp_offsets[i] = convert_qp_offset(
-            oxcf->cq_level, cfg->fixed_qp_offsets[i], oxcf->bit_depth);
-      } else {  // auto-selected qp offset
-        oxcf->fixed_qp_offsets[i] =
-            get_modeled_qp_offset(oxcf->cq_level, i, oxcf->bit_depth);
-      }
-    } else {
-      oxcf->fixed_qp_offsets[i] = -1.0;
-    }
-  }
-
-  oxcf->min_cr = extra_cfg->min_cr;
   return AOM_CODEC_OK;
 }
 
 static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
                                           const aom_codec_enc_cfg_t *cfg) {
+  InitialDimensions *const initial_dimensions = &ctx->cpi->initial_dimensions;
   aom_codec_err_t res;
   int force_key = 0;
 
@@ -1036,8 +1109,10 @@
     if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS)
       ERROR("Cannot change width or height after initialization");
     if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
-        (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
-        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+        (initial_dimensions->width &&
+         (int)cfg->g_w > initial_dimensions->width) ||
+        (initial_dimensions->height &&
+         (int)cfg->g_h > initial_dimensions->height))
       force_key = 1;
   }
 
@@ -1060,6 +1135,9 @@
     // On profile change, request a key frame
     force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile;
     av1_change_config(ctx->cpi, &ctx->oxcf);
+    if (ctx->cpi_lap != NULL) {
+      av1_change_config(ctx->cpi_lap, &ctx->oxcf);
+    }
   }
 
   if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF;
@@ -1087,6 +1165,14 @@
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  *arg = ctx->cpi->rc.baseline_gf_interval;
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
                                         struct av1_extracfg *extra_cfg) {
   const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
@@ -1094,6 +1180,9 @@
     ctx->extra_cfg = *extra_cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
     av1_change_config(ctx->cpi, &ctx->oxcf);
+    if (ctx->cpi_lap != NULL) {
+      av1_change_config(ctx->cpi_lap, &ctx->oxcf);
+    }
   }
   return res;
 }
@@ -1403,6 +1492,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_rect_tx(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_rect_tx = CAST(AV1E_SET_ENABLE_RECT_TX, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1648,6 +1744,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_vbr_corpus_complexity_lap(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.vbr_corpus_complexity_lap =
+      CAST(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
 static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx,
                                                     va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1900,14 +2003,14 @@
       reduce_ratio(&priv->timestamp_ratio);
 
       set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
-      if (priv->oxcf.rc_mode == AOM_Q && priv->oxcf.pass == 0 &&
+      if (priv->oxcf.rc_cfg.mode != AOM_CBR && priv->oxcf.pass == 0 &&
           priv->oxcf.mode == GOOD) {
-        // Enable look ahead
+        // Enable look ahead - enabled for AOM_Q, AOM_CQ, AOM_VBR
         *num_lap_buffers = priv->cfg.g_lag_in_frames;
         *num_lap_buffers =
-            clamp(*num_lap_buffers, 1,
-                  AOMMIN(MAX_LAP_BUFFERS,
-                         priv->oxcf.key_freq + SCENE_CUT_KEY_TEST_INTERVAL));
+            clamp(*num_lap_buffers, 0,
+                  AOMMIN(MAX_LAP_BUFFERS, priv->oxcf.kf_cfg.key_freq_max +
+                                              SCENE_CUT_KEY_TEST_INTERVAL));
         if ((int)priv->cfg.g_lag_in_frames - (*num_lap_buffers) >=
             LAP_LAG_IN_FRAMES) {
           lap_lag_in_frames = LAP_LAG_IN_FRAMES;
@@ -1974,9 +2077,15 @@
 
 static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
                                                    unsigned int lib_flags) {
+  const SVC *const svc = &cpi->svc;
   aom_codec_frame_flags_t flags = lib_flags << 16;
 
-  if (lib_flags & FRAMEFLAGS_KEY) flags |= AOM_FRAME_IS_KEY;
+  if (lib_flags & FRAMEFLAGS_KEY ||
+      (cpi->use_svc &&
+       svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+                          svc->temporal_layer_id]
+           .is_key_frame))
+    flags |= AOM_FRAME_IS_KEY;
   if (lib_flags & FRAMEFLAGS_INTRAONLY) flags |= AOM_FRAME_IS_INTRAONLY;
   if (lib_flags & FRAMEFLAGS_SWITCH) flags |= AOM_FRAME_IS_SWITCH;
   if (lib_flags & FRAMEFLAGS_ERROR_RESILIENT)
@@ -2029,12 +2138,6 @@
     av1_change_config(ctx->cpi, &ctx->oxcf);
   }
 
-  if (!ctx->pts_offset_initialized) {
-    ctx->pts_offset = ptsvol;
-    ctx->pts_offset_initialized = 1;
-  }
-  ptsvol -= ctx->pts_offset;
-
   aom_codec_pkt_list_init(&ctx->pkt_list);
 
   volatile aom_enc_frame_flags_t flags = enc_flags;
@@ -2067,6 +2170,11 @@
     av1_apply_encoding_flags(cpi_lap, flags);
   }
 
+#if CONFIG_USE_VMAF_RC
+  aom_init_vmaf_model_rc(&cpi->vmaf_info.vmaf_model,
+                         cpi->oxcf.tune_cfg.vmaf_model_path);
+#endif
+
   // Handle fixed keyframe intervals
   if (is_stat_generation_stage(cpi)) {
     if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
@@ -2080,29 +2188,40 @@
   }
 
   if (res == AOM_CODEC_OK) {
-    int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, ptsvol);
-    int64_t dst_end_time_stamp =
-        timebase_units_to_ticks(timestamp_ratio, ptsvol + duration);
-
     // Set up internal flags
     if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
 
     if (img != NULL) {
+      if (!ctx->pts_offset_initialized) {
+        ctx->pts_offset = ptsvol;
+        ctx->pts_offset_initialized = 1;
+      }
+      ptsvol -= ctx->pts_offset;
+      int64_t src_time_stamp = timebase_units_to_ticks(timestamp_ratio, ptsvol);
+      int64_t src_end_time_stamp =
+          timebase_units_to_ticks(timestamp_ratio, ptsvol + duration);
+
       YV12_BUFFER_CONFIG sd;
-      int use_highbitdepth, subsampling_x, subsampling_y;
       res = image2yuvconfig(img, &sd);
-      use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
-      subsampling_x = sd.subsampling_x;
-      subsampling_y = sd.subsampling_y;
+      // When generating a monochrome stream, make |sd| a monochrome image.
+      if (ctx->cfg.monochrome) {
+        sd.u_buffer = sd.v_buffer = NULL;
+        sd.uv_stride = 0;
+        sd.monochrome = 1;
+      }
+      int use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+      int subsampling_x = sd.subsampling_x;
+      int subsampling_y = sd.subsampling_y;
 
       if (!cpi->lookahead) {
-        int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.lag_in_frames
-                                            : cpi->oxcf.lag_in_frames;
+        int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.gf_cfg.lag_in_frames
+                                            : cpi->oxcf.gf_cfg.lag_in_frames;
 
         cpi->lookahead = av1_lookahead_init(
-            cpi->oxcf.width, cpi->oxcf.height, subsampling_x, subsampling_y,
-            use_highbitdepth, lag_in_frames, cpi->oxcf.border_in_pixels,
-            cpi->common.features.byte_alignment, ctx->num_lap_buffers);
+            cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
+            subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames,
+            cpi->oxcf.border_in_pixels, cpi->common.features.byte_alignment,
+            ctx->num_lap_buffers);
       }
       if (!cpi->lookahead)
         aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR,
@@ -2119,7 +2238,7 @@
       // Store the original flags in to the frame buffer. Will extract the
       // key frame flag when we actually encode this frame.
       if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
-                                dst_time_stamp, dst_end_time_stamp)) {
+                                src_time_stamp, src_end_time_stamp)) {
         res = update_error_state(ctx, &cpi->common.error);
       }
       ctx->next_frame_flags = 0;
@@ -2150,17 +2269,31 @@
     unsigned int lib_flags = 0;
     int is_frame_visible = 0;
     int index_size = 0;
-    int has_fwd_keyframe = 0;
+    int has_no_show_keyframe = 0;
+    int num_workers = 0;
+
+    if (cpi->oxcf.pass == 1) {
+#if !CONFIG_REALTIME_ONLY
+      num_workers = av1_fp_compute_num_enc_workers(cpi);
+#endif
+    } else {
+      num_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+    }
+    if ((num_workers > 1) && (cpi->mt_info.num_workers == 0))
+      av1_create_workers(cpi, num_workers);
 
     // Call for LAP stage
     if (cpi_lap != NULL) {
-      int status;
-      aom_rational64_t timestamp_ratio_la = *timestamp_ratio;
-      int64_t dst_time_stamp_la = dst_time_stamp;
-      int64_t dst_end_time_stamp_la = dst_end_time_stamp;
-      status = av1_get_compressed_data(
+      int64_t dst_time_stamp_la;
+      int64_t dst_end_time_stamp_la;
+      if (cpi_lap->mt_info.workers == NULL) {
+        cpi_lap->mt_info.workers = cpi->mt_info.workers;
+        cpi_lap->mt_info.tile_thr_data = cpi->mt_info.tile_thr_data;
+      }
+      cpi_lap->mt_info.num_workers = cpi->mt_info.num_workers;
+      const int status = av1_get_compressed_data(
           cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la,
-          &dst_end_time_stamp_la, !img, &timestamp_ratio_la);
+          &dst_end_time_stamp_la, !img, timestamp_ratio);
       if (status != -1) {
         if (status != AOM_CODEC_OK) {
           aom_internal_error(&cpi_lap->common.error, AOM_CODEC_ERROR, NULL);
@@ -2171,7 +2304,10 @@
       frame_size = 0;
     }
 
-    // invisible frames get packed with the next visible frame
+    // Get the next visible frame. Invisible frames get packed with the next
+    // visible frame.
+    int64_t dst_time_stamp;
+    int64_t dst_end_time_stamp;
     while (cx_data_sz - index_size >= ctx->cx_data_sz / 2 &&
            !is_frame_visible) {
       const int status = av1_get_compressed_data(
@@ -2245,14 +2381,17 @@
 
         is_frame_visible = cpi->common.show_frame;
 
-        has_fwd_keyframe |= (!is_frame_visible &&
-                             cpi->common.current_frame.frame_type == KEY_FRAME);
+        has_no_show_keyframe |=
+            (!is_frame_visible &&
+             cpi->common.current_frame.frame_type == KEY_FRAME);
       }
     }
     if (is_frame_visible) {
       // Add the frame packet to the list of returned packets.
       aom_codec_cx_pkt_t pkt;
 
+      // decrement frames_left counter
+      cpi->frames_left = AOMMAX(0, cpi->frames_left - 1);
       if (ctx->oxcf.save_as_annexb) {
         //  B_PRIME (add TU size)
         size_t tu_size = ctx->pending_cx_data_sz;
@@ -2280,7 +2419,7 @@
           ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
           ctx->pts_offset;
       pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
-      if (has_fwd_keyframe) {
+      if (has_no_show_keyframe) {
         // If one of the invisible frames in the packet is a keyframe, set
         // the delayed random access point flag.
         pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
@@ -2490,19 +2629,18 @@
 static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
   AV1_COMP *const cpi = ctx->cpi;
+  AV1_COMMON *const cm = &cpi->common;
   aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
-  cpi->common.number_spatial_layers = params->number_spatial_layers;
-  cpi->common.number_temporal_layers = params->number_temporal_layers;
+  cm->number_spatial_layers = params->number_spatial_layers;
+  cm->number_temporal_layers = params->number_temporal_layers;
   cpi->svc.number_spatial_layers = params->number_spatial_layers;
   cpi->svc.number_temporal_layers = params->number_temporal_layers;
-  if (cpi->common.number_spatial_layers > 1 ||
-      cpi->common.number_temporal_layers > 1) {
+  if (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1) {
     unsigned int sl, tl;
     cpi->use_svc = 1;
-    for (sl = 0; sl < cpi->common.number_spatial_layers; ++sl) {
-      for (tl = 0; tl < cpi->common.number_temporal_layers; ++tl) {
-        const int layer =
-            LAYER_IDS_TO_IDX(sl, tl, cpi->common.number_temporal_layers);
+    for (sl = 0; sl < cm->number_spatial_layers; ++sl) {
+      for (tl = 0; tl < cm->number_temporal_layers; ++tl) {
+        const int layer = LAYER_IDS_TO_IDX(sl, tl, cm->number_temporal_layers);
         LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
         lc->max_q = params->max_quantizers[layer];
         lc->min_q = params->min_quantizers[layer];
@@ -2512,10 +2650,17 @@
         lc->framerate_factor = params->framerate_factor[tl];
       }
     }
-    if (cpi->common.current_frame.frame_number == 0)
+    if (cm->current_frame.frame_number == 0) {
+      if (!cpi->seq_params_locked) {
+        SequenceHeader *const seq_params = &cm->seq_params;
+        seq_params->operating_points_cnt_minus_1 =
+            cm->number_spatial_layers * cm->number_temporal_layers - 1;
+        av1_init_seq_coding_tools(&cm->seq_params, cm, &cpi->oxcf, 1);
+      }
       av1_init_layer_context(cpi);
-    else
-      av1_update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+    }
+    av1_update_layer_context_change_config(cpi,
+                                           cpi->oxcf.rc_cfg.target_bandwidth);
   }
   return AOM_CODEC_OK;
 }
@@ -2682,6 +2827,7 @@
   { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
   { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
   { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx },
+  { AV1E_SET_ENABLE_RECT_TX, ctrl_set_enable_rect_tx },
   { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
   { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
   { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set },
@@ -2746,6 +2892,7 @@
   { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id },
   { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params },
   { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
+  { AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap },
   { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test },
 
   // Getters
@@ -2758,7 +2905,9 @@
   { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
   { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
   { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx },
-  { -1, NULL },
+  { AV1E_GET_BASELINE_GF_INTERVAL, ctrl_get_baseline_gf_interval },
+
+  CTRL_MAP_END,
 };
 
 static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
@@ -2768,8 +2917,8 @@
       0,                       // g_threads
       0,                       // g_profile
 
-      320,         // g_width
-      240,         // g_height
+      320,         // g_w
+      240,         // g_h
       0,           // g_limit
       0,           // g_forced_max_frame_width
       0,           // g_forced_max_frame_height
@@ -2789,11 +2938,11 @@
       SCALE_NUMERATOR,  // rc_resize_denominator
       SCALE_NUMERATOR,  // rc_resize_kf_denominator
 
-      SUPERRES_NONE,    // rc_superres_mode
-      SCALE_NUMERATOR,  // rc_superres_denominator
-      SCALE_NUMERATOR,  // rc_superres_kf_denominator
-      63,               // rc_superres_qthresh
-      32,               // rc_superres_kf_qthresh
+      AOM_SUPERRES_NONE,  // rc_superres_mode
+      SCALE_NUMERATOR,    // rc_superres_denominator
+      SCALE_NUMERATOR,    // rc_superres_kf_denominator
+      63,                 // rc_superres_qthresh
+      32,                 // rc_superres_kf_qthresh
 
       AOM_VBR,      // rc_end_usage
       { NULL, 0 },  // rc_twopass_stats_in
@@ -2814,7 +2963,7 @@
 
       // keyframing settings (kf)
       0,                       // fwd_kf_enabled
-      AOM_KF_AUTO,             // g_kfmode
+      AOM_KF_AUTO,             // kf_mode
       0,                       // kf_min_dist
       9999,                    // kf_max_dist
       0,                       // sframe_dist
@@ -2838,8 +2987,8 @@
       0,                   // g_threads
       0,                   // g_profile
 
-      320,         // g_width
-      240,         // g_height
+      320,         // g_w
+      240,         // g_h
       0,           // g_limit
       0,           // g_forced_max_frame_width
       0,           // g_forced_max_frame_height
@@ -2859,11 +3008,11 @@
       SCALE_NUMERATOR,  // rc_resize_denominator
       SCALE_NUMERATOR,  // rc_resize_kf_denominator
 
-      0,                // rc_superres_mode
-      SCALE_NUMERATOR,  // rc_superres_denominator
-      SCALE_NUMERATOR,  // rc_superres_kf_denominator
-      63,               // rc_superres_qthresh
-      32,               // rc_superres_kf_qthresh
+      AOM_SUPERRES_NONE,  // rc_superres_mode
+      SCALE_NUMERATOR,    // rc_superres_denominator
+      SCALE_NUMERATOR,    // rc_superres_kf_denominator
+      63,                 // rc_superres_qthresh
+      32,                 // rc_superres_kf_qthresh
 
       AOM_CBR,      // rc_end_usage
       { NULL, 0 },  // rc_twopass_stats_in
@@ -2884,7 +3033,7 @@
 
       // keyframing settings (kf)
       0,                       // fwd_kf_enabled
-      AOM_KF_AUTO,             // g_kfmode
+      AOM_KF_AUTO,             // kf_mode
       0,                       // kf_min_dist
       9999,                    // kf_max_dist
       0,                       // sframe_dist
@@ -2904,10 +3053,11 @@
   },
 };
 
+// This data structure and function are exported in aom/aomcx.h
 #ifndef VERSION_STRING
 #define VERSION_STRING
 #endif
-CODEC_INTERFACE(aom_codec_av1_cx) = {
+aom_codec_iface_t aom_codec_av1_cx_algo = {
   "AOMedia Project AV1 Encoder" VERSION_STRING,
   AOM_CODEC_INTERNAL_ABI_VERSION,
   AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER |
@@ -2934,3 +3084,5 @@
       encoder_get_preview          // aom_codec_get_preview_frame_fn_t
   }
 };
+
+aom_codec_iface_t *aom_codec_av1_cx(void) { return &aom_codec_av1_cx_algo; }
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index d821a52..a9ae27e 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -119,7 +119,9 @@
     aom_free(frame_worker_data->pbi->common.tpl_mvs);
     frame_worker_data->pbi->common.tpl_mvs = NULL;
     av1_remove_common(&frame_worker_data->pbi->common);
+#if !CONFIG_REALTIME_ONLY
     av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+#endif
     av1_decoder_remove(frame_worker_data->pbi);
     aom_free(frame_worker_data);
 #if CONFIG_MULTITHREAD
@@ -467,7 +469,8 @@
   frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
   frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
   frame_worker_data->pbi->row_mt = ctx->row_mt;
-
+  frame_worker_data->pbi->is_fwd_kf_present = 0;
+  frame_worker_data->pbi->is_arf_frame_present = 0;
   worker->hook = frame_worker_hook;
 
   init_buffer_callbacks(ctx);
@@ -969,6 +972,184 @@
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_get_fwd_kf_value(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  *arg = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi->is_fwd_kf_present;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_altref_present(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  *arg =
+      ((FrameWorkerData *)ctx->frame_worker->data1)->pbi->is_arf_frame_present;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_frame_flags(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  AV1Decoder *pbi = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi;
+  *arg = 0;
+  switch (pbi->common.current_frame.frame_type) {
+    case KEY_FRAME:
+      *arg |= AOM_FRAME_IS_KEY;
+      *arg |= AOM_FRAME_IS_INTRAONLY;
+      if (!pbi->common.show_frame) {
+        *arg |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
+      }
+      break;
+    case INTRA_ONLY_FRAME: *arg |= AOM_FRAME_IS_INTRAONLY; break;
+    case S_FRAME: *arg |= AOM_FRAME_IS_SWITCH; break;
+  }
+  if (pbi->common.features.error_resilient_mode) {
+    *arg |= AOM_FRAME_IS_ERROR_RESILIENT;
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_tile_info(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  aom_tile_info *const tile_info = va_arg(args, aom_tile_info *);
+
+  if (tile_info) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      const CommonTileParams *tiles = &pbi->common.tiles;
+
+      int tile_rows = tiles->rows;
+      int tile_cols = tiles->cols;
+
+      if (tiles->uniform_spacing) {
+        tile_info->tile_rows = 1 << tiles->log2_rows;
+        tile_info->tile_columns = 1 << tiles->log2_cols;
+      } else {
+        tile_info->tile_rows = tile_rows;
+        tile_info->tile_columns = tile_cols;
+      }
+
+      for (int tile_col = 1; tile_col <= tile_cols; tile_col++) {
+        tile_info->tile_widths[tile_col - 1] =
+            tiles->col_start_sb[tile_col] - tiles->col_start_sb[tile_col - 1];
+      }
+
+      for (int tile_row = 1; tile_row <= tile_rows; tile_row++) {
+        tile_info->tile_heights[tile_row - 1] =
+            tiles->row_start_sb[tile_row] - tiles->row_start_sb[tile_row - 1];
+      }
+      tile_info->num_tile_groups = pbi->num_tile_groups;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_screen_content_tools_info(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  aom_screen_content_tools_info *const sc_info =
+      va_arg(args, aom_screen_content_tools_info *);
+  if (sc_info) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      sc_info->allow_screen_content_tools =
+          pbi->common.features.allow_screen_content_tools;
+      sc_info->allow_intrabc = pbi->common.features.allow_intrabc;
+      sc_info->force_integer_mv =
+          (int)pbi->common.features.cur_frame_force_integer_mv;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_still_picture(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  aom_still_picture_info *const still_picture_info =
+      va_arg(args, aom_still_picture_info *);
+  if (still_picture_info) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      still_picture_info->is_still_picture =
+          (int)pbi->common.seq_params.still_picture;
+      still_picture_info->is_reduced_still_picture_hdr =
+          (int)(pbi->common.seq_params.reduced_still_picture_hdr);
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_sb_size(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  aom_superblock_size_t *const sb_size = va_arg(args, aom_superblock_size_t *);
+  if (sb_size) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      if (pbi->common.seq_params.sb_size == BLOCK_128X128) {
+        *sb_size = AOM_SUPERBLOCK_SIZE_128X128;
+      } else {
+        *sb_size = AOM_SUPERBLOCK_SIZE_64X64;
+      }
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_show_existing_frame_flag(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  *arg = ((FrameWorkerData *)ctx->frame_worker->data1)
+             ->pbi->common.show_existing_frame;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_s_frame_info(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  aom_s_frame_info *const s_frame_info = va_arg(args, aom_s_frame_info *);
+  if (s_frame_info) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      s_frame_info->is_s_frame = pbi->sframe_info.is_s_frame;
+      s_frame_info->is_s_frame_at_altref =
+          pbi->sframe_info.is_s_frame_at_altref;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   int *corrupted = va_arg(args, int *);
@@ -1361,14 +1542,24 @@
   { AV1_GET_REFERENCE, ctrl_get_reference },
   { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info },
   { AV1D_GET_TILE_DATA, ctrl_get_tile_data },
+  { AOMD_GET_FWD_KF_PRESENT, ctrl_get_fwd_kf_value },
+  { AOMD_GET_ALTREF_PRESENT, ctrl_get_altref_present },
+  { AOMD_GET_FRAME_FLAGS, ctrl_get_frame_flags },
+  { AOMD_GET_TILE_INFO, ctrl_get_tile_info },
+  { AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, ctrl_get_screen_content_tools_info },
+  { AOMD_GET_STILL_PICTURE, ctrl_get_still_picture },
+  { AOMD_GET_SB_SIZE, ctrl_get_sb_size },
+  { AOMD_GET_SHOW_EXISTING_FRAME_FLAG, ctrl_get_show_existing_frame_flag },
+  { AOMD_GET_S_FRAME_INFO, ctrl_get_s_frame_info },
 
-  { -1, NULL },
+  CTRL_MAP_END,
 };
 
+// This data structure and function are exported in aom/aomdx.h
 #ifndef VERSION_STRING
 #define VERSION_STRING
 #endif
-CODEC_INTERFACE(aom_codec_av1_dx) = {
+aom_codec_iface_t aom_codec_av1_dx_algo = {
   "AOMedia Project AV1 Decoder" VERSION_STRING,
   AOM_CODEC_INTERNAL_ABI_VERSION,
   AOM_CODEC_CAP_DECODER |
@@ -1395,3 +1586,5 @@
       NULL   // aom_codec_get_preview_frame_fn_t
   }
 };
+
+aom_codec_iface_t *aom_codec_av1_dx(void) { return &aom_codec_av1_dx_algo; }
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index badee3d..cd997cd 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -51,6 +51,7 @@
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 // Assumes cm->rst_info[p].restoration_unit_size is already initialized
 void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
   const int num_planes = av1_num_planes(cm);
@@ -131,6 +132,7 @@
 
   aom_free_frame_buffer(&cm->rst_frame);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 void av1_free_above_context_buffers(CommonContexts *above_contexts) {
   int i;
diff --git a/av1/common/alloccommon.h b/av1/common/alloccommon.h
index fe8e0c5..e75c226 100644
--- a/av1/common/alloccommon.h
+++ b/av1/common/alloccommon.h
@@ -36,8 +36,10 @@
 void av1_free_context_buffers(struct AV1Common *cm);
 
 void av1_free_ref_frame_buffers(struct BufferPool *pool);
+#if !CONFIG_REALTIME_ONLY
 void av1_alloc_restoration_buffers(struct AV1Common *cm);
 void av1_free_restoration_buffers(struct AV1Common *cm);
+#endif
 
 int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
 void av1_free_state_buffers(struct AV1Common *cm);
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 51c9696..fc4a333 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -194,16 +194,11 @@
 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
+                            const int subpel_x_qn,
                             ConvolveParams *conv_params) {
   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
   const int8_t bits = FILTER_BITS - conv_params->round_0;
 
-  (void)subpel_y_qn;
-  (void)conv_params;
-  (void)filter_params_y;
-
   uint8x8_t t0;
 #if defined(__aarch64__)
   uint8x8_t t1, t2, t3;
@@ -601,22 +596,12 @@
 
 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
-                            ConvolveParams *conv_params) {
+                            const int subpel_y_qn) {
   const int vert_offset = filter_params_y->taps / 2 - 1;
 
   src -= vert_offset * src_stride;
 
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
@@ -1543,51 +1528,307 @@
     }
   }
 }
-void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
 
-  const uint8_t *src1;
-  uint8_t *dst1;
-  int y;
+static INLINE void scaledconvolve_horiz_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int x, y, z;
 
-  if (!(w & 0x0F)) {
-    for (y = 0; y < h; ++y) {
-      src1 = src;
-      dst1 = dst;
-      for (int x = 0; x < (w >> 4); ++x) {
-        vst1q_u8(dst1, vld1q_u8(src1));
-        src1 += 16;
-        dst1 += 16;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  y = h;
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
+          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
+          uint8x8_t s[8], d;
+          int16x8_t ss[4];
+          int16x4_t t[8], tt;
+
+          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
+          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+
+          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+          t[0] = vget_low_s16(ss[0]);
+          t[1] = vget_low_s16(ss[1]);
+          t[2] = vget_low_s16(ss[2]);
+          t[3] = vget_low_s16(ss[3]);
+          t[4] = vget_high_s16(ss[0]);
+          t[5] = vget_high_s16(ss[1]);
+          t[6] = vget_high_s16(ss[2]);
+          t[7] = vget_high_s16(ss[3]);
+
+          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
+                           filters, filter3, filter4);
+          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+        } else {
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
       }
-      src += src_stride;
-      dst += dst_stride;
+
+      // transpose the 4x4 filters values back to dst
+      {
+        const uint8x8x4_t d4 = vld4_u8(temp);
+        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[0]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[1]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[2]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[3]), 0);
+      }
+      x += 4;
+    } while (x < w);
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    y -= 4;
+  } while (y > 0);
+}
+
+static INLINE void scaledconvolve_horiz_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = (h + 7) & ~7;
+
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      uint8x8_t d[8];
+      // process 8 src_x steps
+      for (z = 0; z < 8; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          uint8x8_t s[8];
+          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
+                      &s[5], &s[6], &s[7]);
+          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                           &s[7]);
+          d[0] = scale_filter_8(s, filters);
+          vst1_u8(&temp[8 * z], d[0]);
+        } else {
+          int i;
+          for (i = 0; i < 8; ++i) {
+            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 8x8 filters values back to dst
+      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+                  &d[7]);
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
+      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
+      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
+      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
+      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
+      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
+      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
+      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
+      x += 8;
+    } while (x < w);
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
+
+static INLINE void scaledconvolve_vert_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+    if (y_q4 & SUBPEL_MASK) {
+      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+      const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
+      const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
+      uint8x8_t s[8], d;
+      int16x4_t t[8], tt;
+
+      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                  &s[6], &s[7]);
+      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
+      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
+      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
+      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
+      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
+      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
+      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
+      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
+
+      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
+                       filter3, filter4);
+      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
     }
-  } else if (!(w & 0x07)) {
-    for (y = 0; y < h; ++y) {
-      vst1_u8(dst, vld1_u8(src));
-      src += src_stride;
-      dst += dst_stride;
+
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    if (y_q4 & SUBPEL_MASK) {
+      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+      uint8x8_t s[8], d;
+      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                  &s[6], &s[7]);
+      d = scale_filter_8(s, filters);
+      vst1_u8(dst, d);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
     }
-  } else if (!(w & 0x03)) {
-    for (y = 0; y < h; ++y) {
-      vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
-      src += src_stride;
-      dst += dst_stride;
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w16(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int x, y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    if (y_q4 & SUBPEL_MASK) {
+      x = 0;
+      do {
+        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+        uint8x16_t ss[8];
+        uint8x8_t s[8], d[2];
+        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
+                     &ss[5], &ss[6], &ss[7]);
+        s[0] = vget_low_u8(ss[0]);
+        s[1] = vget_low_u8(ss[1]);
+        s[2] = vget_low_u8(ss[2]);
+        s[3] = vget_low_u8(ss[3]);
+        s[4] = vget_low_u8(ss[4]);
+        s[5] = vget_low_u8(ss[5]);
+        s[6] = vget_low_u8(ss[6]);
+        s[7] = vget_low_u8(ss[7]);
+        d[0] = scale_filter_8(s, filters);
+
+        s[0] = vget_high_u8(ss[0]);
+        s[1] = vget_high_u8(ss[1]);
+        s[2] = vget_high_u8(ss[2]);
+        s[3] = vget_high_u8(ss[3]);
+        s[4] = vget_high_u8(ss[4]);
+        s[5] = vget_high_u8(ss[5]);
+        s[6] = vget_high_u8(ss[6]);
+        s[7] = vget_high_u8(ss[7]);
+        d[1] = scale_filter_8(s, filters);
+        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
+        src_y += 16;
+        x += 16;
+      } while (x < w);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
     }
-  } else if (!(w & 0x01)) {
-    for (y = 0; y < h; ++y) {
-      vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
-      src += src_stride;
-      dst += dst_stride;
-    }
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  if (w >= 8) {
+    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  } else {
+    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  }
+
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                            dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
   }
 }
diff --git a/av1/common/arm/convolve_neon.h b/av1/common/arm/convolve_neon.h
index dbcfab6..27a996c 100644
--- a/av1/common/arm/convolve_neon.h
+++ b/av1/common/arm/convolve_neon.h
@@ -15,6 +15,69 @@
 
 #define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07)
 
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+                                    const int16x4_t s2, const int16x4_t s3,
+                                    const int16x4_t s4, const int16x4_t s5,
+                                    const int16x4_t s6, const int16x4_t s7,
+                                    const int16x8_t filters,
+                                    const int16x4_t filter3,
+                                    const int16x4_t filter4) {
+  const int16x4_t filters_lo = vget_low_s16(filters);
+  const int16x4_t filters_hi = vget_high_s16(filters);
+  int16x4_t sum;
+
+  sum = vmul_lane_s16(s0, filters_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
+  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
+  sum = vqadd_s16(sum, vmul_s16(s3, filter3));
+  sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+  return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x8_t s4, const int16x8_t s5,
+                                    const int16x8_t s6, const int16x8_t s7,
+                                    const int16x8_t filters,
+                                    const int16x8_t filter3,
+                                    const int16x8_t filter4) {
+  const int16x4_t filters_lo = vget_low_s16(filters);
+  const int16x4_t filters_hi = vget_high_s16(filters);
+  int16x8_t sum;
+
+  sum = vmulq_lane_s16(s0, filters_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
+  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
+  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
+  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+  return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
+                                       const int16x8_t filters) {
+  const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
+  const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
+  int16x8_t ss[8];
+
+  ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+  ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+  ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+  ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+  ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
+  ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
+  ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
+  ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
+
+  return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
+                     filters, filter3, filter4);
+}
+
 static INLINE uint8x8_t wiener_convolve8_vert_4x8(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
diff --git a/av1/common/arm/highbd_inv_txfm_neon.c b/av1/common/arm/highbd_inv_txfm_neon.c
new file mode 100644
index 0000000..9bd9c4a
--- /dev/null
+++ b/av1/common/arm/highbd_inv_txfm_neon.c
@@ -0,0 +1,6072 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you canzip
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#if defined(__aarch64__)
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)         \
+  do {                                                        \
+    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                 \
+    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                \
+    y0 = vreinterpretq_s32_s64(                               \
+        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
+                   vreinterpretq_s64_s32(swap_high.val[0]))); \
+    y1 = vreinterpretq_s32_s64(                               \
+        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
+                   vreinterpretq_s64_s32(swap_high.val[1]))); \
+    y2 = vreinterpretq_s32_s64(                               \
+        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
+                   vreinterpretq_s64_s32(swap_high.val[0]))); \
+    y3 = vreinterpretq_s32_s64(                               \
+        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
+                   vreinterpretq_s64_s32(swap_high.val[1]))); \
+  } while (0)
+#else
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)                    \
+  do {                                                                   \
+    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                            \
+    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                           \
+    y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2),       \
+                   swap_high.val[0], 2);                                 \
+    y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2),       \
+                   swap_high.val[1], 2);                                 \
+    y2 = vextq_s32(swap_low.val[0],                                      \
+                   vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
+    y3 = vextq_s32(swap_low.val[1],                                      \
+                   vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
+  } while (0)
+#endif  // (__aarch64__)
+
+static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+                out[15]);
+}
+
+static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
+                                                 int32x4_t *output,
+                                                 const int size,
+                                                 const int bit) {
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  int i;
+  for (i = 0; i < size; i++) {
+    int32x4_t vradd = vaddq_s32(input[i], rnding);
+    output[i] = vshlq_s32(vradd, v_bit);
+  }
+}
+
+static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
+                                                      int32x4_t *output,
+                                                      const int size,
+                                                      const int bit,
+                                                      const int val) {
+  const int32x4_t sqrt2 = vdupq_n_s32(val);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  const int32x4_t rnding2 = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
+  int i;
+  if (bit > 0) {
+    for (i = 0; i < size; i++) {
+      int32x4_t vradd = vshlq_s32(input[i], rnding);
+      const int32x4_t r0 = vshlq_s32(vradd, v_bit);
+      const int32x4_t r1 = vmlaq_s32(rnding2, sqrt2, r0);
+      output[i] = vshrq_n_s32(r1, NewSqrt2Bits);
+    }
+  } else {
+    for (i = 0; i < size; i++) {
+      const int32x4_t r0 = vshlq_s32(input[i], v_bit);
+      const int32x4_t r1 = vmlaq_s32(rnding2, sqrt2, r0);
+      output[i] = vshrq_n_s32(r1, NewSqrt2Bits);
+    }
+  }
+}
+
+static INLINE int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0,
+                                        const int32_t *n1, const int32x4_t *w1,
+                                        const int32x4_t *v_bit,
+                                        const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, *n0);
+  x = vmlaq_n_s32(x, *w1, *n1);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_neon_mode11_r(
+    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, -*n0);
+  x = vmlaq_n_s32(x, *w1, -*n1);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_neon_mode01_r(
+    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, *n0);
+  x = vmlsq_n_s32(x, *w1, *n1);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_neon_mode10_r(
+    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w1, *n1);
+  x = vmlsq_n_s32(x, *w0, *n0);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_0_neon_r(const int32_t *n0,
+                                          const int32x4_t *w0,
+                                          const int32x4_t *v_bit,
+                                          const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, *n0);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_0_m_neon_r(const int32_t *n0,
+                                            const int32x4_t *w0,
+                                            const int32x4_t *v_bit,
+                                            const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, -*n0);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
+typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
+                                      const int num_cols);
+
+typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit,
+                                  int32_t do_cols, int32_t bd,
+                                  int32_t out_shift);
+
+static INLINE uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min,
+                                          const uint16x8_t *max) {
+  int16x8_t clamped;
+  clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max));
+  clamped = vmaxq_s16(clamped, vreinterpretq_s16_u16(*min));
+  return vreinterpretq_u16_s16(clamped);
+}
+
+static INLINE void round_shift_4x4(int32x4_t *in, int shift,
+                                   const int32x4_t *rnding) {
+  if (shift != 0) {
+    const int32x4_t v_shift = vdupq_n_s32(-shift);
+    int32x4_t vradd = vaddq_s32(in[0], *rnding);
+    in[0] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[1], *rnding);
+    in[1] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[2], *rnding);
+    in[2] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[3], *rnding);
+    in[3] = vshlq_s32(vradd, v_shift);
+  }
+}
+
+static void round_shift_8x8(int32x4_t *in, int shift, const int32x4_t *rnding) {
+  if (shift != 0) {
+    const int32x4_t v_shift = vdupq_n_s32(-shift);
+    int32x4_t vradd = vaddq_s32(in[0], *rnding);
+    in[0] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[1], *rnding);
+    in[1] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[2], *rnding);
+    in[2] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[3], *rnding);
+    in[3] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[4], *rnding);
+    in[4] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[5], *rnding);
+    in[5] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[6], *rnding);
+    in[6] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[7], *rnding);
+    in[7] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[8], *rnding);
+    in[8] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[9], *rnding);
+    in[9] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[10], *rnding);
+    in[10] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[11], *rnding);
+    in[11] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[12], *rnding);
+    in[12] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[13], *rnding);
+    in[13] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[14], *rnding);
+    in[14] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[15], *rnding);
+    in[15] = vshlq_s32(vradd, v_shift);
+  }
+}
+
+static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out,
+                                  const int32x4_t *clamp_lo,
+                                  const int32x4_t *clamp_hi, int size) {
+  int32x4_t a0, a1;
+  for (int i = 0; i < size; i += 4) {
+    a0 = vmaxq_s32(in[i], *clamp_lo);
+    out[i] = vminq_s32(a0, *clamp_hi);
+
+    a1 = vmaxq_s32(in[i + 1], *clamp_lo);
+    out[i + 1] = vminq_s32(a1, *clamp_hi);
+
+    a0 = vmaxq_s32(in[i + 2], *clamp_lo);
+    out[i + 2] = vminq_s32(a0, *clamp_hi);
+
+    a1 = vmaxq_s32(in[i + 3], *clamp_lo);
+    out[i + 3] = vminq_s32(a1, *clamp_hi);
+  }
+}
+
+static INLINE uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred,
+                                                   int32x4_t res0,
+                                                   int32x4_t res1,
+                                                   const int bd) {
+  const uint16x8_t v_zero = vdupq_n_u16(0);
+  int32x4_t min_clip_val = vreinterpretq_s32_u16(v_zero);
+  int32x4_t max_clip_val = vdupq_n_s32((1 << bd) - 1);
+  uint16x8x2_t x;
+  x.val[0] = vreinterpretq_u16_s32(
+      vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred))));
+  x.val[1] = vreinterpretq_u16_s32(
+      vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred))));
+  x.val[0] = vreinterpretq_u16_s32(
+      vmaxq_s32(vreinterpretq_s32_u16(x.val[0]), min_clip_val));
+  x.val[0] = vreinterpretq_u16_s32(
+      vminq_s32(vreinterpretq_s32_u16(x.val[0]), max_clip_val));
+  x.val[1] = vreinterpretq_u16_s32(
+      vmaxq_s32(vreinterpretq_s32_u16(x.val[1]), min_clip_val));
+  x.val[1] = vreinterpretq_u16_s32(
+      vminq_s32(vreinterpretq_s32_u16(x.val[1]), max_clip_val));
+  uint16x8_t res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
+                                vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
+  return res;
+}
+
+static INLINE uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred,
+                                                   int32x4_t res0,
+                                                   const int bd) {
+  uint16x4_t x0_ = vreinterpret_u16_s16(
+      vmovn_s32(vaddw_s16(res0, vreinterpret_s16_u16(pred))));
+  uint16x8_t x0 = vcombine_u16(x0_, x0_);
+  const uint16x8_t vmin = vdupq_n_u16(0);
+  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
+  x0 = highbd_clamp_u16(&x0, &vmin, &vmax);
+  return vget_low_u16(x0);
+}
+
+static INLINE void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output,
+                                                int stride, int flipud,
+                                                int height, const int bd) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    uint16x4_t v = vld1_u16(output + i * stride);
+    uint16x4_t u = highbd_get_recon_4xn_neon(v, in[j], bd);
+
+    vst1_u16(output + i * stride, u);
+  }
+}
+
+static INLINE void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output,
+                                                int stride, int flipud,
+                                                int height, const int bd) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    uint16x8_t v = vld1q_u16(output + i * stride);
+    uint16x8_t u = highbd_get_recon_8x8_neon(v, in[j], in[j + height], bd);
+
+    vst1q_u16(output + i * stride, u);
+  }
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+                                           int32x4_t *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = vld1q_s32(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) {
+  in[0] = vld1q_s32(coeff + 0);
+  in[1] = vld1q_s32(coeff + 4);
+  in[2] = vld1q_s32(coeff + 8);
+  in[3] = vld1q_s32(coeff + 12);
+}
+
+static void addsub_neon(const int32x4_t in0, const int32x4_t in1,
+                        int32x4_t *out0, int32x4_t *out1,
+                        const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) {
+  int32x4_t a0 = vaddq_s32(in0, in1);
+  int32x4_t a1 = vsubq_s32(in0, in1);
+
+  a0 = vmaxq_s32(a0, *clamp_lo);
+  a0 = vminq_s32(a0, *clamp_hi);
+  a1 = vmaxq_s32(a1, *clamp_lo);
+  a1 = vminq_s32(a1, *clamp_hi);
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1,
+                                 const int32x4_t *clamp_lo,
+                                 const int32x4_t *clamp_hi,
+                                 const int32x4_t *v_shift) {
+  int32x4_t in0_w_offset = vrshlq_s32(*in0, *v_shift);
+  int32x4_t in1_w_offset = vrshlq_s32(*in1, *v_shift);
+
+  in0_w_offset = vmaxq_s32(in0_w_offset, *clamp_lo);
+  in0_w_offset = vminq_s32(in0_w_offset, *clamp_hi);
+  in1_w_offset = vmaxq_s32(in1_w_offset, *clamp_lo);
+  in1_w_offset = vminq_s32(in1_w_offset, *clamp_hi);
+
+  *in0 = in0_w_offset;
+  *in1 = in1_w_offset;
+}
+
+static INLINE void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  temp1 = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
+                                 v_bit, rnding);
+  bf1[30] =
+      half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding);
+  bf1[17] = temp1;
+
+  temp2 = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
+                                 v_bit, rnding);
+  bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
+                                   v_bit, rnding);
+  bf1[18] = temp2;
+
+  temp1 = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
+                                 v_bit, rnding);
+  bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit,
+                            rnding);
+  bf1[21] = temp1;
+
+  temp2 = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
+                                 v_bit, rnding);
+  bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
+                                   v_bit, rnding);
+  bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[9], &cospi[48], &bf1[14],
+                                 v_bit, rnding);
+  bf1[14] =
+      half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding);
+  bf1[9] = temp1;
+
+  temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[10], &cospi[16], &bf1[13],
+                                 v_bit, rnding);
+  bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13],
+                                   v_bit, rnding);
+  bf1[10] = temp2;
+
+  addsub_neon(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+  addsub_neon(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+  addsub_neon(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_neon(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_neon(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+  addsub_neon(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_neon(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+  addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
+                                 v_bit, rnding);
+  bf1[6] =
+      half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding);
+  bf1[5] = temp1;
+
+  addsub_neon(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_neon(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_neon(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_neon(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
+                                 v_bit, rnding);
+  bf1[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], v_bit,
+                            rnding);
+  bf1[18] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
+                                 v_bit, rnding);
+  bf1[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], v_bit,
+                            rnding);
+  bf1[19] = temp2;
+  temp1 = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
+                                 v_bit, rnding);
+  bf1[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
+                                   v_bit, rnding);
+  bf1[20] = temp1;
+  temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
+                                 v_bit, rnding);
+  bf1[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
+                                   v_bit, rnding);
+  bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  addsub_neon(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+  addsub_neon(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+  addsub_neon(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+  addsub_neon(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13],
+                                 v_bit, rnding);
+  bf1[13] = half_btf_neon_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit,
+                            rnding);
+  bf1[10] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12],
+                                 v_bit, rnding);
+  bf1[12] = half_btf_neon_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit,
+                            rnding);
+  bf1[11] = temp2;
+
+  addsub_neon(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+  addsub_neon(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+  addsub_neon(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_neon(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_neon(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+  addsub_neon(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+  addsub_neon(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  addsub_neon(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+  addsub_neon(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+  addsub_neon(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+  addsub_neon(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_neon(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_neon(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_neon(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+  addsub_neon(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
+                                 v_bit, rnding);
+  bf1[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit,
+                            rnding);
+  bf1[20] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
+                                 v_bit, rnding);
+  bf1[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit,
+                            rnding);
+  bf1[21] = temp2;
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
+                                 v_bit, rnding);
+  bf1[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit,
+                            rnding);
+  bf1[22] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
+                                 v_bit, rnding);
+  bf1[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit,
+                            rnding);
+  bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out,
+                                      const int do_cols, const int bd,
+                                      const int out_shift,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi) {
+  addsub_neon(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
+  addsub_neon(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
+  addsub_neon(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
+  addsub_neon(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
+  addsub_neon(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
+  addsub_neon(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
+  addsub_neon(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
+  addsub_neon(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
+  addsub_neon(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
+  addsub_neon(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
+  addsub_neon(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
+  addsub_neon(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
+  addsub_neon(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
+  addsub_neon(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
+  addsub_neon(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
+  addsub_neon(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t rnding = vdupq_n_s32(1 << (out_shift - 1));
+    for (int i = 0; i < 32; i += 8) {
+      round_shift_4x4(out + i, out_shift, &rnding);
+      round_shift_4x4(out + i + 4, out_shift, &rnding);
+    }
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+  }
+}
+
+static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1,
+                           int32x4_t *out0, int32x4_t *out1,
+                           const int32x4_t *clamp_lo, const int32x4_t *clamp_hi,
+                           const int32x4_t *v_shift, int32x4_t *offset) {
+  int32x4_t a0 = vaddq_s32(*offset, *in0);
+  int32x4_t a1 = vsubq_s32(*offset, *in1);
+
+  a0 = vshlq_s32(a0, *v_shift);
+  a1 = vshlq_s32(a1, *v_shift);
+
+  a0 = vmaxq_s32(a0, *clamp_lo);
+  a0 = vminq_s32(a0, *clamp_hi);
+  a1 = vmaxq_s32(a1, *clamp_lo);
+  a1 = vminq_s32(a1, *clamp_hi);
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  int32x4_t u0, u1, u2, u3;
+  int32x4_t v0, v1, v2, v3, x, y;
+
+  // Stage 0-1-2
+
+  TRANSPOSE_4X4(in[0], in[1], in[2], in[3], u0, u1, u2, u3);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+  x = vmlaq_n_s32(rnding, u0, cospi[32]);
+  y = vmulq_n_s32(u2, cospi[32]);
+  v0 = vaddq_s32(x, y);
+  v0 = vshlq_s32(v0, v_bit);
+
+  v1 = vsubq_s32(x, y);
+  v1 = vshlq_s32(v1, v_bit);
+
+  x = vmlaq_n_s32(rnding, u1, cospi[48]);
+  v2 = vmlsq_n_s32(x, u3, cospi[16]);
+  v2 = vshlq_s32(v2, v_bit);
+
+  x = vmlaq_n_s32(rnding, u1, cospi[16]);
+  v3 = vmlaq_n_s32(x, u3, cospi[48]);
+  v3 = vshlq_s32(v3, v_bit);
+  // Stage 3
+  addsub_neon(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
+  addsub_neon(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    log_range = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    shift_and_clamp_neon(out + 0, out + 3, &clamp_lo, &clamp_hi, &v_shift);
+    shift_and_clamp_neon(out + 1, out + 2, &clamp_lo, &clamp_hi, &v_shift);
+  }
+}
+
+static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                          int bd, int out_shift) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  const int32x4_t zero = vdupq_n_s32(0);
+  int64x2_t rnding = vdupq_n_s64(1 << (bit + 4 - 1));
+  const int32x2_t mul = vdup_n_s32(1 << 4);
+  int32x4_t t;
+  int32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+  int32x4_t x0, x1, x2, x3;
+  int32x4_t u0, u1, u2, u3;
+
+  TRANSPOSE_4X4(in[0], in[1], in[2], in[3], x0, x1, x2, x3);
+
+  s0 = vmulq_n_s32(x0, sinpi[1]);
+  s1 = vmulq_n_s32(x0, sinpi[2]);
+  s2 = vmulq_n_s32(x1, sinpi[3]);
+  s3 = vmulq_n_s32(x2, sinpi[4]);
+  s4 = vmulq_n_s32(x2, sinpi[1]);
+  s5 = vmulq_n_s32(x3, sinpi[2]);
+  s6 = vmulq_n_s32(x3, sinpi[4]);
+  t = vsubq_s32(x0, x2);
+  s7 = vaddq_s32(t, x3);
+
+  t = vaddq_s32(s0, s3);
+  s0 = vaddq_s32(t, s5);
+  t = vsubq_s32(s1, s4);
+  s1 = vsubq_s32(t, s6);
+  s3 = s2;
+  s2 = vmulq_n_s32(s7, sinpi[3]);
+
+  u0 = vaddq_s32(s0, s3);
+  u1 = vaddq_s32(s1, s3);
+  u2 = s2;
+  t = vaddq_s32(s0, s1);
+  u3 = vsubq_s32(t, s3);
+
+  // u0
+  int32x4x2_t u0x;
+  u0x.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
+  u0x.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u0x.val[0]), rnding));
+
+  u0 = vextq_s32(u0, zero, 1);
+  u0x.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
+  u0x.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u0x.val[1]), rnding));
+
+  u0x.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u0x.val[0]), vreinterpretq_s16_s32(zero), 1));
+  u0x.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  u0x = vzipq_s32(u0x.val[0], u0x.val[1]);
+#if defined(__aarch64__)
+  u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]),
+                                        vreinterpretq_s64_s32(u0x.val[1])));
+#else
+  u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1]));
+#endif  // (__aarch64__)
+  // u1
+  int32x4x2_t u1x;
+  u1x.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
+  u1x.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u1x.val[0]), rnding));
+
+  u1 = vextq_s32(u1, zero, 1);
+  u1x.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
+  u1x.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u1x.val[1]), rnding));
+
+  u1x.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u1x.val[0]), vreinterpretq_s16_s32(zero), 1));
+  u1x.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  u1x = vzipq_s32(u1x.val[0], u1x.val[1]);
+#if defined(__aarch64__)
+  u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]),
+                                        vreinterpretq_s64_s32(u1x.val[1])));
+#else
+  u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1]));
+#endif  // (__aarch64__)
+
+  // u2
+  int32x4x2_t u2x;
+  u2x.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
+  u2x.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u2x.val[0]), rnding));
+
+  u2 = vextq_s32(u2, zero, 1);
+  u2x.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
+  u2x.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u2x.val[1]), rnding));
+
+  u2x.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u2x.val[0]), vreinterpretq_s16_s32(zero), 1));
+  u2x.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  u2x = vzipq_s32(u2x.val[0], u2x.val[1]);
+#if defined(__aarch64__)
+  u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]),
+                                        vreinterpretq_s64_s32(u2x.val[1])));
+#else
+  u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1]));
+#endif  // (__aarch64__)
+
+  // u3
+  int32x4x2_t u3x;
+  u3x.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
+  u3x.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u3x.val[0]), rnding));
+
+  u3 = vextq_s32(u3, zero, 1);
+  u3x.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
+  u3x.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u3x.val[1]), rnding));
+
+  u3x.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u3x.val[0]), vreinterpretq_s16_s32(zero), 1));
+  u3x.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  u3x = vzipq_s32(u3x.val[0], u3x.val[1]);
+#if defined(__aarch64__)
+  u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]),
+                                        vreinterpretq_s64_s32(u3x.val[1])));
+#else
+  u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1]));
+#endif  // (__aarch64__)
+
+  out[0] = u0;
+  out[1] = u1;
+  out[2] = u2;
+  out[3] = u3;
+
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    const int32x4_t rnding32 = vdupq_n_s32(1 << (out_shift - 1));
+    round_shift_4x4(out, out_shift, &rnding32);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
+  }
+}
+
+static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  uint32x4_t u0, u1, u2, u3;
+  uint16x4_t v0, v1, v2, v3;
+  const int32x4_t rnding = vdupq_n_s32(1 << (shift - 1));
+  round_shift_4x4(in, shift, &rnding);
+
+  v0 = vld1_u16(output + 0 * stride);
+  v1 = vld1_u16(output + 1 * stride);
+  v2 = vld1_u16(output + 2 * stride);
+  v3 = vld1_u16(output + 3 * stride);
+
+  if (fliplr) {
+    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[0]));
+    in[0] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
+    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[1]));
+    in[1] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
+    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[2]));
+    in[2] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
+    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[3]));
+    in[3] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
+  }
+
+  if (flipud) {
+    u0 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v0);
+    u1 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v1);
+    u2 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v2);
+    u3 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v3);
+  } else {
+    u0 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v0);
+    u1 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v1);
+    u2 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v2);
+    u3 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v3);
+  }
+
+  uint16x8_t u4 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1));
+  uint16x8_t u5 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3));
+  const uint16x8_t vmin = vdupq_n_u16(0);
+  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
+  u4 = highbd_clamp_u16(&u4, &vmin, &vmax);
+  u5 = highbd_clamp_u16(&u5, &vmin, &vmax);
+
+  vst1_u16(output + 0 * stride, vget_low_u16(u4));
+  vst1_u16(output + 1 * stride, vget_high_u16(u4));
+  vst1_u16(output + 2 * stride, vget_low_u16(u5));
+  vst1_u16(output + 3 * stride, vget_high_u16(u5));
+}
+
+static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  (void)bit;
+  int32x4_t v[4];
+  int32x4_t zero = vdupq_n_s32(0);
+  int32x2_t fact = vdup_n_s32(NewSqrt2);
+  int32x4x2_t a0;
+  const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
+
+  for (int i = 0; i < 4; i++) {
+    a0.val[0] = vreinterpretq_s32_s64(
+        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
+    a0.val[0] = vreinterpretq_s32_s64(
+        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
+    a0.val[1] = vextq_s32(in[i], zero, 1);
+    a0.val[1] = vreinterpretq_s32_s64(
+        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
+    a0.val[1] = vreinterpretq_s32_s64(
+        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
+
+    a0 = vzipq_s32(a0.val[0], a0.val[1]);
+#if defined(__aarch64__)
+    out[i] = vreinterpretq_s32_s64(vzip1q_s64(
+        vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
+#else
+    out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
+#endif
+  }
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    const int32x4_t rnding32 = vdupq_n_s32(1 << (out_shift - 1));
+    round_shift_4x4(out, out_shift, &rnding32);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
+  }
+  v[0] = out[0];
+  v[1] = out[1];
+  v[2] = out[2];
+  v[3] = out[3];
+
+  // Transpose for 4x4
+  TRANSPOSE_4X4(v[0], v[1], v[2], v[3], out[0], out[1], out[2], out[3]);
+}
+void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  int32x4_t in[4];
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, in);
+      idct4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      idct4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in);
+      idct4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      idct4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case ADST_ADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, in);
+      idct4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      idct4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case IDTX:
+      load_buffer_4x4(input, in);
+      iidentity4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_DCT:
+      load_buffer_4x4(input, in);
+      iidentity4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      idct4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case H_DCT:
+      load_buffer_4x4(input, in);
+      idct4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_ADST:
+      load_buffer_4x4(input, in);
+      iidentity4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case H_ADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_FLIPADST:
+      load_buffer_4x4(input, in);
+      iidentity4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case H_FLIPADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+      iidentity4_neon(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    default: assert(0);
+  }
+}
+
+// 8x8
+static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) {
+  in[0] = vld1q_s32(coeff + 0);
+  in[1] = vld1q_s32(coeff + 4);
+  in[2] = vld1q_s32(coeff + 8);
+  in[3] = vld1q_s32(coeff + 12);
+  in[4] = vld1q_s32(coeff + 16);
+  in[5] = vld1q_s32(coeff + 20);
+  in[6] = vld1q_s32(coeff + 24);
+  in[7] = vld1q_s32(coeff + 28);
+  in[8] = vld1q_s32(coeff + 32);
+  in[9] = vld1q_s32(coeff + 36);
+  in[10] = vld1q_s32(coeff + 40);
+  in[11] = vld1q_s32(coeff + 44);
+  in[12] = vld1q_s32(coeff + 48);
+  in[13] = vld1q_s32(coeff + 52);
+  in[14] = vld1q_s32(coeff + 56);
+  in[15] = vld1q_s32(coeff + 60);
+}
+
+static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+  int32x4_t x, y;
+  int col;
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  // Note:
+  //  Even column: 0, 2, ..., 14
+  //  Odd column: 1, 3, ..., 15
+  //  one even column plus one odd column constructs one row (8 coeffs)
+  //  total we have 8 rows (8x8).
+  for (col = 0; col < 2; ++col) {
+    // stage 0
+    // stage 1
+    // stage 2
+    u0 = in[0 * 2 + col];
+    u1 = in[4 * 2 + col];
+    u2 = in[2 * 2 + col];
+    u3 = in[6 * 2 + col];
+
+    x = vmulq_n_s32(in[1 * 2 + col], cospi[56]);
+    u4 = vmlaq_n_s32(x, in[7 * 2 + col], -cospi[8]);
+    u4 = vaddq_s32(u4, rnding);
+    u4 = vshlq_s32(u4, v_bit);
+
+    x = vmulq_n_s32(in[1 * 2 + col], cospi[8]);
+    u7 = vmlaq_n_s32(x, in[7 * 2 + col], cospi[56]);
+    u7 = vaddq_s32(u7, rnding);
+    u7 = vshlq_s32(u7, v_bit);
+
+    x = vmulq_n_s32(in[5 * 2 + col], cospi[24]);
+    u5 = vmlaq_n_s32(x, in[3 * 2 + col], -cospi[40]);
+    u5 = vaddq_s32(u5, rnding);
+    u5 = vshlq_s32(u5, v_bit);
+
+    x = vmulq_n_s32(in[5 * 2 + col], cospi[40]);
+    u6 = vmlaq_n_s32(x, in[3 * 2 + col], cospi[24]);
+    u6 = vaddq_s32(u6, rnding);
+    u6 = vshlq_s32(u6, v_bit);
+
+    // stage 3
+    x = vmulq_n_s32(u0, cospi[32]);
+    y = vmulq_n_s32(u1, cospi[32]);
+    v0 = vaddq_s32(x, y);
+    v0 = vaddq_s32(v0, rnding);
+    v0 = vshlq_s32(v0, v_bit);
+
+    v1 = vsubq_s32(x, y);
+    v1 = vaddq_s32(v1, rnding);
+    v1 = vshlq_s32(v1, v_bit);
+
+    x = vmulq_n_s32(u2, cospi[48]);
+    v2 = vmlaq_n_s32(x, u3, -cospi[16]);
+    v2 = vaddq_s32(v2, rnding);
+    v2 = vshlq_s32(v2, v_bit);
+
+    x = vmulq_n_s32(u2, cospi[16]);
+    v3 = vmlaq_n_s32(x, u3, cospi[48]);
+    v3 = vaddq_s32(v3, rnding);
+    v3 = vshlq_s32(v3, v_bit);
+
+    addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+    addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+    // stage 4
+    addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+    addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+    u4 = v4;
+    u7 = v7;
+
+    x = vmulq_n_s32(v5, cospi[32]);
+    y = vmulq_n_s32(v6, cospi[32]);
+    u6 = vaddq_s32(y, x);
+    u6 = vaddq_s32(u6, rnding);
+    u6 = vshlq_s32(u6, v_bit);
+
+    u5 = vsubq_s32(y, x);
+    u5 = vaddq_s32(u5, rnding);
+    u5 = vshlq_s32(u5, v_bit);
+
+    // stage 5
+    addsub_neon(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
+                &clamp_hi);
+    addsub_neon(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
+                &clamp_hi);
+    addsub_neon(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
+                &clamp_hi);
+    addsub_neon(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
+                &clamp_hi);
+  }
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+  }
+}
+
+static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                          int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t kZero = vdupq_n_s32(0);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u[8], v[8], x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-1-2
+  // (1)
+  u[0] = vmlaq_n_s32(rnding, in[14], cospi[4]);
+  u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[14], cospi[60]);
+  u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  // (2)
+  u[2] = vmlaq_n_s32(rnding, in[10], cospi[20]);
+  u[2] = vmlaq_n_s32(u[2], in[4], cospi[44]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlaq_n_s32(rnding, in[10], cospi[44]);
+  u[3] = vmlsq_n_s32(u[3], in[4], cospi[20]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  // (3)
+  u[4] = vmlaq_n_s32(rnding, in[6], cospi[36]);
+  u[4] = vmlaq_n_s32(u[4], in[8], cospi[28]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, in[6], cospi[28]);
+  u[5] = vmlsq_n_s32(u[5], in[8], cospi[36]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // (4)
+  u[6] = vmlaq_n_s32(rnding, in[2], cospi[52]);
+  u[6] = vmlaq_n_s32(u[6], in[12], cospi[12]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, in[2], cospi[12]);
+  u[7] = vmlsq_n_s32(u[7], in[12], cospi[52]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
+  u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]);
+  u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  u[2] = vaddq_s32(v[0], x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(v[0], x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  u[6] = vaddq_s32(v[0], x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(v[0], x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[2] = vsubq_s32(kZero, u[4]);
+    out[4] = u[6];
+    out[6] = vsubq_s32(kZero, u[2]);
+    out[8] = u[3];
+    out[10] = vsubq_s32(kZero, u[7]);
+    out[12] = u[5];
+    out[14] = vsubq_s32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 8, out + 10, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 12, out + 14, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+
+  // Odd 8 points: 1, 3, ..., 15
+  // stage 0
+  // stage 1
+  // stage 2
+  // (1)
+  u[0] = vmlaq_n_s32(rnding, in[15], cospi[4]);
+  u[0] = vmlaq_n_s32(u[0], in[1], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[15], cospi[60]);
+  u[1] = vmlsq_n_s32(u[1], in[1], cospi[4]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  // (2)
+  u[2] = vmlaq_n_s32(rnding, in[11], cospi[20]);
+  u[2] = vmlaq_n_s32(u[2], in[5], cospi[44]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlaq_n_s32(rnding, in[11], cospi[44]);
+  u[3] = vmlsq_n_s32(u[3], in[5], cospi[20]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  // (3)
+  u[4] = vmlaq_n_s32(rnding, in[7], cospi[36]);
+  u[4] = vmlaq_n_s32(u[4], in[9], cospi[28]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, in[7], cospi[28]);
+  u[5] = vmlsq_n_s32(u[5], in[9], cospi[36]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // (4)
+  u[6] = vmlaq_n_s32(rnding, in[3], cospi[52]);
+  u[6] = vmlaq_n_s32(u[6], in[13], cospi[12]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, in[3], cospi[12]);
+  u[7] = vmlsq_n_s32(u[7], in[13], cospi[52]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
+  u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
+  u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  u[2] = vaddq_s32(v[0], x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(v[0], x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  u[6] = vaddq_s32(v[0], x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(v[0], x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[1] = u[0];
+    out[3] = vsubq_s32(kZero, u[4]);
+    out[5] = u[6];
+    out[7] = vsubq_s32(kZero, u[2]);
+    out[9] = u[3];
+    out[11] = vsubq_s32(kZero, u[7]);
+    out[13] = u[5];
+    out[15] = vsubq_s32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 9, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 13, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+
+static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  (void)bit;
+  out[0] = vaddq_s32(in[0], in[0]);
+  out[1] = vaddq_s32(in[1], in[1]);
+  out[2] = vaddq_s32(in[2], in[2]);
+  out[3] = vaddq_s32(in[3], in[3]);
+  out[4] = vaddq_s32(in[4], in[4]);
+  out[5] = vaddq_s32(in[5], in[5]);
+  out[6] = vaddq_s32(in[6], in[6]);
+  out[7] = vaddq_s32(in[7], in[7]);
+
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    const int32x4_t rnding = vdupq_n_s32(1 << (out_shift - 1));
+    round_shift_4x4(out, out_shift, &rnding);
+    round_shift_4x4(out + 4, out_shift, &rnding);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 8);
+  }
+}
+
+static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo,
+                                int32x4_t res_hi, int fliplr, int bd) {
+  uint16x8x2_t x;
+
+  if (fliplr) {
+    res_lo = vrev64q_s32(res_lo);
+    res_lo = vextq_s32(res_lo, res_lo, 2);
+    res_hi = vrev64q_s32(res_hi);
+    res_hi = vextq_s32(res_hi, res_hi, 2);
+    x.val[0] = vreinterpretq_u16_s32(
+        vaddw_s16(res_hi, vreinterpret_s16_u16(vget_low_u16(pred))));
+    x.val[1] = vreinterpretq_u16_s32(
+        vaddw_s16(res_lo, vreinterpret_s16_u16(vget_high_u16(pred))));
+
+  } else {
+    x.val[0] = vreinterpretq_u16_s32(
+        vaddw_s16(res_lo, vreinterpret_s16_u16(vget_low_u16(pred))));
+    x.val[1] = vreinterpretq_u16_s32(
+        vaddw_s16(res_hi, vreinterpret_s16_u16(vget_high_u16(pred))));
+  }
+
+  uint16x8_t x2 = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
+                               vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
+  const uint16x8_t vmin = vdupq_n_u16(0);
+  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
+  return highbd_clamp_u16(&x2, &vmin, &vmax);
+}
+
+static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  uint16x8_t u0, u1, u2, u3, u4, u5, u6, u7;
+  uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7;
+  const int32x4_t rnding = vdupq_n_s32(1 << (shift - 1));
+  round_shift_8x8(in, shift, &rnding);
+
+  v0 = vld1q_u16(output + 0 * stride);
+  v1 = vld1q_u16(output + 1 * stride);
+  v2 = vld1q_u16(output + 2 * stride);
+  v3 = vld1q_u16(output + 3 * stride);
+  v4 = vld1q_u16(output + 4 * stride);
+  v5 = vld1q_u16(output + 5 * stride);
+  v6 = vld1q_u16(output + 6 * stride);
+  v7 = vld1q_u16(output + 7 * stride);
+
+  if (flipud) {
+    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+  } else {
+    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+  }
+
+  vst1q_u16(output + 0 * stride, u0);
+  vst1q_u16(output + 1 * stride, u1);
+  vst1q_u16(output + 2 * stride, u2);
+  vst1q_u16(output + 3 * stride, u3);
+  vst1q_u16(output + 4 * stride, u4);
+  vst1q_u16(output + 5 * stride, u5);
+  vst1q_u16(output + 6 * stride, u6);
+  vst1q_u16(output + 7 * stride, u7);
+}
+
+void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  int32x4_t in[16], out[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                   -shift[0]);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                    -shift[0]);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case ADST_DCT:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                   -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case ADST_ADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                    -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                   -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                    -shift[0]);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                    -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                    -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                    -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    default: assert(0);
+  }
+}
+
+static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                              int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-1-2-3
+  x = vmulq_n_s32(in[0], cospi[32]);
+  x = vaddq_s32(vshlq_s32(x, v_bit), rnding);
+
+  // stage 4-5
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    x = vaddq_s32(x, offset);
+    x = vshlq_s32(x, vdupq_n_s32(-out_shift));
+  }
+
+  x = vmaxq_s32(x, clamp_lo);
+  x = vminq_s32(x, clamp_hi);
+  out[0] = x;
+  out[1] = x;
+  out[2] = x;
+  out[3] = x;
+  out[4] = x;
+  out[5] = x;
+  out[6] = x;
+  out[7] = x;
+}
+
+static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
+                             int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+  int32x4_t x, y;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0
+  // stage 1
+  // stage 2
+  u0 = in[0];
+  u1 = in[4];
+  u2 = in[2];
+  u3 = in[6];
+
+  x = vmlaq_n_s32(rnding, in[1], cospi[56]);
+  u4 = vmlaq_n_s32(x, in[7], -cospi[8]);
+  u4 = vshlq_s32(u4, v_bit);
+
+  x = vmlaq_n_s32(rnding, in[1], cospi[8]);
+  u7 = vmlaq_n_s32(x, in[7], cospi[56]);
+  u7 = vshlq_s32(u7, v_bit);
+
+  x = vmlaq_n_s32(rnding, in[5], cospi[24]);
+  u5 = vmlaq_n_s32(x, in[3], -cospi[40]);
+  u5 = vshlq_s32(u5, v_bit);
+
+  x = vmlaq_n_s32(rnding, in[5], cospi[40]);
+  u6 = vmlaq_n_s32(x, in[3], cospi[24]);
+  u6 = vshlq_s32(u6, v_bit);
+
+  // stage 3
+  x = vmlaq_n_s32(rnding, u0, cospi[32]);
+  y = vmulq_n_s32(u1, cospi[32]);
+  v0 = vaddq_s32(x, y);
+  v0 = vshlq_s32(v0, v_bit);
+
+  v1 = vsubq_s32(x, y);
+  v1 = vshlq_s32(v1, v_bit);
+
+  x = vmlaq_n_s32(rnding, u2, cospi[48]);
+  v2 = vmlaq_n_s32(x, u3, -cospi[16]);
+  v2 = vshlq_s32(v2, v_bit);
+
+  x = vmlaq_n_s32(rnding, u2, cospi[16]);
+  v3 = vmlaq_n_s32(x, u3, cospi[48]);
+  v3 = vshlq_s32(v3, v_bit);
+
+  addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+  addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+  // stage 4
+  addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+  addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+  u4 = v4;
+  u7 = v7;
+
+  x = vmulq_n_s32(v5, cospi[32]);
+  y = vmlaq_n_s32(rnding, v6, cospi[32]);
+  u6 = vaddq_s32(y, x);
+  u6 = vshlq_s32(u6, v_bit);
+
+  u5 = vsubq_s32(y, x);
+  u5 = vshlq_s32(u5, v_bit);
+
+  // stage 5
+  addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+  addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+  addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+  addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t rnding32 = vdupq_n_s32(1 << (out_shift - 1));
+    round_shift_4x4(out, out_shift, &rnding32);
+    round_shift_4x4(out + 4, out_shift, &rnding32);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8);
+  }
+}
+
+static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                               int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int32x4_t u[8], x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-2
+
+  u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[0], cospi[4]);
+  u[1] = vshlq_s32(vnegq_s32(u[1]), v_bit);
+
+  // stage 3-4
+  int32x4_t temp1, temp2;
+  temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]);
+  temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]);
+  temp1 = vshlq_s32(temp1, v_bit);
+  u[4] = temp1;
+
+  temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]);
+  u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // stage 5-6
+  temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]);
+  x = vmulq_n_s32(u[1], cospi[32]);
+  u[2] = vaddq_s32(temp1, x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(temp1, x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]);
+  x = vmulq_n_s32(u[5], cospi[32]);
+  u[6] = vaddq_s32(temp1, x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(temp1, x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = vnegq_s32(u[4]);
+    out[2] = u[6];
+    out[3] = vnegq_s32(u[2]);
+    out[4] = u[3];
+    out[5] = vnegq_s32(u[7]);
+    out[6] = u[5];
+    out[7] = vnegq_s32(u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+  }
+}
+
+static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
+                              int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  // const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u[8], v[8], x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-2
+
+  u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]);
+  u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]);
+  u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  // (2)
+  u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]);
+  u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]);
+  u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  // (3)
+  u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]);
+  u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]);
+  u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // (4)
+  u[6] = vmulq_n_s32(in[1], cospi[52]);
+  u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]);
+  u[6] = vaddq_s32(u[6], rnding);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmulq_n_s32(in[1], cospi[12]);
+  u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]);
+  u[7] = vaddq_s32(u[7], rnding);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]);
+  u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
+  u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  u[2] = vaddq_s32(v[0], x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(v[0], x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  u[6] = vaddq_s32(v[0], x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(v[0], x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = vnegq_s32(u[4]);
+    out[2] = u[6];
+    out[3] = vnegq_s32(u[2]);
+    out[4] = u[3];
+    out[5] = vnegq_s32(u[7]);
+    out[6] = u[5];
+    out[7] = vnegq_s32(u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+  }
+}
+
+static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-4
+  in[0] = vmlaq_n_s32(rnding, in[0], cospi[32]);
+  in[0] = vshlq_s32(in[0], v_bit);
+
+  // stage 5-7
+  if (!do_cols) {
+    log_range = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    if (out_shift != 0) {
+      int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+      in[0] = vaddq_s32(in[0], offset);
+      in[0] = vshlq_s32(in[0], vdupq_n_s32(-out_shift));
+    }
+  }
+
+  in[0] = vmaxq_s32(in[0], clamp_lo);
+  in[0] = vminq_s32(in[0], clamp_hi);
+  out[0] = in[0];
+  out[1] = in[0];
+  out[2] = in[0];
+  out[3] = in[0];
+  out[4] = in[0];
+  out[5] = in[0];
+  out[6] = in[0];
+  out[7] = in[0];
+  out[8] = in[0];
+  out[9] = in[0];
+  out[10] = in[0];
+  out[11] = in[0];
+  out[12] = in[0];
+  out[13] = in[0];
+  out[14] = in[0];
+  out[15] = in[0];
+}
+
+static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  int32x4_t u[16], x, y;
+  // stage 0-1
+  u[0] = in[0];
+  u[2] = in[4];
+  u[4] = in[2];
+  u[6] = in[6];
+  u[8] = in[1];
+  u[10] = in[5];
+  u[12] = in[3];
+  u[14] = in[7];
+
+  // stage 2
+  u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+  u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+
+  u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
+  u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
+
+  u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
+  u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
+
+  u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+  u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+
+  // stage 3
+  u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
+  u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
+  u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding);
+  u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding);
+
+  addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+  addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  x = vmlaq_n_s32(rnding, u[0], cospi[32]);
+  u[0] = vshlq_s32(x, v_bit);
+  u[1] = u[0];
+
+  u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
+  u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
+
+  addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+  x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+                             &rnding);
+  u[14] =
+      half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+  u[9] = x;
+  y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit,
+                             &rnding);
+  u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit,
+                                 &rnding);
+  u[10] = y;
+
+  // stage 5
+  addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+  x = vmulq_n_s32(u[5], cospi[32]);
+  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+  u[5] = vsubq_s32(y, x);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vaddq_s32(y, x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+  x = vmulq_n_s32(u[10], cospi[32]);
+  y = vmlaq_n_s32(rnding, u[13], cospi[32]);
+  u[10] = vsubq_s32(y, x);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[13] = vaddq_s32(x, y);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  x = vmulq_n_s32(u[11], cospi[32]);
+  y = vmlaq_n_s32(rnding, u[12], cospi[32]);
+  u[11] = vsubq_s32(y, x);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  u[12] = vaddq_s32(x, y);
+  u[12] = vshlq_s32(u[12], v_bit);
+  // stage 7
+  addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+  addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+  }
+}
+
+static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int32x4_t v[16], x, y, temp1, temp2;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0
+  // stage 1
+  // stage 2
+  v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
+  v[0] = vshlq_s32(v[0], v_bit);
+
+  v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
+  v[1] = vshlq_s32(v[1], v_bit);
+
+  // stage 3
+  v[8] = v[0];
+  v[9] = v[1];
+
+  // stage 4
+  temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]);
+  temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]);
+  temp1 = vshlq_s32(temp1, v_bit);
+
+  temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]);
+  temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]);
+  temp2 = vshlq_s32(temp2, v_bit);
+  v[8] = temp1;
+  v[9] = temp2;
+
+  // stage 5
+  v[4] = v[0];
+  v[5] = v[1];
+  v[12] = v[8];
+  v[13] = v[9];
+
+  // stage 6
+  temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]);
+  temp1 = vshlq_s32(temp1, v_bit);
+
+  temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]);
+  temp2 = vshlq_s32(temp2, v_bit);
+  v[4] = temp1;
+  v[5] = temp2;
+
+  temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]);
+  temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]);
+  temp1 = vshlq_s32(temp1, v_bit);
+
+  temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]);
+  temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]);
+  temp2 = vshlq_s32(temp2, v_bit);
+  v[12] = temp1;
+  v[13] = temp2;
+
+  // stage 7
+  v[2] = v[0];
+  v[3] = v[1];
+  v[6] = v[4];
+  v[7] = v[5];
+  v[10] = v[8];
+  v[11] = v[9];
+  v[14] = v[12];
+  v[15] = v[13];
+
+  // stage 8
+  y = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  v[2] = vaddq_s32(y, x);
+  v[2] = vshlq_s32(v[2], v_bit);
+
+  v[3] = vsubq_s32(y, x);
+  v[3] = vshlq_s32(v[3], v_bit);
+
+  y = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  v[6] = vaddq_s32(y, x);
+  v[6] = vshlq_s32(v[6], v_bit);
+
+  v[7] = vsubq_s32(y, x);
+  v[7] = vshlq_s32(v[7], v_bit);
+
+  y = vmlaq_n_s32(rnding, v[10], cospi[32]);
+  x = vmulq_n_s32(v[11], cospi[32]);
+  v[10] = vaddq_s32(y, x);
+  v[10] = vshlq_s32(v[10], v_bit);
+
+  v[11] = vsubq_s32(y, x);
+  v[11] = vshlq_s32(v[11], v_bit);
+
+  y = vmlaq_n_s32(rnding, v[14], cospi[32]);
+  x = vmulq_n_s32(v[15], cospi[32]);
+  v[14] = vaddq_s32(y, x);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vsubq_s32(y, x);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 9
+  if (do_cols) {
+    out[0] = v[0];
+    out[1] = vnegq_s32(v[8]);
+    out[2] = v[12];
+    out[3] = vnegq_s32(v[4]);
+    out[4] = v[6];
+    out[5] = vnegq_s32(v[14]);
+    out[6] = v[10];
+    out[7] = vnegq_s32(v[2]);
+    out[8] = v[3];
+    out[9] = vnegq_s32(v[11]);
+    out[10] = v[15];
+    out[11] = vnegq_s32(v[7]);
+    out[12] = v[5];
+    out[13] = vnegq_s32(v[13]);
+    out[14] = v[9];
+    out[15] = vnegq_s32(v[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+
+static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t u[16], x, y;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-2
+  u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
+  u[8] = vshlq_s32(u[8], v_bit);
+
+  u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
+  u[9] = vshlq_s32(u[9], v_bit);
+
+  u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  y = vmlaq_n_s32(rnding, u[8], cospi[56]);
+  u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
+  u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]);
+  u[8] = vshlq_s32(u[8], v_bit);
+
+  u[9] = vmlsq_n_s32(y, u[9], cospi[8]);
+  u[9] = vshlq_s32(u[9], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[10], cospi[24]);
+  u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
+  u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vmlsq_n_s32(y, u[11], cospi[40]);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[12], cospi[8]);
+  u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]);
+  u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlaq_n_s32(y, u[13], cospi[56]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[40]);
+  u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]);
+  u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(y, u[15], cospi[24]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  y = vmlaq_n_s32(rnding, u[4], cospi[48]);
+  u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlsq_n_s32(y, u[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[6], cospi[16]);
+  u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]);
+  u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(y, u[7], cospi[48]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[12], cospi[48]);
+  u[12] = vmulq_n_s32(u[12], cospi[16]);
+  u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlsq_n_s32(y, u[13], cospi[16]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[16]);
+  u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]);
+  u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(y, u[15], cospi[48]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 7
+  addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 8
+  y = vmlaq_n_s32(rnding, u[2], cospi[32]);
+  x = vmulq_n_s32(u[3], cospi[32]);
+  u[2] = vaddq_s32(y, x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(y, x);
+  u[3] = vshlq_s32(u[3], v_bit);
+  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+  x = vmulq_n_s32(u[7], cospi[32]);
+  u[6] = vaddq_s32(y, x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(y, x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[10], cospi[32]);
+  x = vmulq_n_s32(u[11], cospi[32]);
+  u[10] = vaddq_s32(y, x);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vsubq_s32(y, x);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[32]);
+  x = vmulq_n_s32(u[15], cospi[32]);
+  u[14] = vaddq_s32(y, x);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vsubq_s32(y, x);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 9
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = vsubq_s32(zero, u[8]);
+    out[2] = u[12];
+    out[3] = vsubq_s32(zero, u[4]);
+    out[4] = u[6];
+    out[5] = vsubq_s32(zero, u[14]);
+    out[6] = u[10];
+    out[7] = vsubq_s32(zero, u[2]);
+    out[8] = u[3];
+    out[9] = vsubq_s32(zero, u[11]);
+    out[10] = u[15];
+    out[11] = vsubq_s32(zero, u[7]);
+    out[12] = u[5];
+    out[13] = vsubq_s32(zero, u[13]);
+    out[14] = u[9];
+    out[15] = vsubq_s32(zero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+
+static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                           int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u[16], v[16], x, y;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  {
+    // stage 0-1
+    u[0] = in[0];
+    u[1] = in[8];
+    u[2] = in[4];
+    u[3] = in[12];
+    u[4] = in[2];
+    u[5] = in[10];
+    u[6] = in[6];
+    u[7] = in[14];
+    u[8] = in[1];
+    u[9] = in[9];
+    u[10] = in[5];
+    u[11] = in[13];
+    u[12] = in[3];
+    u[13] = in[11];
+    u[14] = in[7];
+    u[15] = in[15];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit,
+                                  &rnding);
+    v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit,
+                                  &rnding);
+    v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13],
+                                   &v_bit, &rnding);
+    v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12],
+                                   &v_bit, &rnding);
+    v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit,
+                            &rnding);
+    v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit,
+                            &rnding);
+    v[14] =
+        half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding);
+    v[15] =
+        half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding);
+
+    // stage 3
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit,
+                                  &rnding);
+    u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit,
+                                  &rnding);
+    u[6] =
+        half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding);
+    u[7] =
+        half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding);
+    addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    x = vmlaq_n_s32(rnding, u[0], cospi[32]);
+    y = vmulq_n_s32(u[1], cospi[32]);
+    v[0] = vaddq_s32(x, y);
+    v[0] = vshlq_s32(v[0], v_bit);
+
+    v[1] = vsubq_s32(x, y);
+    v[1] = vshlq_s32(v[1], v_bit);
+
+    v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit,
+                                  &rnding);
+    v[3] =
+        half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding);
+    addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+                                  &rnding);
+    v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
+                                   &v_bit, &rnding);
+    v[11] = u[11];
+    v[12] = u[12];
+    v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+                                   &v_bit, &rnding);
+    v[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+    v[15] = u[15];
+
+    // stage 5
+    addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+    u[4] = v[4];
+
+    x = vmulq_n_s32(v[5], cospi[32]);
+    y = vmlaq_n_s32(rnding, v[6], cospi[32]);
+    u[5] = vsubq_s32(y, x);
+    u[5] = vshlq_s32(u[5], v_bit);
+
+    u[6] = vaddq_s32(y, x);
+    u[6] = vshlq_s32(u[6], v_bit);
+
+    u[7] = v[7];
+    addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+    addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+    addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+    addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = vmulq_n_s32(u[10], cospi[32]);
+    y = vmlaq_n_s32(rnding, u[13], cospi[32]);
+    v[10] = vsubq_s32(y, x);
+    v[10] = vshlq_s32(v[10], v_bit);
+
+    v[13] = vaddq_s32(x, y);
+    v[13] = vshlq_s32(v[13], v_bit);
+
+    x = vmulq_n_s32(u[11], cospi[32]);
+    y = vmlaq_n_s32(rnding, u[12], cospi[32]);
+    v[11] = vsubq_s32(y, x);
+    v[11] = vshlq_s32(v[11], v_bit);
+
+    v[12] = vaddq_s32(x, y);
+    v[12] = vshlq_s32(v[12], v_bit);
+
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 7
+    addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+    if (!do_cols) {
+      const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+      const int32x4_t clamp_hi_out =
+          vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8(out, out_shift, &rnding_shift);
+      highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+    }
+  }
+}
+
+static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t zero = vdupq_n_s32(0);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  int32x4_t u[16], v[16], x, y;
+  // Calculate the column 0, 1, 2, 3
+  // stage 0
+  // stage 1
+  // stage 2
+  v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]);
+  v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]);
+  v[0] = vshlq_s32(v[0], v_bit);
+
+  v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]);
+  v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]);
+  v[1] = vshlq_s32(v[1], v_bit);
+
+  v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]);
+  v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]);
+  v[2] = vshlq_s32(v[2], v_bit);
+
+  v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]);
+  v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]);
+  v[3] = vshlq_s32(v[3], v_bit);
+
+  v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]);
+  v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]);
+  v[4] = vshlq_s32(v[4], v_bit);
+
+  v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]);
+  v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]);
+  v[5] = vshlq_s32(v[5], v_bit);
+
+  v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]);
+  v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]);
+  v[6] = vshlq_s32(v[6], v_bit);
+
+  v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]);
+  v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]);
+  v[7] = vshlq_s32(v[7], v_bit);
+
+  v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
+  v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]);
+  v[8] = vshlq_s32(v[8], v_bit);
+
+  v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
+  v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]);
+  v[9] = vshlq_s32(v[9], v_bit);
+
+  v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
+  v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]);
+  v[10] = vshlq_s32(v[10], v_bit);
+
+  v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
+  v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]);
+  v[11] = vshlq_s32(v[11], v_bit);
+
+  v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
+  v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]);
+  v[12] = vshlq_s32(v[12], v_bit);
+
+  v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
+  v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]);
+  v[13] = vshlq_s32(v[13], v_bit);
+
+  v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
+  v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
+  v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 3
+  addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+  addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+  addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
+  v[4] = u[4];
+  v[5] = u[5];
+  v[6] = u[6];
+  v[7] = u[7];
+
+  v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
+  v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]);
+  v[8] = vshlq_s32(v[8], v_bit);
+
+  v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]);
+  v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]);
+  v[9] = vshlq_s32(v[9], v_bit);
+
+  v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
+  v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]);
+  v[10] = vshlq_s32(v[10], v_bit);
+
+  v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]);
+  v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]);
+  v[11] = vshlq_s32(v[11], v_bit);
+
+  v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]);
+  v[12] = vmlaq_n_s32(v[12], u[13], cospi[8]);
+  v[12] = vshlq_s32(v[12], v_bit);
+
+  v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]);
+  v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]);
+  v[13] = vshlq_s32(v[13], v_bit);
+
+  v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]);
+  v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]);
+  v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 5
+  addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+  addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
+
+  v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
+  v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]);
+  v[4] = vshlq_s32(v[4], v_bit);
+
+  v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]);
+  v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]);
+  v[5] = vshlq_s32(v[5], v_bit);
+
+  v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]);
+  v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]);
+  v[6] = vshlq_s32(v[6], v_bit);
+
+  v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]);
+  v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]);
+  v[7] = vshlq_s32(v[7], v_bit);
+
+  v[8] = u[8];
+  v[9] = u[9];
+  v[10] = u[10];
+  v[11] = u[11];
+
+  v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]);
+  v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]);
+  v[12] = vshlq_s32(v[12], v_bit);
+
+  v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]);
+  v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]);
+  v[13] = vshlq_s32(v[13], v_bit);
+
+  v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]);
+  v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]);
+  v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 7
+  addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+  addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+  addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 8
+  v[0] = u[0];
+  v[1] = u[1];
+
+  y = vmlaq_n_s32(rnding, u[2], cospi[32]);
+  x = vmulq_n_s32(u[3], cospi[32]);
+  v[2] = vaddq_s32(y, x);
+  v[2] = vshlq_s32(v[2], v_bit);
+
+  v[3] = vsubq_s32(y, x);
+  v[3] = vshlq_s32(v[3], v_bit);
+
+  v[4] = u[4];
+  v[5] = u[5];
+
+  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+  x = vmulq_n_s32(u[7], cospi[32]);
+  v[6] = vaddq_s32(y, x);
+  v[6] = vshlq_s32(v[6], v_bit);
+
+  v[7] = vsubq_s32(y, x);
+  v[7] = vshlq_s32(v[7], v_bit);
+
+  v[8] = u[8];
+  v[9] = u[9];
+
+  y = vmlaq_n_s32(rnding, u[10], cospi[32]);
+  x = vmulq_n_s32(u[11], cospi[32]);
+  v[10] = vaddq_s32(y, x);
+  v[10] = vshlq_s32(v[10], v_bit);
+
+  v[11] = vsubq_s32(y, x);
+  v[11] = vshlq_s32(v[11], v_bit);
+
+  v[12] = u[12];
+  v[13] = u[13];
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[32]);
+  x = vmulq_n_s32(u[15], cospi[32]);
+  v[14] = vaddq_s32(y, x);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vsubq_s32(y, x);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 9
+  if (do_cols) {
+    out[0] = v[0];
+    out[1] = vsubq_s32(zero, v[8]);
+    out[2] = v[12];
+    out[3] = vsubq_s32(zero, v[4]);
+    out[4] = v[6];
+    out[5] = vsubq_s32(zero, v[14]);
+    out[6] = v[10];
+    out[7] = vsubq_s32(zero, v[2]);
+    out[8] = v[3];
+    out[9] = vsubq_s32(zero, v[11]);
+    out[10] = v[15];
+    out[11] = vsubq_s32(zero, v[7]);
+    out[12] = v[5];
+    out[13] = vsubq_s32(zero, v[13]);
+    out[14] = v[9];
+    out[15] = vsubq_s32(zero, v[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                             int do_cols, int bd, int out_shift) {
+  (void)bit;
+  int32x2_t fact = vdup_n_s32(2 * NewSqrt2);
+  int32x4x2_t a0;
+  int32x4_t zero = vdupq_n_s32(0);
+  const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
+  for (int i = 0; i < 16; i++) {
+    a0.val[0] = vreinterpretq_s32_s64(
+        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
+    a0.val[0] = vreinterpretq_s32_s64(
+        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
+    a0.val[1] = vextq_s32(in[i], zero, 1);
+    a0.val[1] = vreinterpretq_s32_s64(
+        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
+    a0.val[1] = vreinterpretq_s32_s64(
+        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
+    a0 = vzipq_s32(a0.val[0], a0.val[1]);
+#if defined(__aarch64__)
+    out[i] = vreinterpretq_s32_s64(vzip1q_s64(
+        vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
+#else
+    out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
+#endif
+  }
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 16);
+  }
+}
+static INLINE void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int i;
+  int32x4_t temp1, temp2, temp3, temp4;
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit,
+                                 rnding);
+  u[13] =
+      half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding);
+  u[10] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit,
+                                 rnding);
+  u[12] =
+      half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding);
+  u[11] = temp2;
+
+  for (i = 16; i < 20; ++i) {
+    addsub_neon(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+    addsub_neon(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
+  }
+
+  temp1 = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], v_bit,
+                                 rnding);
+  u[56] =
+      half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], v_bit, rnding);
+  u[57] =
+      half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], v_bit, rnding);
+  u[58] =
+      half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], v_bit, rnding);
+  u[59] =
+      half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], v_bit, rnding);
+  u[36] = temp1;
+  u[37] = temp2;
+  u[38] = temp3;
+  u[39] = temp4;
+
+  temp1 = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], v_bit,
+                                 rnding);
+  u[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], v_bit,
+                                 rnding);
+  u[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], v_bit,
+                                 rnding);
+  u[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], v_bit,
+                                 rnding);
+  u[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], v_bit,
+                                 rnding);
+  u[40] = temp1;
+  u[41] = temp2;
+  u[42] = temp3;
+  u[43] = temp4;
+}
+
+static INLINE void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int i;
+  int32x4_t temp1, temp2, temp3, temp4;
+  for (i = 0; i < 8; ++i) {
+    addsub_neon(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+  }
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit,
+                                 rnding);
+  u[24] =
+      half_btf_neon_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding);
+  u[25] =
+      half_btf_neon_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding);
+  u[26] =
+      half_btf_neon_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding);
+  u[27] =
+      half_btf_neon_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding);
+  u[20] = temp1;
+  u[21] = temp2;
+  u[22] = temp3;
+  u[23] = temp4;
+  for (i = 32; i < 40; i++) {
+    addsub_neon(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+  }
+
+  for (i = 48; i < 56; i++) {
+    addsub_neon(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+  }
+}
+
+static INLINE void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi,
+                                       const int32x4_t *v_bit,
+                                       const int32x4_t *rnding) {
+  int32x4_t temp1, temp2, temp3, temp4;
+  for (int i = 0; i < 16; i++) {
+    addsub_neon(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
+  }
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit,
+                                 rnding);
+  u[52] =
+      half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding);
+  u[53] =
+      half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding);
+  u[54] =
+      half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding);
+  u[55] =
+      half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding);
+  u[40] = temp1;
+  u[41] = temp2;
+  u[42] = temp3;
+  u[43] = temp4;
+
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit,
+                                 rnding);
+  u[48] =
+      half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding);
+  u[49] =
+      half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding);
+  u[50] =
+      half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding);
+  u[51] =
+      half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding);
+  u[44] = temp1;
+  u[45] = temp2;
+  u[46] = temp3;
+  u[47] = temp4;
+}
+
+static INLINE void idct64_stage11_neon(int32x4_t *u, int32x4_t *out,
+                                       int do_cols, int bd, int out_shift,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi) {
+  for (int i = 0; i < 32; i++) {
+    addsub_neon(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
+  }
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t rnding = vdupq_n_s32(1 << (out_shift - 1));
+    for (int i = 0; i < 64; i += 4) {
+      round_shift_4x4(out + i, out_shift, &rnding);
+      highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4);
+    }
+  }
+}
+
+static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  {
+    int32x4_t x;
+
+    // stage 1
+    // stage 2
+    // stage 3
+    // stage 4
+    // stage 5
+    // stage 6
+    x = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding);
+
+    // stage 8
+    // stage 9
+    // stage 10
+    // stage 11
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
+      clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+      if (out_shift != 0) {
+        int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+        x = vaddq_s32(x, offset);
+        x = vshlq_s32(x, vdupq_n_s32(-out_shift));
+      }
+    }
+    x = vmaxq_s32(x, clamp_lo);
+    x = vminq_s32(x, clamp_hi);
+    out[0] = x;
+    out[1] = x;
+    out[2] = x;
+    out[3] = x;
+    out[4] = x;
+    out[5] = x;
+    out[6] = x;
+    out[7] = x;
+    out[8] = x;
+    out[9] = x;
+    out[10] = x;
+    out[11] = x;
+    out[12] = x;
+    out[13] = x;
+    out[14] = x;
+    out[15] = x;
+    out[16] = x;
+    out[17] = x;
+    out[18] = x;
+    out[19] = x;
+    out[20] = x;
+    out[21] = x;
+    out[22] = x;
+    out[23] = x;
+    out[24] = x;
+    out[25] = x;
+    out[26] = x;
+    out[27] = x;
+    out[28] = x;
+    out[29] = x;
+    out[30] = x;
+    out[31] = x;
+    out[32] = x;
+    out[33] = x;
+    out[34] = x;
+    out[35] = x;
+    out[36] = x;
+    out[37] = x;
+    out[38] = x;
+    out[39] = x;
+    out[40] = x;
+    out[41] = x;
+    out[42] = x;
+    out[43] = x;
+    out[44] = x;
+    out[45] = x;
+    out[46] = x;
+    out[47] = x;
+    out[48] = x;
+    out[49] = x;
+    out[50] = x;
+    out[51] = x;
+    out[52] = x;
+    out[53] = x;
+    out[54] = x;
+    out[55] = x;
+    out[56] = x;
+    out[57] = x;
+    out[58] = x;
+    out[59] = x;
+    out[60] = x;
+    out[61] = x;
+    out[62] = x;
+    out[63] = x;
+  }
+}
+
+static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  {
+    int32x4_t u[64];
+
+    // stage 1
+    u[0] = in[0];
+    u[8] = in[4];
+    u[16] = in[2];
+    u[24] = in[6];
+    u[32] = in[1];
+    u[40] = in[5];
+    u[48] = in[3];
+    u[56] = in[7];
+
+    // stage 2
+    u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+    u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+    u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+    u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+    u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+    u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+    u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+    u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+
+    // stage 3
+    u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
+    u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
+    u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
+    u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
+    u[33] = u[32];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[62] = u[63];
+
+    // stage 4
+    int32x4_t temp1, temp2;
+    u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+    u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+    u[17] = u[16];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[30] = u[31];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
+                                   &v_bit, &rnding);
+    u[62] =
+        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+    u[33] = temp1;
+
+    temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+                                   &v_bit, &rnding);
+    u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+                                   &v_bit, &rnding);
+    u[57] = temp2;
+
+    temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+                                   &v_bit, &rnding);
+    u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+                            &rnding);
+    u[41] = temp1;
+
+    temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+                                   &v_bit, &rnding);
+    u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+                                   &v_bit, &rnding);
+    u[46] = temp2;
+
+    // stage 5
+    u[9] = u[8];
+    u[14] = u[15];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30],
+                                   &v_bit, &rnding);
+    u[30] =
+        half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
+    u[17] = temp1;
+
+    temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
+                                   &v_bit, &rnding);
+    u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
+                                   &v_bit, &rnding);
+    u[22] = temp2;
+
+    u[35] = u[32];
+    u[34] = u[33];
+    u[36] = u[39];
+    u[37] = u[38];
+    u[43] = u[40];
+    u[42] = u[41];
+    u[44] = u[47];
+    u[45] = u[46];
+    u[51] = u[48];
+    u[50] = u[49];
+    u[52] = u[55];
+    u[53] = u[54];
+    u[59] = u[56];
+    u[58] = u[57];
+    u[60] = u[63];
+    u[61] = u[62];
+
+    // stage 6
+    temp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[0] = temp1;
+
+    temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14],
+                                   &v_bit, &rnding);
+    u[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+    u[9] = temp2;
+    u[19] = u[16];
+    u[18] = u[17];
+    u[20] = u[23];
+    u[21] = u[22];
+    u[27] = u[24];
+    u[26] = u[25];
+    u[28] = u[31];
+    u[29] = u[30];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
+                                   &v_bit, &rnding);
+    u[61] =
+        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+    u[34] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
+                                   &v_bit, &rnding);
+    u[60] =
+        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+    u[35] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
+                                   &v_bit, &rnding);
+    u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+                                   &v_bit, &rnding);
+    u[36] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
+                                   &v_bit, &rnding);
+    u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+                                   &v_bit, &rnding);
+    u[37] = temp2;
+    temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+                                   &v_bit, &rnding);
+    u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+                            &rnding);
+    u[42] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+                                   &v_bit, &rnding);
+    u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+                            &rnding);
+    u[43] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+                                   &v_bit, &rnding);
+    u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+                                   &v_bit, &rnding);
+    u[44] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+                                   &v_bit, &rnding);
+    u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+                                   &v_bit, &rnding);
+    u[45] = temp2;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    u[11] = u[8];
+    u[10] = u[9];
+    u[12] = u[15];
+    u[13] = u[14];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
+                                   &v_bit, &rnding);
+    u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
+                            &rnding);
+    u[18] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
+                                   &v_bit, &rnding);
+    u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
+                            &rnding);
+    u[19] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
+                                   &v_bit, &rnding);
+    u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
+                                   &v_bit, &rnding);
+    u[20] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
+                                   &v_bit, &rnding);
+    u[21] = temp2;
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                    &clamp_hi);
+      }
+    }
+
+    // stage 8
+    u[7] = u[0];
+    u[6] = u[1];
+    u[5] = u[2];
+    u[4] = u[3];
+    u[9] = u[9];
+
+    idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 9
+    idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 10
+    idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 11
+    idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+  }
+}
+
+static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  {
+    int32x4_t u[64];
+    int32x4_t tmp1, tmp2, tmp3, tmp4;
+    // stage 1
+    u[0] = in[0];
+    u[32] = in[1];
+    u[36] = in[9];
+    u[40] = in[5];
+    u[44] = in[13];
+    u[48] = in[3];
+    u[52] = in[11];
+    u[56] = in[7];
+    u[60] = in[15];
+    u[16] = in[2];
+    u[20] = in[10];
+    u[24] = in[6];
+    u[28] = in[14];
+    u[4] = in[8];
+    u[8] = in[4];
+    u[12] = in[12];
+
+    // stage 2
+    u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+    u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+    u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
+    u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
+    u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
+    u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
+    u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+    u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+    u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+    u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+    u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
+    u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
+    u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+    u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+    u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
+    u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
+
+    // stage 3
+    u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
+    u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
+    u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding);
+    u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding);
+    u[27] = half_btf_0_neon_r(&cospi[10], &u[20], &v_bit, &rnding);
+    u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding);
+    u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
+    u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
+    u[33] = u[32];
+    u[34] = u[35];
+    u[37] = u[36];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[42] = u[43];
+    u[45] = u[44];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[50] = u[51];
+    u[53] = u[52];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[58] = u[59];
+    u[61] = u[60];
+    u[62] = u[63];
+
+    // stage 4
+    u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+    u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+    u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+    u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+
+    u[17] = u[16];
+    u[18] = u[19];
+    u[21] = u[20];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[26] = u[27];
+    u[29] = u[28];
+    u[30] = u[31];
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit,
+                                  &rnding);
+    tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit,
+                                  &rnding);
+    tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
+                                  &v_bit, &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+                                  &v_bit, &rnding);
+    u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+                                   &v_bit, &rnding);
+    u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
+                            &rnding);
+    u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
+                                   &v_bit, &rnding);
+    u[62] =
+        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+    u[33] = tmp1;
+    u[34] = tmp2;
+    u[37] = tmp3;
+    u[38] = tmp4;
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+                                  &v_bit, &rnding);
+    tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
+                                  &v_bit, &rnding);
+    tmp3 = half_btf_neon_r(&cospi[52], &u[45], &cospi[12], &u[50], &v_bit,
+                           &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+                                  &v_bit, &rnding);
+    u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+                                   &v_bit, &rnding);
+    u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
+                            &rnding);
+    u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
+                                   &v_bit, &rnding);
+    u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+                            &rnding);
+    u[41] = tmp1;
+    u[42] = tmp2;
+    u[45] = tmp3;
+    u[46] = tmp4;
+
+    // stage 5
+    u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
+    u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
+
+    u[9] = u[8];
+    u[10] = u[11];
+    u[13] = u[12];
+    u[14] = u[15];
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit,
+                                  &rnding);
+    tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit,
+                                  &rnding);
+    tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26],
+                                  &v_bit, &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
+                                  &v_bit, &rnding);
+    u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit,
+                            &rnding);
+    u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29],
+                                   &v_bit, &rnding);
+    u[30] =
+        half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
+    u[17] = tmp1;
+    u[18] = tmp2;
+    u[21] = tmp3;
+    u[22] = tmp4;
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+
+      addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    // stage 6
+    tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[0] = tmp1;
+    u[5] = u[4];
+    u[6] = u[7];
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+                                  &rnding);
+    u[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+    u[9] = tmp1;
+    tmp2 = half_btf_neon_mode01_r(&cospi[48], &u[10], &cospi[16], &u[13],
+                                  &v_bit, &rnding);
+    u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+                                   &v_bit, &rnding);
+    u[10] = tmp2;
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+
+      addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit,
+                                  &rnding);
+    tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit,
+                                  &rnding);
+    tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit,
+                                  &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit,
+                                  &rnding);
+    u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+                                   &v_bit, &rnding);
+    u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+                                   &v_bit, &rnding);
+    u[60] =
+        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+    u[61] =
+        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+    u[34] = tmp1;
+    u[35] = tmp2;
+    u[36] = tmp3;
+    u[37] = tmp4;
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+                                  &v_bit, &rnding);
+    tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+                                  &v_bit, &rnding);
+    tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+                                  &v_bit, &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+                                  &v_bit, &rnding);
+    u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+                                   &v_bit, &rnding);
+    u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+                                   &v_bit, &rnding);
+    u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+                            &rnding);
+    u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+                            &rnding);
+    u[42] = tmp1;
+    u[43] = tmp2;
+    u[44] = tmp3;
+    u[45] = tmp4;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit,
+                                  &rnding);
+    u[6] =
+        half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding);
+    u[5] = tmp1;
+    addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
+                                  &v_bit, &rnding);
+    tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
+                                  &v_bit, &rnding);
+    tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
+                                  &v_bit, &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
+                                  &v_bit, &rnding);
+    u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
+                                   &v_bit, &rnding);
+    u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
+                                   &v_bit, &rnding);
+    u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
+                            &rnding);
+    u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
+                            &rnding);
+    u[18] = tmp1;
+    u[19] = tmp2;
+    u[20] = tmp3;
+    u[21] = tmp4;
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                    &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_neon(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 9
+    idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 10
+    idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 11
+    idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+  }
+}
+
+static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                           int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+
+  {
+    int32x4_t u[64], v[64];
+
+    // stage 1
+    u[32] = in[1];
+    u[34] = in[17];
+    u[36] = in[9];
+    u[38] = in[25];
+    u[40] = in[5];
+    u[42] = in[21];
+    u[44] = in[13];
+    u[46] = in[29];
+    u[48] = in[3];
+    u[50] = in[19];
+    u[52] = in[11];
+    u[54] = in[27];
+    u[56] = in[7];
+    u[58] = in[23];
+    u[60] = in[15];
+    u[62] = in[31];
+
+    v[16] = in[2];
+    v[18] = in[18];
+    v[20] = in[10];
+    v[22] = in[26];
+    v[24] = in[6];
+    v[26] = in[22];
+    v[28] = in[14];
+    v[30] = in[30];
+
+    u[8] = in[4];
+    u[10] = in[20];
+    u[12] = in[12];
+    u[14] = in[28];
+
+    v[4] = in[8];
+    v[6] = in[24];
+
+    u[0] = in[0];
+    u[2] = in[16];
+
+    // stage 2
+    v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+    v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding);
+    v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding);
+    v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
+    v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
+    v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding);
+    v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding);
+    v[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+    v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+    v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding);
+    v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding);
+    v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
+    v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
+    v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding);
+    v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding);
+    v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+    v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+    v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding);
+    v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding);
+    v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
+    v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
+    v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding);
+    v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding);
+    v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+    v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+    v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding);
+    v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding);
+    v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
+    v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
+    v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding);
+    v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding);
+    v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+
+    // stage 3
+    u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding);
+    u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding);
+    u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding);
+    u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding);
+    u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding);
+    u[21] = half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding);
+    u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding);
+    u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding);
+    u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding);
+    u[25] = half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding);
+    u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding);
+    u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding);
+    u[28] = half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding);
+    u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding);
+    u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding);
+    u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding);
+
+    for (i = 32; i < 64; i += 4) {
+      addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    // stage 4
+    v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+    v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
+    v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
+    v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+    v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+    v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
+    v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
+    v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+
+    for (i = 16; i < 32; i += 4) {
+      addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
+                                   &v_bit, &rnding);
+    v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61],
+                                   &v_bit, &rnding);
+    v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
+                                   &v_bit, &rnding);
+    v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+                                   &v_bit, &rnding);
+    v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+                                   &v_bit, &rnding);
+    v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
+                                   &v_bit, &rnding);
+    v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50],
+                                   &v_bit, &rnding);
+    v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+                                   &v_bit, &rnding);
+    v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+                                   &v_bit, &rnding);
+    v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
+                            &rnding);
+    v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
+                                   &v_bit, &rnding);
+    v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+                            &rnding);
+    v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+                                   &v_bit, &rnding);
+    v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
+                            &rnding);
+    v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
+                                   &v_bit, &rnding);
+    v[62] =
+        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+
+    // stage 5
+    u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding);
+    u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding);
+    u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding);
+    u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding);
+
+    for (i = 8; i < 16; i += 4) {
+      addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    for (i = 16; i < 32; i += 4) {
+      u[i + 0] = v[i + 0];
+      u[i + 3] = v[i + 3];
+    }
+
+    u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], &cospi[56], &v[30],
+                                   &v_bit, &rnding);
+    u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29],
+                                   &v_bit, &rnding);
+    u[21] = half_btf_neon_mode10_r(&cospi[40], &v[21], &cospi[24], &v[26],
+                                   &v_bit, &rnding);
+    u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25],
+                                   &v_bit, &rnding);
+    u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], &v[25],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit,
+                            &rnding);
+    u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29],
+                                   &v_bit, &rnding);
+    u[30] =
+        half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding);
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+
+      addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    // stage 6
+    v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
+    v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
+
+    addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+    for (i = 8; i < 16; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+                                  &rnding);
+    v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
+                                   &v_bit, &rnding);
+    v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+                                   &v_bit, &rnding);
+    v[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+                  &clamp_hi);
+
+      addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 8) {
+      v[i + 0] = u[i + 0];
+      v[i + 1] = u[i + 1];
+      v[i + 6] = u[i + 6];
+      v[i + 7] = u[i + 7];
+    }
+
+    v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
+                                   &v_bit, &rnding);
+    v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
+                                   &v_bit, &rnding);
+    v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
+                                   &v_bit, &rnding);
+    v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
+                                   &v_bit, &rnding);
+    v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+                                   &v_bit, &rnding);
+    v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+                                   &v_bit, &rnding);
+    v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+                                   &v_bit, &rnding);
+    v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+                                   &v_bit, &rnding);
+    v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+                                   &v_bit, &rnding);
+    v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+                                   &v_bit, &rnding);
+    v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+                            &rnding);
+    v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+                            &rnding);
+    v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+                                   &v_bit, &rnding);
+    v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+                                   &v_bit, &rnding);
+    v[60] =
+        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+    v[61] =
+        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+
+    // stage 7
+    addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    u[4] = v[4];
+    u[7] = v[7];
+    u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit,
+                                  &rnding);
+    u[6] =
+        half_btf_neon_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, &rnding);
+
+    addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    for (i = 16; i < 32; i += 8) {
+      u[i + 0] = v[i + 0];
+      u[i + 1] = v[i + 1];
+      u[i + 6] = v[i + 6];
+      u[i + 7] = v[i + 7];
+    }
+
+    u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29],
+                                   &v_bit, &rnding);
+    u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28],
+                                   &v_bit, &rnding);
+    u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27],
+                                   &v_bit, &rnding);
+    u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26],
+                                   &v_bit, &rnding);
+    u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27],
+                                   &v_bit, &rnding);
+    u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit,
+                            &rnding);
+    u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit,
+                            &rnding);
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                    &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[14] = u[14];
+    v[15] = u[15];
+
+    v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13],
+                                   &v_bit, &rnding);
+    v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12],
+                                   &v_bit, &rnding);
+    v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit,
+                            &rnding);
+    v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit,
+                            &rnding);
+
+    for (i = 16; i < 20; ++i) {
+      addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+      addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    for (i = 32; i < 36; ++i) {
+      v[i] = u[i];
+      v[i + 12] = u[i + 12];
+      v[i + 16] = u[i + 16];
+      v[i + 28] = u[i + 28];
+    }
+
+    v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59],
+                                   &v_bit, &rnding);
+    v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58],
+                                   &v_bit, &rnding);
+    v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57],
+                                   &v_bit, &rnding);
+    v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56],
+                                   &v_bit, &rnding);
+    v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55],
+                                   &v_bit, &rnding);
+    v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54],
+                                   &v_bit, &rnding);
+    v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53],
+                                   &v_bit, &rnding);
+    v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52],
+                                   &v_bit, &rnding);
+    v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52],
+                                   &v_bit, &rnding);
+    v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53],
+                                   &v_bit, &rnding);
+    v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54],
+                                   &v_bit, &rnding);
+    v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55],
+                                   &v_bit, &rnding);
+    v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit,
+                            &rnding);
+    v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit,
+                            &rnding);
+    v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit,
+                            &rnding);
+    v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit,
+                            &rnding);
+
+    // stage 9
+    for (i = 0; i < 8; ++i) {
+      addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 16; i < 20; ++i) {
+      u[i] = v[i];
+      u[i + 12] = v[i + 12];
+    }
+
+    u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27],
+                                   &v_bit, &rnding);
+    u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26],
+                                   &v_bit, &rnding);
+    u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25],
+                                   &v_bit, &rnding);
+    u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24],
+                                   &v_bit, &rnding);
+    u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit,
+                            &rnding);
+    u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit,
+                            &rnding);
+    u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit,
+                            &rnding);
+    u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit,
+                            &rnding);
+
+    for (i = 32; i < 40; i++) {
+      addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 48; i < 56; i++) {
+      addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+    }
+
+    // stage 10
+    for (i = 0; i < 16; i++) {
+      addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 32; i < 40; i++) v[i] = u[i];
+
+    v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55],
+                                   &v_bit, &rnding);
+    v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54],
+                                   &v_bit, &rnding);
+    v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53],
+                                   &v_bit, &rnding);
+    v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52],
+                                   &v_bit, &rnding);
+    v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51],
+                                   &v_bit, &rnding);
+    v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50],
+                                   &v_bit, &rnding);
+    v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49],
+                                   &v_bit, &rnding);
+    v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48],
+                                   &v_bit, &rnding);
+    v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit,
+                            &rnding);
+    v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit,
+                            &rnding);
+    v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit,
+                            &rnding);
+    v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit,
+                            &rnding);
+    v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit,
+                            &rnding);
+    v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit,
+                            &rnding);
+    v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit,
+                            &rnding);
+    v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit,
+                            &rnding);
+
+    for (i = 56; i < 64; i++) v[i] = u[i];
+
+    // stage 11
+    for (i = 0; i < 32; i++) {
+      addsub_neon(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+      const int32x4_t clamp_hi_out =
+          vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+      const int32x4_t rnding32 = vdupq_n_s32(1 << (out_shift - 1));
+      for (i = 0; i < 64; i += 4) {
+        round_shift_4x4(out + i, out_shift, &rnding32);
+        highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
+                              4);
+      }
+    }
+  }
+}
+
+static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t bf1;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-1
+  bf1 = in[0];
+
+  // stage 2-5
+  bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding);
+
+  // stage 6-9
+  if (do_cols) {
+    bf1 = vmaxq_s32(bf1, clamp_lo);
+    bf1 = vminq_s32(bf1, clamp_hi);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    if (out_shift != 0) {
+      bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift));
+    }
+  }
+
+  bf1 = vmaxq_s32(bf1, clamp_lo);
+  bf1 = vminq_s32(bf1, clamp_hi);
+
+  for (int i = 0; i < 32; i++) out[i] = bf1;
+}
+
+static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t bf1[32];
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0-1
+  bf1[0] = in[0];
+  bf1[4] = in[4];
+  bf1[8] = in[2];
+  bf1[12] = in[6];
+  bf1[16] = in[1];
+  bf1[20] = in[5];
+  bf1[24] = in[3];
+  bf1[28] = in[7];
+
+  // stage 2
+  bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
+  bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
+  bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
+  bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
+  bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
+  bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
+  bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
+  bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
+
+  // stage 3
+  bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
+  bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
+
+  bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
+  bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
+  bf1[17] = bf1[16];
+  bf1[18] = bf1[19];
+  bf1[21] = bf1[20];
+  bf1[22] = bf1[23];
+  bf1[25] = bf1[24];
+  bf1[26] = bf1[27];
+  bf1[29] = bf1[28];
+  bf1[30] = bf1[31];
+
+  // stage 4 :
+  bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
+  bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
+
+  bf1[9] = bf1[8];
+  bf1[10] = bf1[11];
+  bf1[13] = bf1[12];
+  bf1[14] = bf1[15];
+
+  idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
+
+  // stage 5
+  bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
+  bf1[1] = bf1[0];
+  bf1[5] = bf1[4];
+  bf1[6] = bf1[7];
+
+  idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 6
+  bf1[3] = bf1[0];
+  bf1[2] = bf1[1];
+
+  idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 7
+  idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 8
+  idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 9
+  idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t bf1[32];
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0-1
+
+  bf1[0] = in[0];
+  bf1[2] = in[8];
+  bf1[4] = in[4];
+  bf1[6] = in[12];
+  bf1[8] = in[2];
+  bf1[10] = in[10];
+  bf1[12] = in[6];
+  bf1[14] = in[14];
+  bf1[16] = in[1];
+  bf1[18] = in[9];
+  bf1[20] = in[5];
+  bf1[22] = in[13];
+  bf1[24] = in[3];
+  bf1[26] = in[11];
+  bf1[28] = in[7];
+  bf1[30] = in[15];
+
+  // stage 2
+  bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
+  bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
+  bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding);
+  bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding);
+  bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding);
+  bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding);
+  bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
+  bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
+  bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
+  bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
+  bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding);
+  bf1[26] = half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding);
+  bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding);
+  bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding);
+  bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
+  bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
+
+  // stage 3
+  bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
+  bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
+  bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding);
+  bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding);
+  bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding);
+  bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding);
+  bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
+  bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
+
+  addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+  // stage 4
+  bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
+  bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
+  bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding);
+  bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding);
+
+  addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+  idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
+
+  // stage 5
+  bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
+  bf1[1] = bf1[0];
+  bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, &rnding);
+  bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding);
+
+  addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+  idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 6
+  addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+  idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 7
+  idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 8
+  idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+  // stage 9
+  idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                           int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t bf1[32], bf0[32];
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0
+  // stage 1
+  bf1[0] = in[0];
+  bf1[1] = in[16];
+  bf1[2] = in[8];
+  bf1[3] = in[24];
+  bf1[4] = in[4];
+  bf1[5] = in[20];
+  bf1[6] = in[12];
+  bf1[7] = in[28];
+  bf1[8] = in[2];
+  bf1[9] = in[18];
+  bf1[10] = in[10];
+  bf1[11] = in[26];
+  bf1[12] = in[6];
+  bf1[13] = in[22];
+  bf1[14] = in[14];
+  bf1[15] = in[30];
+  bf1[16] = in[1];
+  bf1[17] = in[17];
+  bf1[18] = in[9];
+  bf1[19] = in[25];
+  bf1[20] = in[5];
+  bf1[21] = in[21];
+  bf1[22] = in[13];
+  bf1[23] = in[29];
+  bf1[24] = in[3];
+  bf1[25] = in[19];
+  bf1[26] = in[11];
+  bf1[27] = in[27];
+  bf1[28] = in[7];
+  bf1[29] = in[23];
+  bf1[30] = in[15];
+  bf1[31] = in[31];
+
+  // stage 2
+  for (int i = 0; i < 16; i++) bf0[i] = bf1[i];
+
+  bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31],
+                                   &v_bit, &rnding);
+  bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30],
+                                   &v_bit, &rnding);
+  bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29],
+                                   &v_bit, &rnding);
+  bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28],
+                                   &v_bit, &rnding);
+  bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27],
+                                   &v_bit, &rnding);
+  bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25],
+                                   &v_bit, &rnding);
+  bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24],
+                                   &v_bit, &rnding);
+  bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit,
+                            &rnding);
+  bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit,
+                            &rnding);
+  bf0[26] = half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit,
+                            &rnding);
+  bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit,
+                            &rnding);
+  bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit,
+                            &rnding);
+  bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit,
+                            &rnding);
+  bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit,
+                            &rnding);
+  bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit,
+                            &rnding);
+
+  // stage 3
+  for (int i = 0; i < 8; i++) bf1[i] = bf0[i];
+
+  bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15],
+                                  &v_bit, &rnding);
+  bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14],
+                                  &v_bit, &rnding);
+  bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13],
+                                   &v_bit, &rnding);
+  bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12],
+                                   &v_bit, &rnding);
+  bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit,
+                            &rnding);
+  bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit,
+                            &rnding);
+  bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit,
+                            &rnding);
+  bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit,
+                            &rnding);
+
+  addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+  // stage 4
+  bf0[0] = bf1[0];
+  bf0[1] = bf1[1];
+  bf0[2] = bf1[2];
+  bf0[3] = bf1[3];
+  bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7],
+                                  &v_bit, &rnding);
+  bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6],
+                                  &v_bit, &rnding);
+  bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit,
+                           &rnding);
+  bf0[7] =
+      half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding);
+
+  addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+  bf0[16] = bf1[16];
+  bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
+                                   &v_bit, &rnding);
+  bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
+                                   &v_bit, &rnding);
+  bf0[19] = bf1[19];
+  bf0[20] = bf1[20];
+  bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
+                                   &v_bit, &rnding);
+  bf0[23] = bf1[23];
+  bf0[24] = bf1[24];
+  bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
+                                   &v_bit, &rnding);
+  bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit,
+                            &rnding);
+  bf0[27] = bf1[27];
+  bf0[28] = bf1[28];
+  bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
+                                   &v_bit, &rnding);
+  bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit,
+                            &rnding);
+  bf0[31] = bf1[31];
+
+  // stage 5
+  bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit,
+                           &rnding);
+  bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1],
+                                  &v_bit, &rnding);
+  bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3],
+                                  &v_bit, &rnding);
+  bf1[3] = half_btf_neon_r(&cospi[16], &bf0[2], &cospi[48], &bf0[3], &v_bit,
+                           &rnding);
+  addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14],
+                                  &v_bit, &rnding);
+  bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13],
+                                   &v_bit, &rnding);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], &bf0[13],
+                                   &v_bit, &rnding);
+  bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit,
+                            &rnding);
+  bf1[15] = bf0[15];
+  addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+  // stage 6
+  addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+  bf0[4] = bf1[4];
+  bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
+                                  &v_bit, &rnding);
+  bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit,
+                           &rnding);
+  bf0[7] = bf1[7];
+  addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+  bf0[16] = bf1[16];
+  bf0[17] = bf1[17];
+  bf0[18] = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
+                                   &v_bit, &rnding);
+  bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
+                                   &v_bit, &rnding);
+  bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
+                                   &v_bit, &rnding);
+  bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[22] = bf1[22];
+  bf0[23] = bf1[23];
+  bf0[24] = bf1[24];
+  bf0[25] = bf1[25];
+  bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
+                                   &v_bit, &rnding);
+  bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit,
+                            &rnding);
+  bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit,
+                            &rnding);
+  bf0[30] = bf1[30];
+  bf0[31] = bf1[31];
+
+  // stage 7
+  addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13],
+                                   &v_bit, &rnding);
+  bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12],
+                                   &v_bit, &rnding);
+  bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit,
+                            &rnding);
+  bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit,
+                            &rnding);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+  // stage 8
+  addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+  bf0[16] = bf1[16];
+  bf0[17] = bf1[17];
+  bf0[18] = bf1[18];
+  bf0[19] = bf1[19];
+  bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
+                                   &v_bit, &rnding);
+  bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
+                                   &v_bit, &rnding);
+  bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
+                                   &v_bit, &rnding);
+  bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit,
+                            &rnding);
+  bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit,
+                            &rnding);
+  bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit,
+                            &rnding);
+  bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit,
+                            &rnding);
+  bf0[28] = bf1[28];
+  bf0[29] = bf1[29];
+  bf0[30] = bf1[30];
+  bf0[31] = bf1[31];
+
+  // stage 9
+  addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    round_shift_8x8(out + 16, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+  }
+}
+
+static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit,
+                             int do_cols, int bd, int out_shift) {
+  (void)bit;
+  for (int i = 0; i < 32; i += 16) {
+    out[i] = vshlq_n_s32(in[i], 2);
+    out[i + 1] = vshlq_n_s32(in[i + 1], 2);
+    out[i + 2] = vshlq_n_s32(in[i + 2], 2);
+    out[i + 3] = vshlq_n_s32(in[i + 3], 2);
+    out[i + 4] = vshlq_n_s32(in[i + 4], 2);
+    out[i + 5] = vshlq_n_s32(in[i + 5], 2);
+    out[i + 6] = vshlq_n_s32(in[i + 6], 2);
+    out[i + 7] = vshlq_n_s32(in[i + 7], 2);
+    out[i + 8] = vshlq_n_s32(in[i + 8], 2);
+    out[i + 9] = vshlq_n_s32(in[i + 9], 2);
+    out[i + 10] = vshlq_n_s32(in[i + 10], 2);
+    out[i + 11] = vshlq_n_s32(in[i + 11], 2);
+    out[i + 12] = vshlq_n_s32(in[i + 12], 2);
+    out[i + 13] = vshlq_n_s32(in[i + 13], 2);
+    out[i + 14] = vshlq_n_s32(in[i + 14], 2);
+    out[i + 15] = vshlq_n_s32(in[i + 15], 2);
+  }
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    round_shift_8x8(out + 16, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+  }
+}
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+static const transform_1d_neon
+    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { idct4x4_neon, NULL, NULL, NULL },
+          { iadst4x4_neon, NULL, NULL, NULL },
+          { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL },
+      },
+      { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL },
+        { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL },
+        { iidentity8_neon, iidentity8_neon, NULL, NULL } },
+      {
+          { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL },
+          { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL },
+          { iidentity16_neon, NULL, iidentity16_neon, NULL },
+      },
+      { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon,
+          idct32x32_neon },
+        { NULL, NULL, NULL, NULL },
+        { iidentity32_neon, NULL, NULL, NULL } },
+      { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon,
+          idct64x64_neon },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, const int bd) {
+  TX_SIZE tx_size = TX_4X8;
+  int32x4_t buf1[32] = { vdupq_n_s32(0) };
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[8];
+  const int32_t *input_row = input;
+  int32x4_t *buf0_cur = buf0;
+  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+  av1_round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row, 0, NewInvSqrt2);
+  row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+  row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+           -shift[0]);
+
+  if (lr_flip) {
+    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
+                  buf1[3]);
+
+    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
+                  buf1[7]);
+  } else {
+    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
+                  buf1[3]);
+
+    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
+                  buf1[7]);
+  }
+
+  // 2nd stage: column transform
+  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+  av1_round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+  // write to buffer
+  highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+                               bd);
+}
+
+void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, const int bd) {
+  TX_SIZE tx_size = TX_8X4;
+  int32x4_t buf1[8];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[8];
+  const int32_t *input_row = input;
+  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+  TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2],
+                buf1[3]);
+  TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6],
+                buf1[7]);
+
+  av1_round_shift_rect_array_32_neon(buf1, buf0, txfm_size_col, 0, NewInvSqrt2);
+  row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+  int32x4_t *buf1_ptr;
+  if (lr_flip) {
+    flip_buf_neon(buf0, buf1, txfm_size_col);
+    buf1_ptr = buf1;
+  } else {
+    buf1_ptr = buf0;
+  }
+
+  // 2nd stage: column transform
+  for (int i = 0; i < 2; i++) {
+    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  }
+  av1_round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+  // write to buffer
+  highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row,
+                               bd);
+}
+
+void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  TX_SIZE tx_size = TX_4X16;
+  int32x4_t buf1[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_h_div8 = txfm_size_row >> 2;
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[16];
+  const int32_t *input_row = input;
+  int32x4_t *buf0_cur = buf0;
+  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+  for (int i = 0; i < (txfm_size_row >> 2); i++) {
+    row_txfm(buf0 + (i << 2), buf0 + (i << 2),
+             av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+  }
+
+  if (lr_flip) {
+    for (int j = 0; j < buf_size_h_div8; ++j) {
+      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
+                    buf1[4 * j + 3]);
+    }
+  } else {
+    for (int j = 0; j < buf_size_h_div8; ++j) {
+      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
+                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
+                    buf1[4 * j + 2], buf1[4 * j + 3]);
+    }
+  }
+
+  // 2nd stage: column transform
+  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+  av1_round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+  // write to buffer
+  highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+                               bd);
+}
+
+void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  TX_SIZE tx_size = TX_16X4;
+  int32x4_t buf1[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[16];
+  const int32_t *input_row = input;
+  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+  for (int j = 0; j < buf_size_w_div8; j++) {
+    TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
+                  buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
+  }
+  row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+  int32x4_t *buf1_ptr;
+  if (lr_flip) {
+    flip_buf_neon(buf0, buf1, txfm_size_col);
+    buf1_ptr = buf1;
+  } else {
+    buf1_ptr = buf0;
+  }
+
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  }
+  av1_round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+
+  // write to buffer
+  for (int i = 0; i < (txfm_size_col >> 3); i++) {
+    highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
+                                 output + 8 * i, stride, ud_flip, txfm_size_row,
+                                 bd);
+  }
+}
+
+void highbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output,
+                                     int stride, TX_TYPE tx_type, int eob,
+                                     const int bd) {
+  (void)eob;
+  TX_SIZE tx_size = TX_4X16;
+  int32x4_t buf1[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_h_div8 = txfm_size_row >> 2;
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[16];
+  const int32_t *input_row = input;
+  int32x4_t *buf0_cur = buf0;
+  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+  for (int i = 0; i < (txfm_size_row >> 2); i++) {
+    row_txfm(buf0 + (i << 2), buf0 + (i << 2),
+             av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+  }
+
+  if (lr_flip) {
+    for (int j = 0; j < buf_size_h_div8; ++j) {
+      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
+                    buf1[4 * j + 3]);
+    }
+  } else {
+    for (int j = 0; j < buf_size_h_div8; ++j) {
+      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
+                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
+                    buf1[4 * j + 2], buf1[4 * j + 3]);
+    }
+  }
+
+  // 2nd stage: column transform
+  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+  av1_round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+  // write to buffer
+  highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+                               bd);
+}
+
+void highbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output,
+                                     int stride, TX_TYPE tx_type, int eob,
+                                     const int bd) {
+  (void)eob;
+  TX_SIZE tx_size = TX_16X4;
+  int32x4_t buf1[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[16];
+  const int32_t *input_row = input;
+  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+  for (int j = 0; j < buf_size_w_div8; j++) {
+    TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
+                  buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
+  }
+  row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+  int32x4_t *buf1_ptr;
+  if (lr_flip) {
+    flip_buf_neon(buf0, buf1, txfm_size_col);
+    buf1_ptr = buf1;
+  } else {
+    buf1_ptr = buf0;
+  }
+
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  }
+  av1_round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+
+  // write to buffer
+  for (int i = 0; i < (txfm_size_col >> 3); i++) {
+    highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
+                                 output + 8 * i, stride, ud_flip, txfm_size_row,
+                                 bd);
+  }
+}
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x16_default[16]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x32_default[32]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x32_default[32]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x16_default[16]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+  NULL,
+  av1_eob_to_eobxy_8x8_default,
+  av1_eob_to_eobxy_16x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x16_default,
+  av1_eob_to_eobxy_16x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x32_default,
+  av1_eob_to_eobxy_32x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+};
+
+static INLINE void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                                     TX_SIZE tx_size, int eob) {
+  if (eob == 1) {
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+
+  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+  const int eob_row = (eob - 1) >> tx_w_log2;
+  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+  *eobx = eobxy & 0xFF;
+  *eoby = eobxy >> 8;
+}
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size) {
+  if (tx_size == 2) {
+    *eoby = 15, *eobx = 15;
+  } else if (tx_size == 3) {
+    *eoby = 31, *eobx = 31;
+  } else if (tx_size == 4) {
+    *eoby = 31, *eobx = 31;
+  } else if (tx_size == 7) {
+    *eoby = 15, *eobx = 7;
+  } else if (tx_size == 8) {
+    *eoby = 7, *eobx = 15;
+  } else if (tx_size == 9) {
+    *eoby = 31, *eobx = 15;
+  } else if (tx_size == 10) {
+    *eoby = 15, *eobx = 31;
+  } else if (tx_size == 11) {
+    *eoby = 31, *eobx = 31;
+  } else if (tx_size == 12) {
+    *eoby = 31, *eobx = 31;
+  } else if (tx_size == 15) {
+    *eoby = 31, *eobx = 7;
+  } else if (tx_size == 16) {
+    *eoby = 7, *eobx = 31;
+  } else if (tx_size == 17) {
+    *eoby = 31, *eobx = 15;
+  } else if (tx_size == 18) {
+    *eoby = 15, *eobx = 31;
+  } else {
+    *eoby = 0, *eobx = 0;
+  }
+}
+
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size) {
+  const int txfm_size_row = tx_size_high[tx_size];
+  *eoby = AOMMIN(32, txfm_size_row) - 1;
+  *eobx = 0;
+}
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size) {
+  const int txfm_size_col = tx_size_wide[tx_size];
+  *eobx = AOMMIN(32, txfm_size_col) - 1;
+  *eoby = 0;
+}
+
+static void inv_txfm2d_add_h_identity_neon(const int32_t *input,
+                                           uint16_t *output, int stride,
+                                           TX_TYPE tx_type, TX_SIZE tx_size,
+                                           const int bd) {
+  int32x4_t buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size);
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int buf_size_w_div4 = input_stride >> 2;
+  const int buf_size_h_div8 = (eoby + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
+    int32x4_t buf0[16];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_w_div4; ++j) {
+      int32x4_t *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(buf0, buf0, input_stride, 0,
+                                         NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
+
+    int32x4_t *_buf1 = buf1 + i * 4;
+
+    for (int j = 0; j < buf_size_w_div4; ++j) {
+      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+    }
+  }
+  for (int i = 0; i < buf_size_w_div4; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  for (int i = 0; i < (txfm_size_col >> 3); i++) {
+    highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                 stride, ud_flip, txfm_size_row, bd);
+  }
+}
+
+static void inv_txfm2d_add_v_identity_neon(const int32_t *input,
+                                           uint16_t *output, int stride,
+                                           TX_TYPE tx_type, TX_SIZE tx_size,
+                                           const int bd) {
+  int32x4_t buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size);
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int buf_size_w_div8 = input_stride >> 2;
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < (row_max >> 2); ++i) {
+    int32x4_t buf0[16];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
+      int32x4_t *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(
+          buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
+
+    int32x4_t *_buf1 = buf1 + i * 4;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+      }
+    }
+  }
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, ud_flip, txfm_size_row, bd);
+    }
+  }
+}
+static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output,
+                                     int stride, TX_TYPE tx_type,
+                                     TX_SIZE tx_size, const int bd) {
+  int32x4_t buf1[64 * 4];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+  for (int i = 0; i < (row_max >> 2); ++i) {
+    int32x4_t buf0[32];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < (input_stride >> 2); ++j) {
+      int32x4_t *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(buf0, buf0, input_stride, 0,
+                                         NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
+
+    int32x4_t *_buf1 = buf1 + i * 4;
+    for (int j = 0; j < (input_stride >> 2); ++j) {
+      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+    }
+  }
+  for (int i = 0; i < (input_stride >> 2); i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, 0, txfm_size_row, bd);
+    }
+  }
+}
+void inv_txfm2d_add_no_identity_neon(const int32_t *input, uint16_t *output,
+                                     int stride, TX_TYPE tx_type,
+                                     TX_SIZE tx_size, const int bd) {
+  int32x4_t buf1[64 * 16];
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size);
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // 1st stage: column transform
+  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+    int32x4_t buf0[64];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+      int32x4_t *buf0_cur = &buf0[j * 4];
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(
+          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
+
+    int32x4_t *_buf1 = &buf1[i * 4];
+
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+      }
+    }
+  }
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, ud_flip, txfm_size_row, bd);
+    }
+  }
+}
+
+void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input,
+                                            uint16_t *output, int stride,
+                                            TX_TYPE tx_type, TX_SIZE tx_size,
+                                            int eob, const int bd) {
+  int32x4_t buf1[64 * 16];
+  int eobx, eoby;
+  highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // 1st stage: column transform
+  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+    int32x4_t buf0[64];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+      int32x4_t *buf0_cur = &buf0[j * 4];
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(
+          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+             -shift[0]);
+
+    int32x4_t *_buf1 = &buf1[i * 4];
+
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+      }
+    }
+  }
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, ud_flip, txfm_size_row, bd);
+    }
+  }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_neon(const int32_t *input,
+                                             uint8_t *output, int stride,
+                                             TX_TYPE tx_type, TX_SIZE tx_size,
+                                             int eob, const int bd) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      highbd_inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
+                                             stride, tx_type, tx_size, eob, bd);
+      break;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                                     tx_type, tx_size, bd);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                                     tx_type, tx_size, bd);
+      break;
+    case IDTX:
+      inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                               tx_type, tx_size, bd);
+      break;
+    default: assert(0); break;
+  }
+}
+
+void av1_inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output,
+                                      int stride, TX_TYPE tx_type,
+                                      TX_SIZE tx_size, const int bd) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
+                                      stride, tx_type, tx_size, bd);
+      break;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                                     tx_type, tx_size, bd);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                                     tx_type, tx_size, bd);
+      break;
+    case IDTX:
+      inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                               tx_type, tx_size, bd);
+      break;
+    default: assert(0); break;
+  }
+}
+
+void av1_highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+    case IDTX:
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type,
+                                              txfm_param->tx_size,
+                                              txfm_param->eob, bd);
+      break;
+    default:
+      av1_inv_txfm2d_add_8x8_neon(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, bd);
+      break;
+  }
+}
+void av1_highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  int eob = txfm_param->eob;
+  int bd = txfm_param->bd;
+  int lossless = txfm_param->lossless;
+  const int32_t *src = cast_to_int32(input);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    return;
+  }
+  av1_inv_txfm2d_add_4x4_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                              bd);
+}
+
+void av1_highbd_inv_txfm_add_4x8_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_8X16, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x16_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  int eob = txfm_param->eob;
+  highbd_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x4_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  int eob = txfm_param->eob;
+  highbd_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x16_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_8X16,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x8_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_16X8,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_16X8, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_16X32,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_16X32, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_32X16,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_32X16, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_32X32,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_32X32, bd);
+}
+
+void av1_highbd_inv_txfm_add_64x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_64X64,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_64X64, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_32X64,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_32X64, bd);
+}
+
+void av1_highbd_inv_txfm_add_64x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_64X32,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_64X32, bd);
+}
+
+void av1_highbd_inv_txfm_add_64x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_64X16,
+                                          txfm_param->eob, txfm_param->bd);
+}
+void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_64X16, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_16X64,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_16X64, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_16X16,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_16X16, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x8_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_32X8,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_32X8, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x32_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_8X32,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_8X32, bd);
+}
+
+void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+
+  TX_TYPE tx_type = txfm_param->tx_type;
+  int bd = txfm_param->bd;
+  switch (tx_size) {
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param);
+      break;
+    case TX_4X8:
+      av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_8X4:
+      av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                   txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_4X16:
+      av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                   txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_8X16:
+      av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_16X8:
+      av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_16X32:
+      av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X16:
+      av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_16X16:
+      av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X32:
+      av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X64:
+      av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X64:
+      av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X32:
+      av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_16X64:
+      av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X16:
+      av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X8:
+      av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_8X32:
+      av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+  }
+}
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 92112fb..85a5eaa 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -751,11 +751,9 @@
                                  conv_params, y_filter, h, w);
 }
 
-void av1_dist_wtd_convolve_2d_copy_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
+                                        uint8_t *dst8, int dst8_stride, int w,
+                                        int h, ConvolveParams *conv_params) {
   uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
       tmp_shift3;
   uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3;
@@ -776,11 +774,6 @@
   const int16x4_t dup_bits16x4 = vdup_n_s16(bits);
   const int16x8_t dup_bits16x8 = vdupq_n_s16(bits);
 
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-
   if (!(w & 0x07)) {
     for (y = 0; y < (h >> 2); ++y) {
       src1 = src;
@@ -879,8 +872,7 @@
 void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
                                   uint8_t *dst8, int dst8_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_x_qn,
                                   ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
@@ -899,9 +891,6 @@
   const uint16_t bck_offset = conv_params->bck_offset;
   const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_qn & SUBPEL_MASK);
@@ -1341,9 +1330,8 @@
 
 void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
                                   uint8_t *dst8, int dst8_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
@@ -1363,9 +1351,6 @@
   const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int shift_value = (conv_params->round_1 - 1 - bits);
 
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
diff --git a/av1/common/arm/reconintra_neon.c b/av1/common/arm/reconintra_neon.c
new file mode 100644
index 0000000..ec6d0d7
--- /dev/null
+++ b/av1/common/arm/reconintra_neon.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "common/tools_common.h"
+
+DECLARE_ALIGNED(16, const int8_t,
+                av1_filter_intra_taps_neon[FILTER_INTRA_MODES][8][8]) = {
+  {
+      { -6, 0, 0, 0, -5, 10, 0, 0 },
+      { 10, 0, 12, 0, 2, 0, 9, 0 },
+      { -3, 1, 0, 0, -3, 1, 10, 0 },
+      { 1, 10, 7, 0, 1, 2, 5, 0 },
+      { -4, 0, 0, 12, -3, 6, 0, 9 },
+      { 6, 0, 2, 0, 2, 0, 2, 0 },
+      { -3, 2, 0, 7, -3, 2, 6, 5 },
+      { 2, 6, 2, 0, 1, 2, 3, 0 },
+  },
+  {
+      { -10, 0, 0, 0, -6, 16, 0, 0 },
+      { 16, 0, 10, 0, 0, 0, 6, 0 },
+      { -4, 0, 0, 0, -2, 0, 16, 0 },
+      { 0, 16, 4, 0, 0, 0, 2, 0 },
+      { -10, 0, 0, 10, -6, 16, 0, 6 },
+      { 16, 0, 0, 0, 0, 0, 0, 0 },
+      { -4, 0, 0, 4, -2, 0, 16, 2 },
+      { 0, 16, 0, 0, 0, 0, 0, 0 },
+  },
+  {
+      { -8, 0, 0, 0, -8, 8, 0, 0 },
+      { 8, 0, 16, 0, 0, 0, 16, 0 },
+      { -8, 0, 0, 0, -8, 0, 8, 0 },
+      { 0, 8, 16, 0, 0, 0, 16, 0 },
+      { -4, 0, 0, 16, -4, 4, 0, 16 },
+      { 4, 0, 0, 0, 0, 0, 0, 0 },
+      { -4, 0, 0, 16, -4, 0, 4, 16 },
+      { 0, 4, 0, 0, 0, 0, 0, 0 },
+  },
+  {
+      { -2, 0, 0, 0, -1, 8, 0, 0 },
+      { 8, 0, 10, 0, 3, 0, 6, 0 },
+      { -1, 3, 0, 0, 0, 2, 8, 0 },
+      { 2, 8, 4, 0, 1, 3, 2, 0 },
+      { -1, 0, 0, 10, -1, 4, 0, 6 },
+      { 4, 0, 3, 0, 3, 0, 4, 0 },
+      { -1, 3, 0, 4, -1, 2, 4, 3 },
+      { 2, 4, 4, 0, 2, 3, 3, 0 },
+  },
+  {
+      { -12, 0, 0, 0, -10, 14, 0, 0 },
+      { 14, 0, 14, 0, 0, 0, 12, 0 },
+      { -9, 0, 0, 0, -8, 0, 14, 0 },
+      { 0, 14, 11, 0, 0, 0, 10, 0 },
+      { -10, 0, 0, 14, -9, 12, 0, 12 },
+      { 12, 0, 0, 0, 1, 0, 0, 0 },
+      { -8, 0, 0, 11, -7, 0, 12, 9 },
+      { 0, 12, 1, 0, 0, 1, 1, 0 },
+  },
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+#define SHIFT_INTRA_SCALE_BITS 15 - FILTER_INTRA_SCALE_BITS
+
+#define MASK_LOW \
+  0x604020006040200  // (0 | (2 << 8) | (4 << 16) | (6 << 24)) x 2
+#define MASK_HIGH \
+  0x705030107050301  // (1 | (3 << 8) | (5 << 16) | (7 << 24)) x 2
+
+void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
+                                     TX_SIZE tx_size, const uint8_t *above,
+                                     const uint8_t *left, int mode) {
+  int r, c;
+  uint8_t buffer[33][33];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+
+  const int8x16_t f1f0 = vld1q_s8(av1_filter_intra_taps_neon[mode][0]);
+  const int8x16_t f3f2 = vld1q_s8(av1_filter_intra_taps_neon[mode][2]);
+  const int8x16_t f5f4 = vld1q_s8(av1_filter_intra_taps_neon[mode][4]);
+  const int8x16_t f7f6 = vld1q_s8(av1_filter_intra_taps_neon[mode][6]);
+  const int16x8_t f1f0_lo = vmovl_s8(vget_low_s8(f1f0));
+  const int16x8_t f1f0_hi = vmovl_s8(vget_high_s8(f1f0));
+  const int16x8_t f3f2_lo = vmovl_s8(vget_low_s8(f3f2));
+  const int16x8_t f3f2_hi = vmovl_s8(vget_high_s8(f3f2));
+  const int16x8_t f5f4_lo = vmovl_s8(vget_low_s8(f5f4));
+  const int16x8_t f5f4_hi = vmovl_s8(vget_high_s8(f5f4));
+  const int16x8_t f7f6_lo = vmovl_s8(vget_low_s8(f7f6));
+  const int16x8_t f7f6_hi = vmovl_s8(vget_high_s8(f7f6));
+  const uint8x8_t vmask_low = vcreate_u8(MASK_LOW);
+  const uint8x8_t vmask_high = vcreate_u8(MASK_HIGH);
+
+  assert(bw <= 32 && bh <= 32);
+
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+
+  for (r = 1; r < bh + 1; r += 2) {
+    for (c = 1; c < bw + 1; c += 4) {
+      DECLARE_ALIGNED(16, uint8_t, p[8]);
+      memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
+      p[5] = buffer[r][c - 1];
+      p[6] = buffer[r + 1][c - 1];
+      p[7] = 0;
+
+      const uint8x8_t p_b = vld1_u8(p);
+
+      const uint16x8_t p_b_lo = vmovl_u8(vtbl1_u8(p_b, vmask_low));
+      const uint16x8_t p_b_hi = vmovl_u8(vtbl1_u8(p_b, vmask_high));
+
+      int16x8_t out_01 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f1f0_lo);
+      out_01 = vmlaq_s16(out_01, vreinterpretq_s16_u16(p_b_hi), f1f0_hi);
+      int16x8_t out_23 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f3f2_lo);
+      out_23 = vmlaq_s16(out_23, vreinterpretq_s16_u16(p_b_hi), f3f2_hi);
+      int16x8_t out_45 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f5f4_lo);
+      out_45 = vmlaq_s16(out_45, vreinterpretq_s16_u16(p_b_hi), f5f4_hi);
+      int16x8_t out_67 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f7f6_lo);
+      out_67 = vmlaq_s16(out_67, vreinterpretq_s16_u16(p_b_hi), f7f6_hi);
+#if defined(__aarch64__)
+      const int16x8_t out_0123 = vpaddq_s16(out_01, out_23);
+      const int16x8_t out_4567 = vpaddq_s16(out_45, out_67);
+      const int16x8_t out_01234567 = vpaddq_s16(out_0123, out_4567);
+#else
+      const int16x8_t out_0123 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_01)),
+                                              vqmovn_s32(vpaddlq_s16(out_23)));
+      const int16x8_t out_4567 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_45)),
+                                              vqmovn_s32(vpaddlq_s16(out_67)));
+      const int16x8_t out_01234567 = vcombine_s16(
+          vqmovn_s32(vpaddlq_s16(out_0123)), vqmovn_s32(vpaddlq_s16(out_4567)));
+#endif  // (__aarch64__)
+      const uint32x2_t out_r =
+          vreinterpret_u32_u8(vqmovun_s16(vrshrq_n_s16(out_01234567, 4)));
+      // Storing
+      vst1_lane_u32((uint32_t *)&buffer[r][c], out_r, 0);
+      vst1_lane_u32((uint32_t *)&buffer[r + 1][c], out_r, 1);
+    }
+  }
+
+  for (r = 0; r < bh; ++r) {
+    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
+    dst += stride;
+  }
+}
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c
new file mode 100644
index 0000000..e42766e
--- /dev/null
+++ b/av1/common/arm/resize_neon.c
@@ -0,0 +1,805 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/resize.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
+  const int max_width = (w + 15) & ~15;
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      const uint8x16x2_t s = vld2q_u8(src);
+      vst1q_u8(dst, s.val[0]);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
+  const int max_width = (w + 15) & ~15;
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      const uint8x16x4_t s = vld4q_u8(src);
+      vst1q_u8(dst, s.val[0]);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_bilinear_kernel(
+    const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
+    const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
+    uint8_t *const dst) {
+  const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0);
+  const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0);
+  const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0);
+  const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0);
+  const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1);
+  const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1);
+  const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1);
+  const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1);
+
+  const uint8x8_t hor0 = vrshrn_n_u16(h4, 7);  // temp: 00 01 02 03 04 05 06 07
+  const uint8x8_t hor1 = vrshrn_n_u16(h5, 7);  // temp: 08 09 0A 0B 0C 0D 0E 0F
+  const uint8x8_t hor2 = vrshrn_n_u16(h6, 7);  // temp: 10 11 12 13 14 15 16 17
+  const uint8x8_t hor3 = vrshrn_n_u16(h7, 7);  // temp: 18 19 1A 1B 1C 1D 1E 1F
+  const uint16x8_t v0 = vmull_u8(hor0, coef0);
+  const uint16x8_t v1 = vmull_u8(hor1, coef0);
+  const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1);
+  const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1);
+  // dst: 0 1 2 3 4 5 6 7  8 9 A B C D E F
+  const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7));
+  vst1q_u8(dst, d);
+}
+
+static INLINE void scale_plane_2_to_1_bilinear(
+    const uint8_t *const src, const int src_stride, uint8_t *dst,
+    const int dst_stride, const int w, const int h, const int16_t c0,
+    const int16_t c1) {
+  const int max_width = (w + 15) & ~15;
+  const uint8_t *src0 = src;
+  const uint8_t *src1 = src + src_stride;
+  const uint8x8_t coef0 = vdup_n_u8(c0);
+  const uint8x8_t coef1 = vdup_n_u8(c1);
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      // 000 002 004 006 008 00A 00C 00E  010 012 014 016 018 01A 01C 01E
+      // 001 003 005 007 009 00B 00D 00F  011 013 015 017 019 01B 01D 01F
+      const uint8x16x2_t s0 = vld2q_u8(src0);
+      // 100 102 104 106 108 10A 10C 10E  110 112 114 116 118 11A 11C 11E
+      // 101 103 105 107 109 10B 10D 10F  111 113 115 117 119 11B 11D 11F
+      const uint8x16x2_t s1 = vld2q_u8(src1);
+      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+                                  coef0, coef1, dst);
+      src0 += 32;
+      src1 += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src0 += 2 * (src_stride - max_width);
+    src1 += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_bilinear(
+    const uint8_t *const src, const int src_stride, uint8_t *dst,
+    const int dst_stride, const int w, const int h, const int16_t c0,
+    const int16_t c1) {
+  const int max_width = (w + 15) & ~15;
+  const uint8_t *src0 = src;
+  const uint8_t *src1 = src + src_stride;
+  const uint8x8_t coef0 = vdup_n_u8(c0);
+  const uint8x8_t coef1 = vdup_n_u8(c1);
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      // (*) -- useless
+      // 000 004 008 00C 010 014 018 01C  020 024 028 02C 030 034 038 03C
+      // 001 005 009 00D 011 015 019 01D  021 025 029 02D 031 035 039 03D
+      // 002 006 00A 00E 012 016 01A 01E  022 026 02A 02E 032 036 03A 03E (*)
+      // 003 007 00B 00F 013 017 01B 01F  023 027 02B 02F 033 037 03B 03F (*)
+      const uint8x16x4_t s0 = vld4q_u8(src0);
+      // 100 104 108 10C 110 114 118 11C  120 124 128 12C 130 134 138 13C
+      // 101 105 109 10D 111 115 119 11D  121 125 129 12D 131 135 139 13D
+      // 102 106 10A 10E 112 116 11A 11E  122 126 12A 12E 132 136 13A 13E (*)
+      // 103 107 10B 10F 113 117 11B 11F  123 127 12B 12F 133 137 13B 13F (*)
+      const uint8x16x4_t s1 = vld4q_u8(src1);
+      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+                                  coef0, coef1, dst);
+      src0 += 64;
+      src1 += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src0 += 4 * (src_stride - max_width);
+    src1 += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 3) & ~3;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 3) & ~3;
+  const int16x8_t filters = vld1q_s16(coef);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[14], d[4];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+  // horizontal 4x8
+  // Note: processing 4x8 is about 20% faster than processing row by row using
+  // vld4_u8().
+  do {
+    load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+                  &s[12], &s[13]);
+      transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                       &s[13]);
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
+      d[1] = scale_filter_8(&s[2], filters);  // 01 11 21 31 41 51 61 71
+      d[2] = scale_filter_8(&s[4], filters);  // 02 12 22 32 42 52 62 72
+      d[3] = scale_filter_8(&s[6], filters);  // 03 13 23 33 43 53 63 73
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
+      vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]),
+                    1);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+
+      t += 4;
+      x -= 4;
+    } while (x);
+    src += 8 * src_stride - 2 * width_hor;
+    t += 7 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x4
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += 6 * width_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+                  &s[12], &s[13]);
+      t += 8 * width_hor;
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
+      d[1] = scale_filter_8(&s[2], filters);  // 10 11 12 13 14 15 16 17
+      d[2] = scale_filter_8(&s[4], filters);  // 20 21 22 23 24 25 26 27
+      d[3] = scale_filter_8(&s[6], filters);  // 30 31 32 33 34 35 36 37
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+
+      dst += 4 * dst_stride;
+      y -= 4;
+    } while (y);
+    t -= width_hor * (2 * height_ver + 6);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 1) & ~1;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 1) & ~1;
+  const int16x8_t filters = vld1q_s16(coef);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[12], d[2];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+  // horizontal 2x8
+  // Note: processing 2x8 is about 20% faster than processing row by row using
+  // vld4_u8().
+  do {
+    load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
+    x = width_hor;
+
+    do {
+      uint8x8x2_t dd;
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+                  &s[10], &s[11]);
+      transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
+                       &s[11]);
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
+      d[1] = scale_filter_8(&s[4], filters);  // 01 11 21 31 41 51 61 71
+      // dd.val[0]: 00 01 20 21 40 41 60 61
+      // dd.val[1]: 10 11 30 31 50 51 70 71
+      dd = vtrn_u8(d[0], d[1]);
+      vst1_lane_u16((uint16_t *)(t + 0 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 0);
+      vst1_lane_u16((uint16_t *)(t + 1 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 0);
+      vst1_lane_u16((uint16_t *)(t + 2 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 1);
+      vst1_lane_u16((uint16_t *)(t + 3 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 1);
+      vst1_lane_u16((uint16_t *)(t + 4 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 2);
+      vst1_lane_u16((uint16_t *)(t + 5 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 2);
+      vst1_lane_u16((uint16_t *)(t + 6 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 3);
+      vst1_lane_u16((uint16_t *)(t + 7 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 3);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+
+      t += 2;
+      x -= 2;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor;
+    t += 7 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x2
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]);
+    t += 4 * width_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+                  &s[10], &s[11]);
+      t += 8 * width_hor;
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
+      d[1] = scale_filter_8(&s[4], filters);  // 10 11 12 13 14 15 16 17
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+
+      dst += 2 * dst_stride;
+      y -= 2;
+    } while (y);
+    t -= width_hor * (4 * height_ver + 4);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s,
+                                              const uint8x8_t *const coef) {
+  const uint16x8_t h0 = vmull_u8(s[0], coef[0]);
+  const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]);
+
+  return vrshrn_n_u16(h1, 7);
+}
+
+// Notes for 4 to 3 scaling:
+//
+// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be
+// multiple of 6, and no less than w.
+//
+// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be
+// multiple of 8, and no less than w.
+//
+// 3. 8 columns are calculated in each horizontal inner loop for further
+// vertical scaling, so height_hor must be multiple of 8, and no less than
+// 4 * h / 3.
+//
+// 4. 6 columns are calculated in each vertical inner loop, so height_ver must
+// be multiple of 6, and no less than h.
+//
+// 5. The physical location of the last row of the 4 to 3 scaled frame is
+// decided by phase_scaler, and are always less than 1 pixel below the last row
+// of the original image.
+static void scale_plane_4_to_3_bilinear(const uint8_t *src,
+                                        const int src_stride, uint8_t *dst,
+                                        const int dst_stride, const int w,
+                                        const int h, const int phase_scaler,
+                                        uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = width_hor + 2;  // store 2 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We only need 1 extra row below because there are only 2 bilinear
+  // coefficients.
+  const int height_hor = (4 * h / 3 + 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[9], d[8], c[6];
+  const InterpKernel *interp_kernel =
+      (const InterpKernel *)av1_interp_filter_params_list[BILINEAR].filter_ptr;
+  assert(w && h);
+
+  c[0] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][3]);
+  c[1] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][4]);
+  c[2] = vdup_n_u8(
+      (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][3]);
+  c[3] = vdup_n_u8(
+      (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][4]);
+  c[4] = vdup_n_u8(
+      (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][3]);
+  c[5] = vdup_n_u8(
+      (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][4]);
+
+  d[6] = vdup_n_u8(0);
+  d[7] = vdup_n_u8(0);
+
+  // horizontal 6x8
+  do {
+    load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    src += 1;
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                  &s[7], &s[8]);
+      src += 8;
+      transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = scale_filter_bilinear(&s[0], &c[0]);
+      d[1] =
+          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+      d[2] =
+          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+      d[3] = scale_filter_bilinear(&s[4], &c[0]);
+      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+                                   &c[2]);
+      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+                                   &c[4]);
+
+      // 00 01 02 03 04 05 xx xx
+      // 10 11 12 13 14 15 xx xx
+      // 20 21 22 23 24 25 xx xx
+      // 30 31 32 33 34 35 xx xx
+      // 40 41 42 43 44 45 xx xx
+      // 50 51 52 53 54 55 xx xx
+      // 60 61 62 63 64 65 xx xx
+      // 70 71 72 73 74 75 xx xx
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      // store 2 extra pixels
+      vst1_u8(t + 0 * stride_hor, d[0]);
+      vst1_u8(t + 1 * stride_hor, d[1]);
+      vst1_u8(t + 2 * stride_hor, d[2]);
+      vst1_u8(t + 3 * stride_hor, d[3]);
+      vst1_u8(t + 4 * stride_hor, d[4]);
+      vst1_u8(t + 5 * stride_hor, d[5]);
+      vst1_u8(t + 6 * stride_hor, d[6]);
+      vst1_u8(t + 7 * stride_hor, d[7]);
+
+      s[0] = s[8];
+
+      t += 6;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3 - 1;
+    t += 7 * stride_hor + 2;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += stride_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                  &s[7], &s[8]);
+      t += 8 * stride_hor;
+
+      d[0] = scale_filter_bilinear(&s[0], &c[0]);
+      d[1] =
+          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+      d[2] =
+          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+      d[3] = scale_filter_bilinear(&s[4], &c[0]);
+      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+                                   &c[2]);
+      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+                                   &c[4]);
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+      vst1_u8(dst + 4 * dst_stride, d[4]);
+      vst1_u8(dst + 5 * dst_stride, d[5]);
+
+      s[0] = s[8];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * (4 * height_ver / 3 + 1);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase_scaler,
+                                       uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = width_hor + 2;  // store 2 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+  // above and (SUBPEL_TAPS / 2) extra rows below.
+  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  const int16x8_t filters0 = vld1q_s16(
+      (const int16_t *)&coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]);
+  const int16x8_t filters1 = vld1q_s16(
+      (const int16_t *)&coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]);
+  const int16x8_t filters2 = vld1q_s16(
+      (const int16_t *)&coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[15], d[8];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2;
+  d[6] = vdup_n_u8(0);
+  d[7] = vdup_n_u8(0);
+
+  // horizontal 6x8
+  do {
+    load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                  &s[13], &s[14]);
+      transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
+                       &s[14]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = scale_filter_8(&s[0], filters0);
+      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+      d[3] = scale_filter_8(&s[4], filters0);
+      d[4] =
+          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+      d[5] =
+          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+
+      // 00 01 02 03 04 05 xx xx
+      // 10 11 12 13 14 15 xx xx
+      // 20 21 22 23 24 25 xx xx
+      // 30 31 32 33 34 35 xx xx
+      // 40 41 42 43 44 45 xx xx
+      // 50 51 52 53 54 55 xx xx
+      // 60 61 62 63 64 65 xx xx
+      // 70 71 72 73 74 75 xx xx
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      // store 2 extra pixels
+      vst1_u8(t + 0 * stride_hor, d[0]);
+      vst1_u8(t + 1 * stride_hor, d[1]);
+      vst1_u8(t + 2 * stride_hor, d[2]);
+      vst1_u8(t + 3 * stride_hor, d[3]);
+      vst1_u8(t + 4 * stride_hor, d[4]);
+      vst1_u8(t + 5 * stride_hor, d[5]);
+      vst1_u8(t + 6 * stride_hor, d[6]);
+      vst1_u8(t + 7 * stride_hor, d[7]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+      s[6] = s[14];
+
+      t += 6;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3;
+    t += 7 * stride_hor + 2;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += 7 * stride_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                  &s[13], &s[14]);
+      t += 8 * stride_hor;
+
+      d[0] = scale_filter_8(&s[0], filters0);
+      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+      d[3] = scale_filter_8(&s[4], filters0);
+      d[4] =
+          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+      d[5] =
+          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+      vst1_u8(dst + 4 * dst_stride, d[4]);
+      vst1_u8(dst + 5 * dst_stride, d[5]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+      s[6] = s[14];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * (4 * height_ver / 3 + 7);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
+                                      YV12_BUFFER_CONFIG *dst,
+                                      const InterpFilter filter,
+                                      const int phase, const int num_planes) {
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  int scaled = 0;
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;
+    const int src_w = src->crop_widths[is_uv];
+    const int src_h = src->crop_heights[is_uv];
+    const int dst_w = dst->crop_widths[is_uv];
+    const int dst_h = dst->crop_heights[is_uv];
+    const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
+    const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
+
+    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+      scaled = 1;
+      if (phase == 0) {
+        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0, c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 3) & ~3;
+        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc(buffer_stride * buffer_height);
+        if (temp_buffer) {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel[phase],
+                                     temp_buffer);
+          free(temp_buffer);
+        } else {
+          scaled = 0;
+        }
+      }
+    } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+      scaled = 1;
+      if (phase == 0) {
+        scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0, c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 1) & ~1;
+        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc(buffer_stride * buffer_height);
+        if (temp_buffer) {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel[phase],
+                                     temp_buffer);
+          free(temp_buffer);
+        } else {
+          scaled = 0;
+        }
+      }
+    } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+      // 4 to 3
+      const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height);
+      if (temp_buffer) {
+        scaled = 1;
+        if (filter == BILINEAR) {
+          scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv],
+                                      dst->buffers[i], dst->strides[is_uv],
+                                      dst_w, dst_h, phase, temp_buffer);
+        } else {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel, phase,
+                                     temp_buffer);
+        }
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
+  }
+  if (!scaled) {
+    av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+  } else {
+    aom_extend_frame_borders(dst, num_planes);
+  }
+}
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h
index 0403405..0a68cb5 100644
--- a/av1/common/av1_common_int.h
+++ b/av1/common/av1_common_int.h
@@ -79,6 +79,8 @@
 #define TXCOEFF_TIMER 0
 #define TXCOEFF_COST_TIMER 0
 
+/*!\cond */
+
 enum {
   SINGLE_REFERENCE = 0,
   COMPOUND_REFERENCE = 1,
@@ -186,14 +188,20 @@
   InternalFrameBufferList int_frame_buffers;
 } BufferPool;
 
+/*!\endcond */
+
+/*!\brief Parameters related to CDEF */
 typedef struct {
-  int cdef_damping;
-  int nb_cdef_strengths;
-  int cdef_strengths[CDEF_MAX_STRENGTHS];
-  int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
-  int cdef_bits;
+  int cdef_damping;                       /*!< CDEF damping factor */
+  int nb_cdef_strengths;                  /*!< Number of CDEF strength values */
+  int cdef_strengths[CDEF_MAX_STRENGTHS]; /*!< CDEF strength values for luma */
+  int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; /*!< CDEF strength values for
+                                                chroma */
+  int cdef_bits; /*!< Number of CDEF strength values in bits */
 } CdefInfo;
 
+/*!\cond */
+
 typedef struct {
   int delta_q_present_flag;
   // Resolution of delta quant
@@ -230,6 +238,10 @@
   int num_bits_height;
   int max_frame_width;
   int max_frame_height;
+  // Whether current and reference frame IDs are signaled in the bitstream.
+  // Frame id numbers are additional information that do not affect the
+  // decoding process, but provide decoders with a way of detecting missing
+  // reference frames so that appropriate action can be taken.
   uint8_t frame_id_numbers_present_flag;
   int frame_id_length;
   int delta_frame_id_length;
@@ -314,410 +326,711 @@
   int frame_refs_short_signaling;
 } CurrentFrame;
 
-// Struct containing some frame level features.
+/*!\endcond */
+
+/*!
+ * \brief Frame level features.
+ */
 typedef struct {
+  /*!
+   * If true, CDF update in the symbol encoding/decoding process is disabled.
+   */
   bool disable_cdf_update;
+  /*!
+   * If true, motion vectors are specified to eighth pel precision; and
+   * if false, motion vectors are specified to quarter pel precision.
+   */
   bool allow_high_precision_mv;
-  bool cur_frame_force_integer_mv;  // 0 the default in AOM, 1 only integer
+  /*!
+   * If true, force integer motion vectors; if false, use the default.
+   */
+  bool cur_frame_force_integer_mv;
+  /*!
+   * If true, palette tool and/or intra block copy tools may be used.
+   */
   bool allow_screen_content_tools;
-  bool allow_intrabc;
-  bool allow_warped_motion;
-  // Whether to use previous frames' motion vectors for prediction.
+  bool allow_intrabc;       /*!< If true, intra block copy tool may be used. */
+  bool allow_warped_motion; /*!< If true, frame may use warped motion mode. */
+  /*!
+   * If true, using previous frames' motion vectors for prediction is allowed.
+   */
   bool allow_ref_frame_mvs;
-  bool coded_lossless;  // frame is fully lossless at the coded resolution.
-  bool all_lossless;    // frame is fully lossless at the upscaled resolution.
+  /*!
+   * If true, frame is fully lossless at coded resolution.
+   * */
+  bool coded_lossless;
+  /*!
+   * If true, frame is fully lossless at upscaled resolution.
+   */
+  bool all_lossless;
+  /*!
+   * If true, the frame is restricted to a reduced subset of the full set of
+   * transform types.
+   */
   bool reduced_tx_set_used;
+  /*!
+   * If true, error resilient mode is enabled.
+   * Note: Error resilient mode allows the syntax of a frame to be parsed
+   * independently of previously decoded frames.
+   */
   bool error_resilient_mode;
+  /*!
+   * If false, only MOTION_MODE that may be used is SIMPLE_TRANSLATION;
+   * if true, all MOTION_MODES may be used.
+   */
   bool switchable_motion_mode;
-  TX_MODE tx_mode;
-  InterpFilter interp_filter;
+  TX_MODE tx_mode;            /*!< Transform mode at frame level. */
+  InterpFilter interp_filter; /*!< Interpolation filter at frame level. */
+  /*!
+   * The reference frame that contains the CDF values and other state that
+   * should be loaded at the start of the frame.
+   */
   int primary_ref_frame;
+  /*!
+   * Byte alignment of the planes in the reference buffers.
+   */
   int byte_alignment;
-  // Flag signaling how frame contexts should be updated at the end of
-  // a frame decode
+  /*!
+   * Flag signaling how frame contexts should be updated at the end of
+   * a frame decode.
+   */
   REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
 } FeatureFlags;
 
-// Struct containing params related to tiles.
+/*!
+ * \brief Params related to tiles.
+ */
 typedef struct CommonTileParams {
-  int cols;           // number of tile columns that frame is divided into
-  int rows;           // number of tile rows that frame is divided into
-  int max_width_sb;   // maximum tile width in superblock units.
-  int max_height_sb;  // maximum tile height in superblock units.
-  // Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+  int cols;          /*!< number of tile columns that frame is divided into */
+  int rows;          /*!< number of tile rows that frame is divided into */
+  int max_width_sb;  /*!< maximum tile width in superblock units. */
+  int max_height_sb; /*!< maximum tile height in superblock units. */
+
+  /*!
+   * Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+   */
   int min_inner_width;
 
-  // If true, tiles are uniformly spaced with power-of-two number of rows and
-  // columns.
-  // If false, tiles have explicitly configured widths and heights.
+  /*!
+   * If true, tiles are uniformly spaced with power-of-two number of rows and
+   * columns.
+   * If false, tiles have explicitly configured widths and heights.
+   */
   int uniform_spacing;
 
-  // Following members are only valid when uniform_spacing == 1
-  int log2_cols;  // log2 of 'cols'.
-  int log2_rows;  // log2 of 'rows'.
-  int width;      // tile width in MI units
-  int height;     // tile height in MI units
-  // End of members that are only valid when uniform_spacing == 1
+  /**
+   * \name Members only valid when uniform_spacing == 1
+   */
+  /**@{*/
+  int log2_cols; /*!< log2 of 'cols'. */
+  int log2_rows; /*!< log2 of 'rows'. */
+  int width;     /*!< tile width in MI units */
+  int height;    /*!< tile height in MI units */
+  /**@}*/
 
-  // Min num of tile columns possible based on 'max_width_sb' and frame width.
+  /*!
+   * Min num of tile columns possible based on 'max_width_sb' and frame width.
+   */
   int min_log2_cols;
-  // Min num of tile rows possible based on 'max_height_sb' and frame height.
+  /*!
+   * Min num of tile rows possible based on 'max_height_sb' and frame height.
+   */
   int min_log2_rows;
-  // Min num of tile columns possible based on frame width.
+  /*!
+   * Min num of tile columns possible based on frame width.
+   */
   int max_log2_cols;
-  // Max num of tile columns possible based on frame width.
+  /*!
+   * Max num of tile columns possible based on frame width.
+   */
   int max_log2_rows;
-  // log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+  /*!
+   * log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+   */
   int min_log2;
-  // col_start_sb[i] is the start position of tile column i in superblock units.
-  // valid for 0 <= i <= cols
+  /*!
+   * col_start_sb[i] is the start position of tile column i in superblock units.
+   * valid for 0 <= i <= cols
+   */
   int col_start_sb[MAX_TILE_COLS + 1];
-  // row_start_sb[i] is the start position of tile row i in superblock units.
-  // valid for 0 <= i <= rows
+  /*!
+   * row_start_sb[i] is the start position of tile row i in superblock units.
+   * valid for 0 <= i <= rows
+   */
   int row_start_sb[MAX_TILE_ROWS + 1];
-  // If true, we are using large scale tile mode.
+  /*!
+   * If true, we are using large scale tile mode.
+   */
   unsigned int large_scale;
-  // Only relevant when large_scale == 1.
-  // If true, the independent decoding of a single tile or a section of a frame
-  // is allowed.
+  /*!
+   * Only relevant when large_scale == 1.
+   * If true, the independent decoding of a single tile or a section of a frame
+   * is allowed.
+   */
   unsigned int single_tile_decoding;
 } CommonTileParams;
 
-// Struct containing params related to MB_MODE_INFO arrays and related info.
 typedef struct CommonModeInfoParams CommonModeInfoParams;
+/*!
+ * \brief Params related to MB_MODE_INFO arrays and related info.
+ */
 struct CommonModeInfoParams {
-  // Number of rows/cols in the frame in 16 pixel units.
-  // This is computed from frame width and height aligned to a multiple of 8.
+  /*!
+   * Number of rows in the frame in 16 pixel units.
+   * This is computed from frame height aligned to a multiple of 8.
+   */
   int mb_rows;
+  /*!
+   * Number of cols in the frame in 16 pixel units.
+   * This is computed from frame width aligned to a multiple of 8.
+   */
   int mb_cols;
-  // Total MBs = mb_rows * mb_cols.
+
+  /*!
+   * Total MBs = mb_rows * mb_cols.
+   */
   int MBs;
 
-  // Number of rows/cols in the frame in 4 pixel (MB_MODE_INFO) units.
-  // This is computed from frame width and height aligned to a multiple of 8.
+  /*!
+   * Number of rows in the frame in 4 pixel (MB_MODE_INFO) units.
+   * This is computed from frame height aligned to a multiple of 8.
+   */
   int mi_rows;
+  /*!
+   * Number of cols in the frame in 4 pixel (MB_MODE_INFO) units.
+   * This is computed from frame width aligned to a multiple of 8.
+   */
   int mi_cols;
 
-  // An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
-  // in the frame.
-  // Note: This array should be treated like a scratch memory, and should NOT be
-  // accessed directly, in most cases. Please use 'mi_grid_base' array instead.
+  /*!
+   * An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
+   * in the frame.
+   * Note: This array should be treated like a scratch memory, and should NOT be
+   * accessed directly, in most cases. Please use 'mi_grid_base' array instead.
+   */
   MB_MODE_INFO *mi_alloc;
-  // Number of allocated elements in 'mi_alloc'.
+  /*!
+   * Number of allocated elements in 'mi_alloc'.
+   */
   int mi_alloc_size;
-  // Stride for 'mi_alloc' array.
+  /*!
+   * Stride for 'mi_alloc' array.
+   */
   int mi_alloc_stride;
-  // The minimum block size that each element in 'mi_alloc' can correspond to.
-  // For decoder, this is always BLOCK_4X4.
-  // For encoder, this is currently set to BLOCK_4X4 for resolution < 4k,
-  // and BLOCK_8X8 for resolution >= 4k.
+  /*!
+   * The minimum block size that each element in 'mi_alloc' can correspond to.
+   * For decoder, this is always BLOCK_4X4.
+   * For encoder, this is currently set to BLOCK_4X4 for resolution < 4k,
+   * and BLOCK_8X8 for resolution >= 4k.
+   */
   BLOCK_SIZE mi_alloc_bsize;
 
-  // Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
-  // It's possible that:
-  // - Multiple pointers in the grid point to the same element in 'mi_alloc'
-  // (for example, for all 4x4 blocks that belong to the same partition block).
-  // - Some pointers can be NULL (for example, for blocks outside visible area).
+  /*!
+   * Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
+   * It's possible that:
+   * - Multiple pointers in the grid point to the same element in 'mi_alloc'
+   * (for example, for all 4x4 blocks that belong to the same partition block).
+   * - Some pointers can be NULL (for example, for blocks outside visible area).
+   */
   MB_MODE_INFO **mi_grid_base;
-  // Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+  /*!
+   * Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+   */
   int mi_grid_size;
-  // Stride for 'mi_grid_base' (and 'tx_type_map' also).
+  /*!
+   * Stride for 'mi_grid_base' (and 'tx_type_map' also).
+   */
   int mi_stride;
 
-  // An array of tx types for each 4x4 block in the frame.
-  // Number of allocated elements is same as 'mi_grid_size', and stride is
-  // same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of
-  // 'mi_grid_base'.
+  /*!
+   * An array of tx types for each 4x4 block in the frame.
+   * Number of allocated elements is same as 'mi_grid_size', and stride is
+   * same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of
+   * 'mi_grid_base'.
+   */
   TX_TYPE *tx_type_map;
 
-  // Function pointers to allow separate logic for encoder and decoder.
+  /**
+   * \name Function pointers to allow separate logic for encoder and decoder.
+   */
+  /**@{*/
+  /*!
+   * Free the memory allocated to arrays in 'mi_params'.
+   * \param[in,out]   mi_params   object containing common mode info parameters
+   */
   void (*free_mi)(struct CommonModeInfoParams *mi_params);
+  /*!
+   * Initialize / reset appropriate arrays in 'mi_params'.
+   * \param[in,out]   mi_params   object containing common mode info parameters
+   */
   void (*setup_mi)(struct CommonModeInfoParams *mi_params);
+  /*!
+   * Allocate required memory for arrays in 'mi_params'.
+   * \param[in,out]   mi_params   object containing common mode info parameters
+   * \param           width       frame width
+   * \param           height      frame height
+   */
   void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width,
                     int height);
+  /**@}*/
 };
 
-// Parameters related to quantization at the frame level.
 typedef struct CommonQuantParams CommonQuantParams;
+/*!
+ * \brief Parameters related to quantization at the frame level.
+ */
 struct CommonQuantParams {
-  // Base qindex of the frame in the range 0 to 255.
+  /*!
+   * Base qindex of the frame in the range 0 to 255.
+   */
   int base_qindex;
 
-  // Delta of qindex (from base_qindex) for Y plane DC coefficient.
-  // Note: y_ac_delta_q is implicitly 0.
+  /*!
+   * Delta of qindex (from base_qindex) for Y plane DC coefficient.
+   * Note: y_ac_delta_q is implicitly 0.
+   */
   int y_dc_delta_q;
 
-  // Delta of qindex (from base_qindex) for U plane DC and AC coefficients.
+  /*!
+   * Delta of qindex (from base_qindex) for U plane DC coefficients.
+   */
   int u_dc_delta_q;
+  /*!
+   * Delta of qindex (from base_qindex) for U plane AC coefficients.
+   */
   int v_dc_delta_q;
 
-  // Delta of qindex (from base_qindex) for V plane DC and AC coefficients.
-  // Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0.
+  /*!
+   * Delta of qindex (from base_qindex) for V plane DC coefficients.
+   * Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0.
+   */
   int u_ac_delta_q;
+  /*!
+   * Delta of qindex (from base_qindex) for V plane AC coefficients.
+   * Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0.
+   */
   int v_ac_delta_q;
 
-  // Note: The qindex per superblock may have a delta from the qindex obtained
-  // at frame level from parameters above, based on 'cm->delta_q_info'.
+  /*
+   * Note: The qindex per superblock may have a delta from the qindex obtained
+   * at frame level from parameters above, based on 'cm->delta_q_info'.
+   */
 
-  // The dequantizers below are true dequantizers used only in the
-  // dequantization process.  They have the same coefficient
-  // shift/scale as TX.
-  int16_t y_dequant_QTX[MAX_SEGMENTS][2];
-  int16_t u_dequant_QTX[MAX_SEGMENTS][2];
-  int16_t v_dequant_QTX[MAX_SEGMENTS][2];
+  /**
+   * \name True dequantizers.
+   * The dequantizers below are true dequantizers used only in the
+   * dequantization process.  They have the same coefficient
+   * shift/scale as TX.
+   */
+  /**@{*/
+  int16_t y_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for Y plane */
+  int16_t u_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for U plane */
+  int16_t v_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for V plane */
+  /**@}*/
 
-  // Global quant matrix tables
+  /**
+   * \name Global quantization matrix tables.
+   */
+  /**@{*/
+  /*!
+   * Global dquantization matrix table.
+   */
   const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+  /*!
+   * Global quantization matrix table.
+   */
   const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+  /**@}*/
 
-  // Local quant matrix tables for each frame
+  /**
+   * \name Local dequantization matrix tables for each frame.
+   */
+  /**@{*/
+  /*!
+   * Local dequant matrix for Y plane.
+   */
   const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  /*!
+   * Local dequant matrix for U plane.
+   */
   const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  /*!
+   * Local dequant matrix for V plane.
+   */
   const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  /**@}*/
 
-  // Flag indicating whether quantization matrices are being used:
-  //  - If true, qm_level_y, qm_level_u and qm_level_v indicate the level
-  //    indices to be used to access appropriate global quant matrix tables.
-  //  - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+  /*!
+   * Flag indicating whether quantization matrices are being used:
+   *  - If true, qm_level_y, qm_level_u and qm_level_v indicate the level
+   *    indices to be used to access appropriate global quant matrix tables.
+   *  - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+   */
   bool using_qmatrix;
-  int qmatrix_level_y;
-  int qmatrix_level_u;
-  int qmatrix_level_v;
+  /**
+   * \name Valid only when using_qmatrix == true
+   * Indicate the level indices to be used to access appropriate global quant
+   * matrix tables.
+   */
+  /**@{*/
+  int qmatrix_level_y; /*!< Level index for Y plane */
+  int qmatrix_level_u; /*!< Level index for U plane */
+  int qmatrix_level_v; /*!< Level index for V plane */
+  /**@}*/
 };
 
-// Context used for transmitting various symbols in the bistream.
 typedef struct CommonContexts CommonContexts;
+/*!
+ * \brief Contexts used for transmitting various symbols in the bitstream.
+ */
 struct CommonContexts {
-  // Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
-  // partition[i][j] is the context for ith tile row, jth mi_col.
+  /*!
+   * Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
+   * partition[i][j] is the context for ith tile row, jth mi_col.
+   */
   PARTITION_CONTEXT **partition;
 
-  // Context used to derive context for multiple symbols:
-  // - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
-  // to transmit skip_txfm flag.
-  // - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
-  // sign.
-  // entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+  /*!
+   * Context used to derive context for multiple symbols:
+   * - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
+   * to transmit skip_txfm flag.
+   * - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
+   * sign.
+   * entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+   */
   ENTROPY_CONTEXT **entropy[MAX_MB_PLANE];
 
-  // Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
-  // transmit 'is_split' flag to indicate if this transform block should be
-  // split into smaller sub-blocks.
-  // txfm[i][j] is the context for ith tile row, jth mi_col.
+  /*!
+   * Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
+   * transmit 'is_split' flag to indicate if this transform block should be
+   * split into smaller sub-blocks.
+   * txfm[i][j] is the context for ith tile row, jth mi_col.
+   */
   TXFM_CONTEXT **txfm;
 
-  // Dimensions that were used to allocate the arrays above.
-  // If these dimensions change, the arrays may have to be re-allocated.
-  int num_planes;     // Corresponds to av1_num_planes(cm)
-  int num_tile_rows;  // Corresponds to cm->tiles.row
-  int num_mi_cols;    // Corresponds to cm->mi_params.mi_cols
+  /*!
+   * Dimensions that were used to allocate the arrays above.
+   * If these dimensions change, the arrays may have to be re-allocated.
+   */
+  int num_planes;    /*!< Corresponds to av1_num_planes(cm) */
+  int num_tile_rows; /*!< Corresponds to cm->tiles.row */
+  int num_mi_cols;   /*!< Corresponds to cm->mi_params.mi_cols */
 };
 
+/*!
+ * \brief Top level common structure used by both encoder and decoder.
+ */
 typedef struct AV1Common {
-  // Information about the current frame that is being coded.
+  /*!
+   * Information about the current frame that is being coded.
+   */
   CurrentFrame current_frame;
-  // Code and details about current error status.
+  /*!
+   * Code and details about current error status.
+   */
   struct aom_internal_error_info error;
 
-  // AV1 allows two types of frame scaling operations:
-  // (1) Frame super-resolution: that allows coding a frame at lower resolution
-  // and after decoding the frame, normatively uscales and restores the frame --
-  // inside the coding loop.
-  // (2) Frame resize: that allows coding frame at lower/higher resolution, and
-  // then non-normatively upscale the frame at the time of rendering -- outside
-  // the coding loop.
-  // Hence, the need for 3 types of dimensions.
+  /*!
+   * AV1 allows two types of frame scaling operations:
+   * 1. Frame super-resolution: that allows coding a frame at lower resolution
+   * and after decoding the frame, normatively uscales and restores the frame --
+   * inside the coding loop.
+   * 2. Frame resize: that allows coding frame at lower/higher resolution, and
+   * then non-normatively upscale the frame at the time of rendering -- outside
+   * the coding loop.
+   * Hence, the need for 3 types of dimensions.
+   */
 
-  // Coded frame dimensions.
-  int width;
-  int height;
+  /**
+   * \name Coded frame dimensions.
+   */
+  /**@{*/
+  int width;  /*!< Coded frame width */
+  int height; /*!< Coded frame height */
+  /**@}*/
 
-  // Rendered frame dimensions, after applying both super-resolution and resize
-  // to the coded frame.
-  // Different from coded dimensions if super-resolution and/or resize are
-  // being used for this frame.
-  int render_width;
-  int render_height;
+  /**
+   * \name Rendered frame dimensions.
+   * Dimensions after applying both super-resolution and resize to the coded
+   * frame. Different from coded dimensions if super-resolution and/or resize
+   * are being used for this frame.
+   */
+  /**@{*/
+  int render_width;  /*!< Rendered frame width */
+  int render_height; /*!< Rendered frame height */
+  /**@}*/
 
-  // Frame dimensions after applying super-resolution to the coded frame (if
-  // present), but before applying resize.
-  // Larger than the coded dimensions if super-resolution is being used for
-  // this frame.
-  // Different from rendered dimensions if resize is being used for this frame.
-  int superres_upscaled_width;
-  int superres_upscaled_height;
+  /**
+   * \name Super-resolved frame dimensions.
+   * Frame dimensions after applying super-resolution to the coded frame (if
+   * present), but before applying resize.
+   * Larger than the coded dimensions if super-resolution is being used for
+   * this frame.
+   * Different from rendered dimensions if resize is being used for this frame.
+   */
+  /**@{*/
+  int superres_upscaled_width;  /*!< Super-resolved frame width */
+  int superres_upscaled_height; /*!< Super-resolved frame height */
+  /**@}*/
 
-  // The denominator of the superres scale used by this frame.
-  // Note: The numerator is fixed to be SCALE_NUMERATOR.
+  /*!
+   * The denominator of the superres scale used by this frame.
+   * Note: The numerator is fixed to be SCALE_NUMERATOR.
+   */
   uint8_t superres_scale_denominator;
 
-  // If true, buffer removal times are present.
+  /*!
+   * If true, buffer removal times are present.
+   */
   bool buffer_removal_time_present;
-  // buffer_removal_times[op_num] specifies the frame removal time in units of
-  // DecCT clock ticks counted from the removal time of the last random access
-  // point for operating point op_num.
-  // TODO(urvang): We probably don't need the +1 here.
+  /*!
+   * buffer_removal_times[op_num] specifies the frame removal time in units of
+   * DecCT clock ticks counted from the removal time of the last random access
+   * point for operating point op_num.
+   * TODO(urvang): We probably don't need the +1 here.
+   */
   uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1];
-  // Presentation time of the frame in clock ticks DispCT counted from the
-  // removal time of the last random access point for the operating point that
-  // is being decoded.
+  /*!
+   * Presentation time of the frame in clock ticks DispCT counted from the
+   * removal time of the last random access point for the operating point that
+   * is being decoded.
+   */
   uint32_t frame_presentation_time;
 
-  // Buffer where previous frame is stored.
+  /*!
+   * Buffer where previous frame is stored.
+   */
   RefCntBuffer *prev_frame;
 
-  // Buffer into which the current frame will be stored and other related info.
-  // TODO(hkuang): Combine this with cur_buf in macroblockd.
+  /*!
+   * Buffer into which the current frame will be stored and other related info.
+   * TODO(hkuang): Combine this with cur_buf in macroblockd.
+   */
   RefCntBuffer *cur_frame;
 
-  // For encoder, we have a two-level mapping from reference frame type to the
-  // corresponding buffer in the buffer pool:
-  // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ...
-  // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1)
-  // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to
-  // the reference counted buffer structure RefCntBuffer, taken from the buffer
-  // pool cm->buffer_pool->frame_bufs.
-  //
-  // LAST_FRAME,                        ...,      EXTREF_FRAME
-  //      |                                           |
-  //      v                                           v
-  // remapped_ref_idx[LAST_FRAME - 1],  ...,  remapped_ref_idx[EXTREF_FRAME - 1]
-  //      |                                           |
-  //      v                                           v
-  // ref_frame_map[],                   ...,     ref_frame_map[]
-  //
-  // Note: INTRA_FRAME always refers to the current frame, so there's no need to
-  // have a remapped index for the same.
+  /*!
+   * For encoder, we have a two-level mapping from reference frame type to the
+   * corresponding buffer in the buffer pool:
+   * * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ...
+   * EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1)
+   * * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to
+   * the reference counted buffer structure RefCntBuffer, taken from the buffer
+   * pool cm->buffer_pool->frame_bufs.
+   *
+   * LAST_FRAME,                        ...,      EXTREF_FRAME
+   *      |                                           |
+   *      v                                           v
+   * remapped_ref_idx[LAST_FRAME - 1],  ...,  remapped_ref_idx[EXTREF_FRAME - 1]
+   *      |                                           |
+   *      v                                           v
+   * ref_frame_map[],                   ...,     ref_frame_map[]
+   *
+   * Note: INTRA_FRAME always refers to the current frame, so there's no need to
+   * have a remapped index for the same.
+   */
   int remapped_ref_idx[REF_FRAMES];
 
-  // Scale of the current frame with respect to itself.
-  // This is currently used for intra block copy, which behaves like an inter
-  // prediction mode, where the reference frame is the current frame itself.
+  /*!
+   * Scale of the current frame with respect to itself.
+   * This is currently used for intra block copy, which behaves like an inter
+   * prediction mode, where the reference frame is the current frame itself.
+   */
   struct scale_factors sf_identity;
 
-  // Scale factors of the reference frame with respect to the current frame.
-  // This is required for generating inter prediction and will be non-identity
-  // for a reference frame, if it has different dimensions than the coded
-  // dimensions of the current frame.
+  /*!
+   * Scale factors of the reference frame with respect to the current frame.
+   * This is required for generating inter prediction and will be non-identity
+   * for a reference frame, if it has different dimensions than the coded
+   * dimensions of the current frame.
+   */
   struct scale_factors ref_scale_factors[REF_FRAMES];
 
-  // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
-  // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
-  // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps
-  // remapped reference index 'j' (that is, original reference type 'i') to
-  // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+  /*!
+   * For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
+   * the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+   * For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps
+   * remapped reference index 'j' (that is, original reference type 'i') to
+   * a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+   */
   RefCntBuffer *ref_frame_map[REF_FRAMES];
 
-  // If true, this frame is actually shown after decoding.
-  // If false, this frame is coded in the bitstream, but not shown. It is only
-  // used as a reference for other frames coded later.
+  /*!
+   * If true, this frame is actually shown after decoding.
+   * If false, this frame is coded in the bitstream, but not shown. It is only
+   * used as a reference for other frames coded later.
+   */
   int show_frame;
 
-  // If true, this frame can be used as a show-existing frame for other frames
-  // coded later.
-  // When 'show_frame' is true, this is always true for all non-keyframes.
-  // When 'show_frame' is false, this value is transmitted in the bitstream.
+  /*!
+   * If true, this frame can be used as a show-existing frame for other frames
+   * coded later.
+   * When 'show_frame' is true, this is always true for all non-keyframes.
+   * When 'show_frame' is false, this value is transmitted in the bitstream.
+   */
   int showable_frame;
 
-  // If true, show an existing frame coded before, instead of actually coding a
-  // frame. The existing frame comes from one of the existing reference buffers,
-  // as signaled in the bitstream.
+  /*!
+   * If true, show an existing frame coded before, instead of actually coding a
+   * frame. The existing frame comes from one of the existing reference buffers,
+   * as signaled in the bitstream.
+   */
   int show_existing_frame;
 
-  // Whether some features are allowed or not.
+  /*!
+   * Whether some features are allowed or not.
+   */
   FeatureFlags features;
 
-  // Params related to MB_MODE_INFO arrays and related info.
+  /*!
+   * Params related to MB_MODE_INFO arrays and related info.
+   */
   CommonModeInfoParams mi_params;
 
 #if CONFIG_ENTROPY_STATS
+  /*!
+   * Context type used by token CDFs, in the range 0 .. (TOKEN_CDF_Q_CTXS - 1).
+   */
   int coef_cdf_category;
-#endif
-  // Quantization params.
+#endif  // CONFIG_ENTROPY_STATS
+
+  /*!
+   * Quantization params.
+   */
   CommonQuantParams quant_params;
 
-  // Segmentation info for current frame.
+  /*!
+   * Segmentation info for current frame.
+   */
   struct segmentation seg;
 
-  // Segmentation map for previous frame.
+  /*!
+   * Segmentation map for previous frame.
+   */
   uint8_t *last_frame_seg_map;
 
-  // Deblocking filter parameters.
-  loop_filter_info_n lf_info;
-  struct loopfilter lf;
+  /**
+   * \name Deblocking filter parameters.
+   */
+  /**@{*/
+  loop_filter_info_n lf_info; /*!< Loop filter info */
+  struct loopfilter lf;       /*!< Loop filter parameters */
+  /**@}*/
 
-  // Loop Restoration filter parameters.
-  RestorationInfo rst_info[MAX_MB_PLANE];  // Loop Restoration filter info.
-  int32_t *rst_tmpbuf;  // Scratch buffer for self-guided restoration filter.
-  RestorationLineBuffers *rlbs;  // Line buffers required by loop restoration.
-  YV12_BUFFER_CONFIG rst_frame;  // Stores the output of loop restoration.
+  /**
+   * \name Loop Restoration filter parameters.
+   */
+  /**@{*/
+  RestorationInfo rst_info[MAX_MB_PLANE]; /*!< Loop Restoration filter info */
+  int32_t *rst_tmpbuf; /*!< Scratch buffer for self-guided restoration */
+  RestorationLineBuffers *rlbs; /*!< Line buffers needed by loop restoration */
+  YV12_BUFFER_CONFIG rst_frame; /*!< Stores the output of loop restoration */
+  /**@}*/
 
-  // CDEF (Constrained Directional Enhancement Filter) parameters.
+  /*!
+   * CDEF (Constrained Directional Enhancement Filter) parameters.
+   */
   CdefInfo cdef_info;
 
-  // Parameters for film grain synthesis.
+  /*!
+   * Parameters for film grain synthesis.
+   */
   aom_film_grain_t film_grain_params;
 
-  // Parameters for delta quantization and delta loop filter level.
+  /*!
+   * Parameters for delta quantization and delta loop filter level.
+   */
   DeltaQInfo delta_q_info;
 
-  // Global motion parameters for each reference frame.
+  /*!
+   * Global motion parameters for each reference frame.
+   */
   WarpedMotionParams global_motion[REF_FRAMES];
 
-  // Elements part of the sequence header, that are applicable for all the
-  // frames in the video.
+  /*!
+   * Elements part of the sequence header, that are applicable for all the
+   * frames in the video.
+   */
   SequenceHeader seq_params;
 
-  // Current CDFs of all the symbols for the current frame.
+  /*!
+   * Current CDFs of all the symbols for the current frame.
+   */
   FRAME_CONTEXT *fc;
-  // Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
-  // (e.g. for a keyframe). These default CDFs are defined by the bitstream and
-  // copied from default CDF tables for each symbol.
+  /*!
+   * Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
+   * (e.g. for a keyframe). These default CDFs are defined by the bitstream and
+   * copied from default CDF tables for each symbol.
+   */
   FRAME_CONTEXT *default_frame_context;
 
-  // Parameters related to tiling.
+  /*!
+   * Parameters related to tiling.
+   */
   CommonTileParams tiles;
 
-  // External BufferPool passed from outside.
+  /*!
+   * External BufferPool passed from outside.
+   */
   BufferPool *buffer_pool;
 
-  // Above context buffers and their sizes.
-  // Note: above contexts are allocated in this struct, as their size is
-  // dependent on frame width, while left contexts are declared and allocated in
-  // MACROBLOCKD struct, as they have a fixed size.
+  /*!
+   * Above context buffers and their sizes.
+   * Note: above contexts are allocated in this struct, as their size is
+   * dependent on frame width, while left contexts are declared and allocated in
+   * MACROBLOCKD struct, as they have a fixed size.
+   */
   CommonContexts above_contexts;
 
-  // When cm->seq_params.frame_id_numbers_present_flag == 1, current and
-  // reference frame IDs are signaled in the bitstream.
-  int current_frame_id;
-  int ref_frame_id[REF_FRAMES];
+  /**
+   * \name Signaled when cm->seq_params.frame_id_numbers_present_flag == 1
+   */
+  /**@{*/
+  int current_frame_id;         /*!< frame ID for the current frame. */
+  int ref_frame_id[REF_FRAMES]; /*!< frame IDs for the reference frames. */
+  /**@}*/
 
-  // Motion vectors provided by motion field estimation.
-  // tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
-  // mi_row = 2 * row,
-  // mi_col = 2 * col, and
-  // stride = cm->mi_params.mi_stride / 2
+  /*!
+   * Motion vectors provided by motion field estimation.
+   * tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
+   * mi_row = 2 * row,
+   * mi_col = 2 * col, and
+   * stride = cm->mi_params.mi_stride / 2
+   */
   TPL_MV_REF *tpl_mvs;
-  // Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
+  /*!
+   * Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
+   */
   int tpl_mvs_mem_size;
-  // ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and
-  // current frame is positive; and 0 otherwise.
+  /*!
+   * ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and
+   * current frame is positive; and 0 otherwise.
+   */
   int ref_frame_sign_bias[REF_FRAMES];
-  // ref_frame_side[k] is 1 if relative distance between reference 'k' and
-  // current frame is positive, -1 if relative distance is 0; and 0 otherwise.
-  // TODO(jingning): This can be combined with sign_bias later.
+  /*!
+   * ref_frame_side[k] is 1 if relative distance between reference 'k' and
+   * current frame is positive, -1 if relative distance is 0; and 0 otherwise.
+   * TODO(jingning): This can be combined with sign_bias later.
+   */
   int8_t ref_frame_side[REF_FRAMES];
 
-  // Number of temporal layers: may be > 1 for SVC (scalable vector coding).
+  /*!
+   * Number of temporal layers: may be > 1 for SVC (scalable vector coding).
+   */
   unsigned int number_temporal_layers;
-  // Temporal layer ID of this frame
-  // (in the range 0 ... (number_temporal_layers - 1)).
+  /*!
+   * Temporal layer ID of this frame
+   * (in the range 0 ... (number_temporal_layers - 1)).
+   */
   int temporal_layer_id;
 
-  // Number of spatial layers: may be > 1 for SVC (scalable vector coding).
+  /*!
+   * Number of spatial layers: may be > 1 for SVC (scalable vector coding).
+   */
   unsigned int number_spatial_layers;
-  // Spatial layer ID of this frame
-  // (in the range 0 ... (number_spatial_layers - 1)).
+  /*!
+   * Spatial layer ID of this frame
+   * (in the range 0 ... (number_spatial_layers - 1)).
+   */
   int spatial_layer_id;
 
 #if TXCOEFF_TIMER
@@ -737,6 +1050,8 @@
 #endif  // CONFIG_LPF_MASK
 } AV1_COMMON;
 
+/*!\cond */
+
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
 // frame reference count.
 static void lock_buffer_pool(BufferPool *const pool) {
@@ -938,14 +1253,11 @@
   xd->above_txfm_context = above_contexts->txfm[tile_row];
 }
 
-static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        tran_low_t *dqcoeff) {
+static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) {
   const int num_planes = av1_num_planes(cm);
   const CommonQuantParams *const quant_params = &cm->quant_params;
 
   for (int i = 0; i < num_planes; ++i) {
-    xd->plane[i].dqcoeff = dqcoeff;
-
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
       memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX,
              sizeof(quant_params->y_dequant_QTX));
@@ -979,7 +1291,7 @@
   for (i = 0; i < num_planes; ++i) {
     struct macroblockd_plane *const pd = &xd->plane[i];
     // Offset the buffer pointer
-    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+    const BLOCK_SIZE bsize = xd->mi[0]->bsize;
     if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
       row_offset = mi_row - 1;
     if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
@@ -1072,16 +1384,17 @@
 
   xd->height = bh;
   xd->width = bw;
-  xd->is_sec_rect = 0;
+
+  xd->is_last_vertical_rect = 0;
   if (xd->width < xd->height) {
-    // Only mark is_sec_rect as 1 for the last block.
-    // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
-    // For other partitions, it would be (0, 1).
-    if (!((mi_col + xd->width) & (xd->height - 1))) xd->is_sec_rect = 1;
+    if (!((mi_col + xd->width) & (xd->height - 1))) {
+      xd->is_last_vertical_rect = 1;
+    }
   }
 
+  xd->is_first_horizontal_rect = 0;
   if (xd->width > xd->height)
-    if (mi_row & (xd->width - 1)) xd->is_sec_rect = 1;
+    if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1;
 }
 
 static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
@@ -1447,7 +1760,9 @@
 
   const int offset = mi_row * mi_params->mi_stride + mi_col;
   MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
-  const BLOCK_SIZE subsize = mi[0]->sb_type;
+  const BLOCK_SIZE subsize = mi[0]->bsize;
+
+  assert(bsize < BLOCK_SIZES_ALL);
 
   if (subsize == bsize) return PARTITION_NONE;
 
@@ -1470,7 +1785,7 @@
       if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
       assert(sshigh * 2 == bhigh);
 
-      if (mbmi_below->sb_type == subsize)
+      if (mbmi_below->bsize == subsize)
         return PARTITION_HORZ;
       else
         return PARTITION_HORZ_B;
@@ -1481,7 +1796,7 @@
       if (sswide * 4 == bwide) return PARTITION_VERT_4;
       assert(sswide * 2 == bhigh);
 
-      if (mbmi_right->sb_type == subsize)
+      if (mbmi_right->bsize == subsize)
         return PARTITION_VERT;
       else
         return PARTITION_VERT_B;
@@ -1495,8 +1810,8 @@
       // it's PARTITION_SPLIT.
       if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
 
-      if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
-      if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
+      if (mi_size_wide[mbmi_below->bsize] == bwide) return PARTITION_HORZ_A;
+      if (mi_size_high[mbmi_right->bsize] == bhigh) return PARTITION_VERT_A;
 
       return PARTITION_SPLIT;
     }
@@ -1550,6 +1865,8 @@
           seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3);
 }
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index c756760..caa15c2 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -199,11 +199,11 @@
   TX_SIZE tx_size =
       (plane == AOM_PLANE_Y)
           ? mbmi->tx_size
-          : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x,
+          : av1_get_max_uv_txsize(mbmi->bsize, plane_ptr->subsampling_x,
                                   plane_ptr->subsampling_y);
   assert(tx_size < TX_SIZES_ALL);
-  if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip) {
-    const BLOCK_SIZE sb_type = mbmi->sb_type;
+  if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip_txfm) {
+    const BLOCK_SIZE sb_type = mbmi->bsize;
     const int blk_row = mi_row & (mi_size_high[sb_type] - 1);
     const int blk_col = mi_col & (mi_size_wide[sb_type] - 1);
     const TX_SIZE mb_tx_size =
@@ -279,7 +279,7 @@
     {
       const uint32_t curr_level =
           av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
-      const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
+      const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
       uint32_t level = curr_level;
       if (coord) {
         {
@@ -295,10 +295,10 @@
           const uint32_t pv_lvl =
               av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
 
-          const int pv_skip = mi_prev->skip && is_inter_block(mi_prev);
-          const BLOCK_SIZE bsize =
-              get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x,
-                                   plane_ptr->subsampling_y);
+          const int pv_skip_txfm =
+              mi_prev->skip_txfm && is_inter_block(mi_prev);
+          const BLOCK_SIZE bsize = get_plane_block_size(
+              mbmi->bsize, plane_ptr->subsampling_x, plane_ptr->subsampling_y);
           assert(bsize < BLOCK_SIZES_ALL);
           const int prediction_masks = edge_dir == VERT_EDGE
                                            ? block_size_wide[bsize] - 1
@@ -307,7 +307,7 @@
           // if the current and the previous blocks are skipped,
           // deblock the edge if the edge belongs to a PU's edge only.
           if ((curr_level || pv_lvl) &&
-              (!pv_skip || !curr_skipped || pu_edge)) {
+              (!pv_skip_txfm || !curr_skipped || pu_edge)) {
             const TX_SIZE min_ts = AOMMIN(ts, pv_ts);
             if (TX_4X4 >= min_ts) {
               params->filter_length = 4;
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index ce26d16..ca16bbe 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -33,6 +33,7 @@
   LF_PATH_SLOW,
 };
 
+/*!\cond */
 enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR);
 typedef struct {
   uint64_t bits[4];
@@ -118,6 +119,16 @@
   uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS];
 } loop_filter_info_n;
 
+typedef struct LoopFilterWorkerData {
+  YV12_BUFFER_CONFIG *frame_buffer;
+  struct AV1Common *cm;
+  struct macroblockd_plane planes[MAX_MB_PLANE];
+  // TODO(Ranjit): When the filter functions are modified to use xd->lossless
+  // add lossless as a member here.
+  MACROBLOCKD *xd;
+} LFWorkerData;
+/*!\endcond */
+
 /* assorted loopfilter functions which get used elsewhere */
 struct AV1Common;
 struct macroblockd;
@@ -128,6 +139,11 @@
 void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
                                 int plane_end);
 
+/*!\brief Apply AV1 loop filter
+ *
+ * \ingroup in_loop_filter
+ * \callgraph
+ */
 #if CONFIG_LPF_MASK
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                            struct macroblockd *xd, int is_decoding,
@@ -148,15 +164,6 @@
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col);
 
-typedef struct LoopFilterWorkerData {
-  YV12_BUFFER_CONFIG *frame_buffer;
-  struct AV1Common *cm;
-  struct macroblockd_plane planes[MAX_MB_PLANE];
-  // TODO(Ranjit): When the filter functions are modified to use xd->lossless
-  // add lossless as a member here.
-  MACROBLOCKD *xd;
-} LFWorkerData;
-
 uint8_t av1_get_filter_level(const struct AV1Common *cm,
                              const loop_filter_info_n *lfi_n, const int dir_idx,
                              int plane, const MB_MODE_INFO *mbmi);
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 296c6c5..2264b80 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -103,15 +103,15 @@
 
 # directional intra predictor functions
 add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
-specialize qw/av1_dr_prediction_z1 avx2/;
+specialize qw/av1_dr_prediction_z1 avx2 neon/;
 add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
-specialize qw/av1_dr_prediction_z2 avx2/;
+specialize qw/av1_dr_prediction_z2 avx2 neon/;
 add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
-specialize qw/av1_dr_prediction_z3 avx2/;
+specialize qw/av1_dr_prediction_z3 avx2 neon/;
 
 # FILTER_INTRA predictor functions
 add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
-specialize qw/av1_filter_intra_predictor sse4_1/;
+specialize qw/av1_filter_intra_predictor sse4_1 neon/;
 
 # High bitdepth functions
 
@@ -136,20 +136,81 @@
 specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
 
 add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2 neon/;
 
 add_proto qw/void av1_highbd_inv_txfm_add_4x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_8x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_4x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_8x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_4x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_16x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x32/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x16  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x64/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x64  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x32/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x64/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x64  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x32/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x64  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x64/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x64  neon/;
+
+add_proto qw/void av1_inv_txfm2d_add_4x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x8 neon/;
+add_proto qw/void av1_inv_txfm2d_add_4x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x8 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_4x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x16 neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x16  neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x8  neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x32  neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x16  neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x32  neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x64  neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x32  neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x64  neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x32  neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x8  neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x64  neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x16  neon/;
 
 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -183,6 +244,7 @@
   add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
   specialize qw/av1_highbd_dr_prediction_z1 avx2/;
   add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
+
   specialize qw/av1_highbd_dr_prediction_z2 avx2/;
   add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
   specialize qw/av1_highbd_dr_prediction_z3 avx2/;
@@ -202,6 +264,10 @@
 add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
 specialize "av1_round_shift_array", qw/sse4_1 neon/;
 
+# Resize functions.
+add_proto qw/void av1_resize_and_extend_frame/, "const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes";
+specialize qw/av1_resize_and_extend_frame ssse3 neon/;
+
 #
 # Encoder functions below this point.
 #
@@ -225,75 +291,79 @@
 
 
   add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/av1_quantize_fp_32x32 avx2/;
+  specialize qw/av1_quantize_fp_32x32 neon avx2/;
 
   add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/av1_quantize_fp_64x64 avx2/;
+  specialize qw/av1_quantize_fp_64x64 neon avx2/;
+
+  add_proto qw/void aom_quantize_b_helper/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale";
+  specialize qw/aom_quantize_b_helper neon/;
 
   # fdct functions
 
   add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/av1_fwht4x4 neon/;
 
   #fwd txfm
   add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
-  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
+  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2 neon/;
 
   add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x8 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_4x8 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x4 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_8x4 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x32 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_16x32 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x16 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_32x16 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x16 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_4x16 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x4 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_16x4 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x32 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_8x32 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x8 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_32x8 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_4x4 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2 neon/;
 
   add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x64 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_32x64 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_64x32 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_64x32 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x64 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_16x64 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_64x16 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_64x16 sse4_1 neon/;
 
   #
   # Motion search
   #
-  add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
-
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-    specialize qw/av1_apply_temporal_filter_yuv sse4_1/;
+    add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    specialize qw/av1_apply_temporal_filter sse2 avx2/;
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
   }
-
-  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-    specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
   }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
+add_proto qw/void av1_calc_indices_dim1/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
+specialize qw/av1_calc_indices_dim1 avx2/;
+
   # ENCODEMB INVOKE
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
@@ -306,14 +376,15 @@
   }
 
   add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/av1_highbd_fwht4x4 neon/;
 
   # End av1_high encoder functions
 
   # txb
   add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
-  specialize qw/av1_get_nz_map_contexts sse2/;
+  specialize qw/av1_get_nz_map_contexts sse2 neon/;
   add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
-  specialize qw/av1_txb_init_levels sse4_1 avx2/;
+  specialize qw/av1_txb_init_levels sse4_1 avx2 neon/;
 
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
@@ -326,29 +397,29 @@
   add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
   specialize qw/av1_get_crc32c_value sse4_2/;
 
-  add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H";
-  specialize qw/av1_compute_stats sse4_1 avx2/;
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H";
+    specialize qw/av1_compute_stats sse4_1 avx2/;
+    add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+    specialize qw/av1_calc_proj_params sse4_1 avx2/;
+    add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+    specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2 neon/;
 
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void av1_compute_stats_highbd/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
-    specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void av1_calc_proj_params_high_bd/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+      specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2/;
+      add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+      specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
+      add_proto qw/void av1_compute_stats_highbd/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
+      specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+    }
   }
 
-  add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
-  specialize qw/av1_calc_proj_params avx2/;
-
-  add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
-  specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
-
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
-    specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
-  }
   add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
-  specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
+  specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;
 
   add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
-  specialize qw/av1_nn_predict sse3/;
+  specialize qw/av1_nn_predict sse3 neon/;
 }
 # end encoder functions
 
@@ -380,17 +451,18 @@
 }
 
 # WARPED_MOTION / GLOBAL_MOTION functions
-
-add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1 avx2 neon/;
-
-if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes" && aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
   add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-  specialize qw/av1_highbd_warp_affine sse4_1/;
+  specialize qw/av1_highbd_warp_affine sse4_1 avx2/;
 }
 
-add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
-specialize qw/av1_calc_frame_error sse2 avx2/;
+if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+  add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+  specialize qw/av1_warp_affine sse4_1 avx2 neon/;
+
+  add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
+  specialize qw/av1_calc_frame_error sse2 avx2/;
+}
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
@@ -398,41 +470,39 @@
 }
 
 # LOOP_RESTORATION functions
+if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+  add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+  specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
 
-add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
-specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
-
-add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
-                                 int sgr_params_idx, int bit_depth, int highbd";
-specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
+  add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                  int sgr_params_idx, int bit_depth, int highbd";
+  specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
+}
 
 # CONVOLVE_ROUND/COMPOUND_ROUND functions
 
 add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
 add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params";
 if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
   add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
   add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params, int bd";
   add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd";
 }
 
   add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
 
   specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
-  specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_2d_scale sse4_1/;
@@ -445,7 +515,6 @@
     specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/;
     specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/;
     specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/;
-    specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
     specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
     specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
     specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 47597bc..0eb212c 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -39,6 +39,8 @@
 
 #define INTERINTRA_WEDGE_SIGN 0
 
+/*!\cond */
+
 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
 enum {
   DIFFWTD_38 = 0,
@@ -188,7 +190,7 @@
   // rate/dist.
   int64_t rdcost;
   int64_t sse;
-  int skip;  // sse should equal to dist when skip == 1
+  int skip_txfm;  // sse should equal to dist when skip_txfm == 1
   int zero_rate;
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost[MAX_MB_PLANE];
@@ -212,64 +214,133 @@
 
 #define INTER_TX_SIZE_BUF_LEN 16
 #define TXK_TYPE_BUF_LEN 64
-// This structure now relates to 4x4 block regions.
+/*!\endcond */
+
+/*! \brief Stores the prediction/txfm mode of the current coding block
+ */
 typedef struct MB_MODE_INFO {
-  // interinter members
-  INTERINTER_COMPOUND_DATA interinter_comp;
-  WarpedMotionParams wm_params;
-  int_mv mv[2];
+  /*****************************************************************************
+   * \name General Info of the Coding Block
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief The block size of the current coding block */
+  BLOCK_SIZE bsize;
+  /*! \brief The partition type of the current coding block. */
+  PARTITION_TYPE partition;
+  /*! \brief The prediction mode used */
+  PREDICTION_MODE mode;
+  /*! \brief The UV mode when intra is used */
+  UV_PREDICTION_MODE uv_mode;
+  /*! \brief The q index for the current coding block. */
   int current_qindex;
-  // Only for INTER blocks
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Mode Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief The motion vectors used by the current inter mode */
+  int_mv mv[2];
+  /*! \brief The reference frames for the MV */
+  MV_REFERENCE_FRAME ref_frame[2];
+  /*! \brief Filter used in subpel interpolation. */
   int_interpfilters interp_filters;
-  // TODO(debargha): Consolidate these flags
+  /*! \brief The motion mode used by the inter prediction. */
+  MOTION_MODE motion_mode;
+  /*! \brief Number of samples used by warp causal */
+  uint8_t num_proj_ref;
+  /*! \brief The number of overlapped neighbors above/left for obmc/warp motion
+   * mode. */
+  uint8_t overlappable_neighbors;
+  /*! \brief The parameters used in warp motion mode. */
+  WarpedMotionParams wm_params;
+  /*! \brief The type of intra mode used by inter-intra */
+  INTERINTRA_MODE interintra_mode;
+  /*! \brief The type of wedge used in interintra mode. */
+  int8_t interintra_wedge_index;
+  /*! \brief Struct that stores the data used in interinter compound mode. */
+  INTERINTER_COMPOUND_DATA interinter_comp;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Intra Mode Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Directional mode delta: the angle is base angle + (angle_delta *
+   * step). */
+  int8_t angle_delta[PLANE_TYPES];
+  /*! \brief The type of filter intra mode used (if applicable). */
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  /*! \brief Chroma from Luma: Joint sign of alpha Cb and alpha Cr */
+  int8_t cfl_alpha_signs;
+  /*! \brief Chroma from Luma: Index of the alpha Cb and alpha Cr combination */
+  uint8_t cfl_alpha_idx;
+  /*! \brief Stores the size and colors of palette mode */
+  PALETTE_MODE_INFO palette_mode_info;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Transform Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Whether to skip transforming and sending. */
+  int8_t skip_txfm;
+  /*! \brief Transform size when fixed size txfm is used (e.g. intra modes). */
+  TX_SIZE tx_size;
+  /*! \brief Transform size when recursive txfm tree is on. */
+  TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Loop Filter Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \copydoc MACROBLOCKD::delta_lf_from_base */
+  int8_t delta_lf_from_base;
+  /*! \copydoc MACROBLOCKD::delta_lf */
+  int8_t delta_lf[FRAME_LF_COUNT];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Bitfield for Memory Reduction
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief The segment id */
+  uint8_t segment_id : 3;
+  /*! \brief Only valid when temporal update if off. */
+  uint8_t seg_id_predicted : 1;
+  /*! \brief Which ref_mv to use */
+  uint8_t ref_mv_idx : 2;
+  /*! \brief Inter skip mode */
+  uint8_t skip_mode : 1;
+  /*! \brief Whether intrabc is used. */
+  uint8_t use_intrabc : 1;
+  /*! \brief Indicates if masked compound is used(1) or not (0). */
+  uint8_t comp_group_idx : 1;
+  /*! \brief Indicates whether dist_wtd_comp(0) is used or not (0). */
+  uint8_t compound_idx : 1;
+  /*! \brief Whether to use interintra wedge */
+  uint8_t use_wedge_interintra : 1;
+  /*! \brief CDEF strength per BLOCK_64X64 */
+  int8_t cdef_strength : 4;
+  /**@}*/
+
 #if CONFIG_RD_DEBUG
+  /*! \brief RD info used for debugging */
   RD_STATS rd_stats;
+  /*! \brief The current row in unit of 4x4 blocks for debugging */
   int mi_row;
+  /*! \brief The current col in unit of 4x4 blocks for debugging */
   int mi_col;
 #endif
 #if CONFIG_INSPECTION
+  /*! \brief Whether we are skipping the current rows or columns. */
   int16_t tx_skip[TXK_TYPE_BUF_LEN];
 #endif
-  PALETTE_MODE_INFO palette_mode_info;
-  // Common for both INTER and INTRA blocks
-  BLOCK_SIZE sb_type;
-  PREDICTION_MODE mode;
-  // Only for INTRA blocks
-  UV_PREDICTION_MODE uv_mode;
-  // interintra members
-  INTERINTRA_MODE interintra_mode;
-  MOTION_MODE motion_mode;
-  PARTITION_TYPE partition;
-  MV_REFERENCE_FRAME ref_frame[2];
-  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-  int8_t skip;
-  uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
-  TX_SIZE tx_size;
-  int8_t delta_lf_from_base;
-  int8_t delta_lf[FRAME_LF_COUNT];
-  int8_t interintra_wedge_index;
-  // The actual prediction angle is the base angle + (angle_delta * step).
-  int8_t angle_delta[PLANE_TYPES];
-  /* deringing gain *per-superblock* */
-  // Joint sign of alpha Cb and alpha Cr
-  int8_t cfl_alpha_signs;
-  // Index of the alpha Cb and alpha Cr combination
-  uint8_t cfl_alpha_idx;
-  uint8_t num_proj_ref;
-  uint8_t overlappable_neighbors[2];
-  // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used.
-  uint8_t compound_idx;
-  uint8_t use_wedge_interintra : 1;
-  uint8_t segment_id : 3;
-  uint8_t seg_id_predicted : 1;  // valid only when temporal_update is enabled
-  uint8_t skip_mode : 1;
-  uint8_t use_intrabc : 1;
-  uint8_t ref_mv_idx : 2;
-  // Indicate if masked compound is used(1) or not(0).
-  uint8_t comp_group_idx : 1;
-  int8_t cdef_strength : 4;
 } MB_MODE_INFO;
 
+/*!\cond */
+
 static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
   return mbmi->use_intrabc;
 }
@@ -349,7 +420,7 @@
 static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
                                      TransformationType type) {
   const PREDICTION_MODE mode = mbmi->mode;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int block_size_allowed =
       AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
   return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
@@ -390,9 +461,6 @@
 } CB_BUFFER;
 
 typedef struct macroblockd_plane {
-  tran_low_t *dqcoeff;
-  tran_low_t *dqcoeff_block;
-  eob_info *eob_data;
   PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
@@ -405,6 +473,9 @@
   // dequantization process.  They have the same coefficient
   // shift/scale as TX.
   int16_t seg_dequant_QTX[MAX_SEGMENTS][2];
+  // Pointer to color index map of:
+  // - Current coding block, on encoder side.
+  // - Current superblock, on decoder side.
   uint8_t *color_index_map;
 
   // block size in pixels
@@ -416,16 +487,36 @@
 
 #define BLOCK_OFFSET(i) ((i) << 4)
 
+/*!\endcond */
+
+/*!\brief Parameters related to Wiener Filter */
 typedef struct {
+  /*!
+   * Vertical filter kernel.
+   */
   DECLARE_ALIGNED(16, InterpKernel, vfilter);
+
+  /*!
+   * Horizontal filter kernel.
+   */
   DECLARE_ALIGNED(16, InterpKernel, hfilter);
 } WienerInfo;
 
+/*!\brief Parameters related to Sgrproj Filter */
 typedef struct {
+  /*!
+   * Parameter index.
+   */
   int ep;
+
+  /*!
+   * Weights for linear combination of filtered versions
+   */
   int xqd[2];
 } SgrprojInfo;
 
+/*!\cond */
+
 #if CONFIG_DEBUG
 #define CFL_SUB8X8_VAL_MI_SIZE (4)
 #define CFL_SUB8X8_VAL_MI_SQUARE \
@@ -475,204 +566,376 @@
 
 struct scale_factors;
 
-// Most/all of the pointers are mere pointers to actual arrays are allocated
-// elsewhere. This is mostly for coding convenience.
+/*!\endcond */
+
+/*! \brief Variables related to current coding block.
+ *
+ * This is a common set of variables used by both encoder and decoder.
+ * Most/all of the pointers are mere pointers to actual arrays are allocated
+ * elsewhere. This is mostly for coding convenience.
+ */
 typedef struct macroblockd {
-  // Row and column position of current macroblock in mi units.
-  int mi_row;
-  int mi_col;
-  // Same as cm->mi_params.mi_stride, copied here for convenience.
+  /**
+   * \name Position of current macroblock in mi units
+   */
+  /**@{*/
+  int mi_row; /*!< Row position in mi units. */
+  int mi_col; /*!< Column position in mi units. */
+  /**@}*/
+
+  /*!
+   * Same as cm->mi_params.mi_stride, copied here for convenience.
+   */
   int mi_stride;
 
-  // True if current block transmits chroma information.
-  // More detail:
-  // Smallest supported block size for both luma and chroma plane is 4x4. Hence,
-  // in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma
-  // blocks smaller than 8x8 maybe combined into one chroma block.
-  // For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4
-  // luma blocks. Then, a single chroma block of size 4x4 will cover the area of
-  // these four luma blocks. This is implemented in bitstream as follows:
-  // - There are four MB_MODE_INFO structs for the four luma blocks.
-  // - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit
-  // any information for chroma planes.
-  // - Last block will have is_chroma_ref = true and transmits chroma
-  // information for the 4x4 chroma block that covers whole 8x8 area covered by
-  // four luma blocks.
-  // Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+  /*!
+   * True if current block transmits chroma information.
+   * More detail:
+   * Smallest supported block size for both luma and chroma plane is 4x4. Hence,
+   * in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma
+   * blocks smaller than 8x8 maybe combined into one chroma block.
+   * For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4
+   * luma blocks. Then, a single chroma block of size 4x4 will cover the area of
+   * these four luma blocks. This is implemented in bitstream as follows:
+   * - There are four MB_MODE_INFO structs for the four luma blocks.
+   * - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit
+   * any information for chroma planes.
+   * - Last block will have is_chroma_ref = true and transmits chroma
+   * information for the 4x4 chroma block that covers whole 8x8 area covered by
+   * four luma blocks.
+   * Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+   */
   bool is_chroma_ref;
 
+  /*!
+   * Info specific to each plane.
+   */
   struct macroblockd_plane plane[MAX_MB_PLANE];
 
+  /*!
+   * Tile related info.
+   */
   TileInfo tile;
 
-  // Appropriate offset inside cm->mi_params.mi_grid_base based on current
-  // mi_row and mi_col.
+  /*!
+   * Appropriate offset inside cm->mi_params.mi_grid_base based on current
+   * mi_row and mi_col.
+   */
   MB_MODE_INFO **mi;
 
-  // True if 4x4 block above the current block is available.
+  /*!
+   * True if 4x4 block above the current block is available.
+   */
   bool up_available;
-  // True if 4x4 block to the left of the current block is available.
+  /*!
+   * True if 4x4 block to the left of the current block is available.
+   */
   bool left_available;
-  // True if the above chrome reference block is available.
+  /*!
+   * True if the above chrome reference block is available.
+   */
   bool chroma_up_available;
-  // True if the left chrome reference block is available.
+  /*!
+   * True if the left chrome reference block is available.
+   */
   bool chroma_left_available;
 
-  // MB_MODE_INFO for 4x4 block to the left of the current block, if
-  // left_available == true; otherwise NULL.
+  /*!
+   * MB_MODE_INFO for 4x4 block to the left of the current block, if
+   * left_available == true; otherwise NULL.
+   */
   MB_MODE_INFO *left_mbmi;
-  // MB_MODE_INFO for 4x4 block above the current block, if
-  // up_available == true; otherwise NULL.
+  /*!
+   * MB_MODE_INFO for 4x4 block above the current block, if
+   * up_available == true; otherwise NULL.
+   */
   MB_MODE_INFO *above_mbmi;
-  // Above chroma reference block if is_chroma_ref == true for the current block
-  // and chroma_up_available == true; otherwise NULL.
-  // See also: the special case logic when current chroma block covers more than
-  // one luma blocks in set_mi_row_col().
+  /*!
+   * Above chroma reference block if is_chroma_ref == true for the current block
+   * and chroma_up_available == true; otherwise NULL.
+   * See also: the special case logic when current chroma block covers more than
+   * one luma blocks in set_mi_row_col().
+   */
   MB_MODE_INFO *chroma_left_mbmi;
-  // Left chroma reference block if is_chroma_ref == true for the current block
-  // and chroma_left_available == true; otherwise NULL.
-  // See also: the special case logic when current chroma block covers more than
-  // one luma blocks in set_mi_row_col().
+  /*!
+   * Left chroma reference block if is_chroma_ref == true for the current block
+   * and chroma_left_available == true; otherwise NULL.
+   * See also: the special case logic when current chroma block covers more than
+   * one luma blocks in set_mi_row_col().
+   */
   MB_MODE_INFO *chroma_above_mbmi;
 
-  // Appropriate offset based on current 'mi_row' and 'mi_col', inside
-  // 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
-  // 'MACROBLOCK' structs.
+  /*!
+   * Appropriate offset based on current 'mi_row' and 'mi_col', inside
+   * 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
+   * 'MACROBLOCK' structs.
+   */
   uint8_t *tx_type_map;
-  // Stride for 'tx_type_map'. Note that this may / may not be same as
-  // 'mi_stride', depending on which actual array 'tx_type_map' points to.
+  /*!
+   * Stride for 'tx_type_map'. Note that this may / may not be same as
+   * 'mi_stride', depending on which actual array 'tx_type_map' points to.
+   */
   int tx_type_map_stride;
 
-  // Distance of this macroblock from frame edges in 1/8th pixel units.
-  int mb_to_left_edge;
-  int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
+  /**
+   * \name Distance of this macroblock from frame edges in 1/8th pixel units.
+   */
+  /**@{*/
+  int mb_to_left_edge;   /*!< Distance from left edge */
+  int mb_to_right_edge;  /*!< Distance from right edge */
+  int mb_to_top_edge;    /*!< Distance from top edge */
+  int mb_to_bottom_edge; /*!< Distance from bottom edge */
+  /**@}*/
 
-  // Scale factors for reference frames of the current block.
-  // These are pointers into 'cm->ref_scale_factors'.
+  /*!
+   * Scale factors for reference frames of the current block.
+   * These are pointers into 'cm->ref_scale_factors'.
+   */
   const struct scale_factors *block_ref_scale_factors[2];
 
+  /*!
+   * - On encoder side: points to cpi->source, which is the buffer containing
+   * the current *source* frame (maybe filtered).
+   * - On decoder side: points to cm->cur_frame->buf, which is the buffer into
+   * which current frame is being *decoded*.
+   */
   const YV12_BUFFER_CONFIG *cur_buf;
 
-  // Entropy contexts for the above blocks.
-  // above_entropy_context[i][j] corresponds to above entropy context for ith
-  // plane and jth mi column of this *frame*, wrt current 'mi_row'.
-  // These are pointers into 'cm->above_contexts.entropy'.
+  /*!
+   * Entropy contexts for the above blocks.
+   * above_entropy_context[i][j] corresponds to above entropy context for ith
+   * plane and jth mi column of this *frame*, wrt current 'mi_row'.
+   * These are pointers into 'cm->above_contexts.entropy'.
+   */
   ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE];
-  // Entropy contexts for the left blocks.
-  // left_entropy_context[i][j] corresponds to left entropy context for ith
-  // plane and jth mi row of this *superblock*, wrt current 'mi_col'.
-  // Note: These contain actual data, NOT pointers.
+  /*!
+   * Entropy contexts for the left blocks.
+   * left_entropy_context[i][j] corresponds to left entropy context for ith
+   * plane and jth mi row of this *superblock*, wrt current 'mi_col'.
+   * Note: These contain actual data, NOT pointers.
+   */
   ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE];
 
-  // Partition contexts for the above blocks.
-  // above_partition_context[i] corresponds to above partition context for ith
-  // mi column of this *frame*, wrt current 'mi_row'.
-  // These are pointers into 'cm->above_contexts.partition'.
+  /*!
+   * Partition contexts for the above blocks.
+   * above_partition_context[i] corresponds to above partition context for ith
+   * mi column of this *frame*, wrt current 'mi_row'.
+   * This is a pointer into 'cm->above_contexts.partition'.
+   */
   PARTITION_CONTEXT *above_partition_context;
-  // Partition contexts for the left blocks.
-  // left_partition_context[i] corresponds to left partition context for ith
-  // mi row of this *superblock*, wrt current 'mi_col'.
-  // Note: These contain actual data, NOT pointers.
+  /*!
+   * Partition contexts for the left blocks.
+   * left_partition_context[i] corresponds to left partition context for ith
+   * mi row of this *superblock*, wrt current 'mi_col'.
+   * Note: These contain actual data, NOT pointers.
+   */
   PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE];
 
-  // Transform contexts for the above blocks.
-  // TODO(urvang): Indexed two different ways from cm->above_contexts.txfm in
-  // code currently. Need to make it consistent / document why.
+  /*!
+   * Transform contexts for the above blocks.
+   * above_txfm_context[i] corresponds to above transform context for ith mi col
+   * from the current position (mi row and mi column) for this *frame*.
+   * This is a pointer into 'cm->above_contexts.txfm'.
+   */
   TXFM_CONTEXT *above_txfm_context;
-  // Transform contexts for the left blocks.
+  /*!
+   * Transform contexts for the left blocks.
+   * left_txfm_context[i] corresponds to left transform context for ith mi row
+   * from the current position (mi_row and mi_col) for this *superblock*.
+   * This is a pointer into 'left_txfm_context_buffer'.
+   */
   TXFM_CONTEXT *left_txfm_context;
-  // TODO(urvang): 'left_txfm_context' points to 'left_txfm_context_buffer'.
-  // Can we remove this indirection?
+  /*!
+   * left_txfm_context_buffer[i] is the left transform context for ith mi_row
+   * in this *superblock*.
+   * Behaves like an internal actual buffer which 'left_txt_context' points to,
+   * and never accessed directly except to fill in initial default values.
+   */
   TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
 
-  // Default values for the two restoration filters for each plane.
-  // These values are used as reference values when writing the bitstream. That
-  // is, we transmit the delta between the actual values in
-  // cm->rst_info[plane].unit_info[unit_idx] and these reference values.
-  WienerInfo wiener_info[MAX_MB_PLANE];
-  SgrprojInfo sgrproj_info[MAX_MB_PLANE];
+  /**
+   * \name Default values for the two restoration filters for each plane.
+   * Default values for the two restoration filters for each plane.
+   * These values are used as reference values when writing the bitstream. That
+   * is, we transmit the delta between the actual values in
+   * cm->rst_info[plane].unit_info[unit_idx] and these reference values.
+   */
+  /**@{*/
+  WienerInfo wiener_info[MAX_MB_PLANE];   /*!< Defaults for Wiener filter*/
+  SgrprojInfo sgrproj_info[MAX_MB_PLANE]; /*!< Defaults for SGR filter */
+  /**@}*/
 
-  // Block dimensions in MB_MODE_INFO units.
-  uint8_t width;
-  uint8_t height;
+  /**
+   * \name Block dimensions in MB_MODE_INFO units.
+   */
+  /**@{*/
+  uint8_t width;  /*!< Block width in MB_MODE_INFO units */
+  uint8_t height; /*!< Block height in MB_MODE_INFO units */
+  /**@}*/
 
-  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  /*!
+   * Contains the motion vector candidates found during motion vector prediction
+   * process. ref_mv_stack[i] contains the candidates for ith type of
+   * reference frame (single/compound). The actual number of candidates found in
+   * ref_mv_stack[i] is stored in either dcb->ref_mv_count[i] (decoder side)
+   * or mbmi_ext->ref_mv_count[i] (encoder side).
+   */
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  /*!
+   * weight[i][j] is the weight for ref_mv_stack[i][j] and used to compute the
+   * DRL (dynamic reference list) mode contexts.
+   */
   uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
-  uint8_t is_sec_rect;
 
-  // Counts of each reference frame in the above and left neighboring blocks.
-  // NOTE: Take into account both single and comp references.
+  /*!
+   * True if this is the last vertical rectangular block in a VERTICAL or
+   * VERTICAL_4 partition.
+   */
+  bool is_last_vertical_rect;
+  /*!
+   * True if this is the 1st horizontal rectangular block in a HORIZONTAL or
+   * HORIZONTAL_4 partition.
+   */
+  bool is_first_horizontal_rect;
+
+  /*!
+   * Counts of each reference frame in the above and left neighboring blocks.
+   * NOTE: Take into account both single and comp references.
+   */
   uint8_t neighbors_ref_counts[REF_FRAMES];
 
+  /*!
+   * Current CDFs of all the symbols for the current tile.
+   */
   FRAME_CONTEXT *tile_ctx;
-  // Bit depth: copied from cm->seq_params.bit_depth for convenience.
+
+  /*!
+   * Bit depth: copied from cm->seq_params.bit_depth for convenience.
+   */
   int bd;
 
+  /*!
+   * Quantizer index for each segment (base qindex + delta for each segment).
+   */
   int qindex[MAX_SEGMENTS];
+  /*!
+   * lossless[s] is true if segment 's' is coded losslessly.
+   */
   int lossless[MAX_SEGMENTS];
-  // TODO(urvang): Move to decoder.
-  int corrupted;
-  // Same as cm->features.cur_frame_force_integer_mv.
+  /*!
+   * Q index for the coding blocks in this superblock will be stored in
+   * mbmi->current_qindex. Now, when cm->delta_q_info.delta_q_present_flag is
+   * true, mbmi->current_qindex is computed by taking 'current_base_qindex' as
+   * the base, and adding any transmitted delta qindex on top of it.
+   * Precisely, this is the latest qindex used by the first coding block of a
+   * non-skip superblock in the current tile; OR
+   * same as cm->quant_params.base_qindex (if not explicitly set yet).
+   * Note: This is 'CurrentQIndex' in the AV1 spec.
+   */
+  int current_base_qindex;
+
+  /*!
+   * Same as cm->features.cur_frame_force_integer_mv.
+   */
   int cur_frame_force_integer_mv;
-  // Pointer to cm->error.
+
+  /*!
+   * Pointer to cm->error.
+   */
   struct aom_internal_error_info *error_info;
-  // Same as cm->global_motion.
+
+  /*!
+   * Same as cm->global_motion.
+   */
   const WarpedMotionParams *global_motion;
-  int delta_qindex;
-  int current_qindex;
-  // Since actual frame level loop filtering level value is not available
-  // at the beginning of the tile (only available during actual filtering)
-  // at encoder side.we record the delta_lf (against the frame level loop
-  // filtering level) and code the delta between previous superblock's delta
-  // lf and current delta lf. It is equivalent to the delta between previous
-  // superblock's actual lf and current lf.
+
+  /*!
+   * Since actual frame level loop filtering level value is not available
+   * at the beginning of the tile (only available during actual filtering)
+   * at encoder side.we record the delta_lf (against the frame level loop
+   * filtering level) and code the delta between previous superblock's delta
+   * lf and current delta lf. It is equivalent to the delta between previous
+   * superblock's actual lf and current lf.
+   */
   int8_t delta_lf_from_base;
-  // For this experiment, we have four frame filter levels for different plane
-  // and direction. So, to support the per superblock update, we need to add
-  // a few more params as below.
-  // 0: delta loop filter level for y plane vertical
-  // 1: delta loop filter level for y plane horizontal
-  // 2: delta loop filter level for u plane
-  // 3: delta loop filter level for v plane
-  // To make it consistent with the reference to each filter level in segment,
-  // we need to -1, since
-  // SEG_LVL_ALT_LF_Y_V = 1;
-  // SEG_LVL_ALT_LF_Y_H = 2;
-  // SEG_LVL_ALT_LF_U   = 3;
-  // SEG_LVL_ALT_LF_V   = 4;
+  /*!
+   * We have four frame filter levels for different plane and direction. So, to
+   * support the per superblock update, we need to add a few more params:
+   * 0. delta loop filter level for y plane vertical
+   * 1. delta loop filter level for y plane horizontal
+   * 2. delta loop filter level for u plane
+   * 3. delta loop filter level for v plane
+   * To make it consistent with the reference to each filter level in segment,
+   * we need to -1, since
+   * - SEG_LVL_ALT_LF_Y_V = 1;
+   * - SEG_LVL_ALT_LF_Y_H = 2;
+   * - SEG_LVL_ALT_LF_U   = 3;
+   * - SEG_LVL_ALT_LF_V   = 4;
+   */
   int8_t delta_lf[FRAME_LF_COUNT];
-  // cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
-  // current superblock has already been read from (decoder) / written to
-  // (encoder) the bitstream; and false otherwise.
-  // More detail:
-  // (1) CDEF strength is transmitted only once per CDEF unit, in the 1st
-  // non-skip coding block. So, we need this array to keep track of whether CDEF
-  // strengths for the given CDEF units have been transmitted yet or not.
-  // (2) Superblock size can be either 128x128 or 64x64, but CDEF unit size is
-  // fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if
-  // superblock size is 128x128). Hence the array size is 4.
-  // (3) In the current implementation, CDEF strength for this CDEF unit is
-  // stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
-  // cm->mi_params.mi_grid_base).
+  /*!
+   * cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
+   * current superblock has already been read from (decoder) / written to
+   * (encoder) the bitstream; and false otherwise.
+   * More detail:
+   * 1. CDEF strength is transmitted only once per CDEF unit, in the 1st
+   * non-skip coding block. So, we need this array to keep track of whether CDEF
+   * strengths for the given CDEF units have been transmitted yet or not.
+   * 2. Superblock size can be either 128x128 or 64x64, but CDEF unit size is
+   * fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if
+   * superblock size is 128x128). Hence the array size is 4.
+   * 3. In the current implementation, CDEF strength for this CDEF unit is
+   * stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
+   * cm->mi_params.mi_grid_base).
+   */
   bool cdef_transmitted[4];
 
+  /*!
+   * Mask for this block used for compound prediction.
+   */
   DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
-  uint8_t *mc_buf[2];
+
+  /*!
+   * CFL (chroma from luma) related parameters.
+   */
   CFL_CTX cfl;
 
-  DIST_WTD_COMP_PARAMS jcp_param;
-
-  uint16_t cb_offset[MAX_MB_PLANE];
-  uint16_t txb_offset[MAX_MB_PLANE];
+  /*!
+   * Offset to plane[p].color_index_map.
+   * Currently:
+   * - On encoder side, this is always 0 as 'color_index_map' is allocated per
+   * *coding block* there.
+   * - On decoder side, this may be non-zero, as 'color_index_map' is a (static)
+   * memory pointing to the base of a *superblock* there, and we need an offset
+   * to it to get the color index map for current coding block.
+   */
   uint16_t color_index_map_offset[2];
 
+  /*!
+   * Temporary buffer used for convolution in case of compound reference only
+   * for (weighted or uniform) averaging operation.
+   * There are pointers to actual buffers allocated elsewhere: e.g.
+   * - In decoder, 'pbi->td.tmp_conv_dst' or
+   * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and
+   * - In encoder, 'x->tmp_conv_dst' or
+   * 'cpi->tile_thr_data[t].td->mb.tmp_conv_dst'.
+   */
   CONV_BUF_TYPE *tmp_conv_dst;
+  /*!
+   * Temporary buffers used to build OBMC prediction by above (index 0) and left
+   * (index 1) predictors respectively.
+   * tmp_obmc_bufs[i][p * MAX_SB_SQUARE] is the buffer used for plane 'p'.
+   * There are pointers to actual buffers allocated elsewhere: e.g.
+   * - In decoder, 'pbi->td.tmp_obmc_bufs' or
+   * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and
+   * -In encoder, 'x->tmp_pred_bufs' or
+   * 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'.
+   */
   uint8_t *tmp_obmc_bufs[2];
 } MACROBLOCKD;
 
+/*!\cond */
+
 static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
   return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
 }
@@ -863,12 +1126,12 @@
 static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
                                           const MACROBLOCKD *xd,
                                           TX_SIZE tx_size,
-                                          int is_screen_content_type) {
+                                          int use_screen_content_tools) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
 
   if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
       xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 ||
-      is_screen_content_type)
+      use_screen_content_tools)
     return DCT_DCT;
 
   return intra_mode_to_tx_type(mbmi, plane_type);
@@ -1076,7 +1339,7 @@
   if (xd->lossless[mbmi->segment_id]) return TX_4X4;
   if (plane == 0) return mbmi->tx_size;
   const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
-  return av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+  return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                pd->subsampling_y);
 }
 
@@ -1116,7 +1379,7 @@
 }
 
 static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
-  return is_interintra_allowed_bsize(mbmi->sb_type) &&
+  return is_interintra_allowed_bsize(mbmi->bsize) &&
          is_interintra_allowed_mode(mbmi->mode) &&
          is_interintra_allowed_ref(mbmi->ref_frame);
 }
@@ -1159,34 +1422,29 @@
 static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 };
 
 static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
-  return !(mbmi->overlappable_neighbors[0] == 0 &&
-           mbmi->overlappable_neighbors[1] == 0);
+  return mbmi->overlappable_neighbors != 0;
 }
 
 static INLINE MOTION_MODE
 motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
                     const MB_MODE_INFO *mbmi, int allow_warped_motion) {
+  if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
   if (xd->cur_frame_force_integer_mv == 0) {
     const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
     if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
   }
-  if (is_motion_variation_allowed_bsize(mbmi->sb_type) &&
+  if (is_motion_variation_allowed_bsize(mbmi->bsize) &&
       is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
       is_motion_variation_allowed_compound(mbmi)) {
-    if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
     assert(!has_second_ref(mbmi));
-    if (mbmi->num_proj_ref >= 1 &&
-        (allow_warped_motion &&
-         !av1_is_scaled(xd->block_ref_scale_factors[0]))) {
-      if (xd->cur_frame_force_integer_mv) {
-        return OBMC_CAUSAL;
-      }
+    if (mbmi->num_proj_ref >= 1 && allow_warped_motion &&
+        !xd->cur_frame_force_integer_mv &&
+        !av1_is_scaled(xd->block_ref_scale_factors[0])) {
       return WARPED_CAUSAL;
     }
     return OBMC_CAUSAL;
-  } else {
-    return SIMPLE_TRANSLATION;
   }
+  return SIMPLE_TRANSLATION;
 }
 
 static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
@@ -1228,15 +1486,23 @@
   // Special handling for chroma sub8x8.
   const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
   const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
-  if (width) *width = plane_block_width + 2 * is_chroma_sub8_x;
-  if (height) *height = plane_block_height + 2 * is_chroma_sub8_y;
+  if (width) {
+    *width = plane_block_width + 2 * is_chroma_sub8_x;
+    assert(*width >= 0);
+  }
+  if (height) {
+    *height = plane_block_height + 2 * is_chroma_sub8_y;
+    assert(*height >= 0);
+  }
   if (rows_within_bounds) {
     *rows_within_bounds =
         (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
+    assert(*rows_within_bounds >= 0);
   }
   if (cols_within_bounds) {
     *cols_within_bounds =
         (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
+    assert(*cols_within_bounds >= 0);
   }
 }
 
@@ -1265,7 +1531,7 @@
   // First check if all modes are GLOBALMV
   if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0;
 
-  if (AOMMIN(mi_size_wide[mbmi->sb_type], mi_size_high[mbmi->sb_type]) < 2)
+  if (AOMMIN(mi_size_wide[mbmi->bsize], mi_size_high[mbmi->bsize]) < 2)
     return 0;
 
   // Now check if all global motion is non translational
@@ -1289,6 +1555,8 @@
   return tx_size_2d[tx_size];
 }
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index ef7b866..bc8eecc 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -26,7 +26,7 @@
   MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col;
   for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) {
     for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) {
-      if (!mbmi[c]->skip) return 0;
+      if (!mbmi[c]->skip_txfm) return 0;
     }
   }
 
diff --git a/av1/common/cdef.h b/av1/common/cdef.h
index c36fd13..4d6e600 100644
--- a/av1/common/cdef.h
+++ b/av1/common/cdef.h
@@ -40,11 +40,20 @@
 int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
                              int mi_row, int mi_col, cdef_list *dlist,
                              BLOCK_SIZE bsize);
-void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
 
-void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
-                     AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
-                     int rdmult);
+/*!\brief Function for applying CDEF to a frame
+ *
+ * \ingroup in_loop_cdef
+ * This function applies CDEF to a frame.
+ *
+ * \param[in, out]  frame       Compressed frame buffer
+ * \param[in, out]  cm          Pointer to top level common structure
+ * \param[in]       xd          Pointer to common current coding block structure
+ *
+ * \return Nothing is returned. Instead, the filtered frame is output in
+ * \c frame.
+ */
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index a1d6dc2..0062e9f 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -18,7 +18,7 @@
 // Can we use CfL for the current block?
 static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
   const MB_MODE_INFO *mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(bsize < BLOCK_SIZES_ALL);
   if (xd->lossless[mbmi->segment_id]) {
     // In lossless, CfL is available when the partition size is equal to the
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index e177e3c..4d26f9c 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -166,18 +166,9 @@
 
 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
-                         const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
-                         const int subpel_x_qn, const int subpel_y_qn,
-                         ConvolveParams *conv_params) {
+                         const int subpel_y_qn) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
@@ -197,14 +188,9 @@
 void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
-                         const InterpFilterParams *filter_params_y,
-                         const int subpel_x_qn, const int subpel_y_qn,
-                         ConvolveParams *conv_params) {
+                         const int subpel_x_qn, ConvolveParams *conv_params) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_0;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-  (void)conv_params;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -226,23 +212,6 @@
   }
 }
 
-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, int w, int h,
-                               const InterpFilterParams *filter_params_x,
-                               const InterpFilterParams *filter_params_y,
-                               const int subpel_x_qn, const int subpel_y_qn,
-                               ConvolveParams *conv_params) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-
-  for (int y = 0; y < h; ++y) {
-    memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
-  }
-}
-
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
@@ -311,9 +280,8 @@
 
 void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, int w, int h,
-                               const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
-                               const int subpel_x_qn, const int subpel_y_qn,
+                               const int subpel_y_qn,
                                ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
@@ -325,8 +293,6 @@
                            (1 << (offset_bits - conv_params->round_1 - 1));
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
@@ -362,8 +328,7 @@
 void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
-                               const InterpFilterParams *filter_params_y,
-                               const int subpel_x_qn, const int subpel_y_qn,
+                               const int subpel_x_qn,
                                ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
@@ -375,8 +340,6 @@
                            (1 << (offset_bits - conv_params->round_1 - 1));
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
@@ -411,10 +374,6 @@
 
 void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride, int w, int h,
-                                     const InterpFilterParams *filter_params_x,
-                                     const InterpFilterParams *filter_params_y,
-                                     const int subpel_x_qn,
-                                     const int subpel_y_qn,
                                      ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
@@ -424,10 +383,6 @@
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                            (1 << (offset_bits - conv_params->round_1 - 1));
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -552,13 +507,78 @@
                         y_step_qn, conv_params);
 }
 
+static void convolve_2d_facade_compound(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (!need_x && !need_y) {
+    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
+                                  conv_params);
+  } else if (need_x && !need_y) {
+    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_x, subpel_x_qn, conv_params);
+  } else if (!need_x && need_y) {
+    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_y, subpel_y_qn, conv_params);
+  } else {
+    assert(need_y && need_x);
+    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
+                             filter_params_x, filter_params_y, subpel_x_qn,
+                             subpel_y_qn, conv_params);
+  }
+}
+
+static void convolve_2d_facade_single(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (!need_x && !need_y) {
+    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
+  } else if (need_x && !need_y) {
+    // Filters with taps > 8 are only for encoder side use.
+    // TODO(any): need SIMD for > 8 taps filters
+    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
+      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_x, subpel_x_qn, conv_params);
+    } else {
+      av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        subpel_x_qn, conv_params);
+    }
+  } else if (!need_x && need_y) {
+    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
+      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_y, subpel_y_qn);
+    } else {
+      av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+                        subpel_y_qn);
+    }
+  } else {
+    assert(need_x && need_y);
+
+    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
+      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                           filter_params_x, filter_params_y, subpel_x_qn,
+                           subpel_y_qn, conv_params);
+    } else {
+      av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    }
+  }
+}
+
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *interp_filters[2],
                             const int subpel_x_qn, int x_step_q4,
                             const int subpel_y_qn, int y_step_q4, int scaled,
-                            ConvolveParams *conv_params,
-                            const struct scale_factors *sf) {
+                            ConvolveParams *conv_params) {
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst;
@@ -580,13 +600,11 @@
       return;
     } else if (subpel_x_qn) {
       av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_x, filter_params_y, subpel_x_qn,
-                          subpel_y_qn, conv_params);
+                          filter_params_x, subpel_x_qn, conv_params);
       return;
     } else if (subpel_y_qn) {
       av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_x, filter_params_y, subpel_x_qn,
-                          subpel_y_qn, conv_params);
+                          filter_params_y, subpel_y_qn);
       return;
     }
   }
@@ -595,41 +613,25 @@
     convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_qn,
                               x_step_q4, subpel_y_qn, y_step_q4, conv_params);
+  } else if (conv_params->is_compound) {
+    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, subpel_x_qn,
+                                subpel_y_qn, conv_params);
   } else {
-    sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound](
-        src, src_stride, dst, dst_stride, w, h, filter_params_x,
-        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
+    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
+                              filter_params_x, filter_params_y, subpel_x_qn,
+                              subpel_y_qn, conv_params);
   }
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-void av1_highbd_convolve_2d_copy_sr_c(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-  (void)bd;
-
-  for (int y = 0; y < h; ++y) {
-    memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
-  }
-}
-
 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
-                                const InterpFilterParams *filter_params_y,
-                                const int subpel_x_qn, const int subpel_y_qn,
+                                const int subpel_x_qn,
                                 ConvolveParams *conv_params, int bd) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_0;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -653,18 +655,9 @@
 
 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
-                                const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
-                                const int subpel_x_qn, const int subpel_y_qn,
-                                ConvolveParams *conv_params, int bd) {
+                                const int subpel_y_qn, int bd) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
@@ -800,11 +793,12 @@
   }
 }
 
-void av1_highbd_dist_wtd_convolve_x_c(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride, int w,
+                                      int h,
+                                      const InterpFilterParams *filter_params_x,
+                                      const int subpel_x_qn,
+                                      ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -815,8 +809,6 @@
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   assert(round_bits >= 0);
-  (void)filter_params_y;
-  (void)subpel_y_qn;
   assert(bits >= 0);
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
@@ -849,11 +841,12 @@
   }
 }
 
-void av1_highbd_dist_wtd_convolve_y_c(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride, int w,
+                                      int h,
+                                      const InterpFilterParams *filter_params_y,
+                                      const int subpel_y_qn,
+                                      ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -864,8 +857,6 @@
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   assert(round_bits >= 0);
-  (void)filter_params_x;
-  (void)subpel_x_qn;
   assert(bits >= 0);
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
@@ -898,11 +889,11 @@
   }
 }
 
-void av1_highbd_dist_wtd_convolve_2d_copy_c(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
+                                            uint16_t *dst, int dst_stride,
+                                            int w, int h,
+                                            ConvolveParams *conv_params,
+                                            int bd) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
   const int bits =
@@ -911,10 +902,6 @@
   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                            (1 << (offset_bits - conv_params->round_1 - 1));
   assert(bits >= 0);
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -1025,13 +1012,86 @@
   }
 }
 
+static void highbd_convolve_2d_facade_compound(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const int w, const int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (!need_x && !need_y) {
+    av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
+                                         conv_params, bd);
+  } else if (need_x && !need_y) {
+    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_qn, conv_params,
+                                   bd);
+  } else if (!need_x && need_y) {
+    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_qn, conv_params,
+                                   bd);
+  } else {
+    assert(need_x && need_y);
+    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
+                                    filter_params_x, filter_params_y,
+                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
+  }
+}
+
+static void highbd_convolve_2d_facade_single(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const int w, const int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  // Filters with taps > 8 are only for encoder side use.
+  const int filter_x_taps_gt8 =
+      (filter_params_x == NULL) ? 0 : ((filter_params_x->taps > 8) ? 1 : 0);
+  const int filter_y_taps_gt8 =
+      (filter_params_y == NULL) ? 0 : ((filter_params_y->taps > 8) ? 1 : 0);
+
+  if (!need_x && !need_y) {
+    aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
+  } else if (need_x && !need_y) {
+    // TODO(any): need SIMD for > 8 taps filters
+    if (filter_x_taps_gt8 || filter_y_taps_gt8) {
+      av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params_x, subpel_x_qn, conv_params, bd);
+
+    } else {
+      av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, subpel_x_qn, conv_params, bd);
+    }
+  } else if (!need_x && need_y) {
+    if (filter_x_taps_gt8 || filter_y_taps_gt8) {
+      av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params_y, subpel_y_qn, bd);
+    } else {
+      av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_y, subpel_y_qn, bd);
+    }
+  } else {
+    assert(need_x && need_y);
+    if (filter_x_taps_gt8 || filter_y_taps_gt8) {
+      av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                  filter_params_x, filter_params_y, subpel_x_qn,
+                                  subpel_y_qn, conv_params, bd);
+    } else {
+      av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, subpel_x_qn,
+                                subpel_y_qn, conv_params, bd);
+    }
+  }
+}
+
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst8, int dst_stride, int w, int h,
                                    const InterpFilterParams *interp_filters[2],
                                    const int subpel_x_qn, int x_step_q4,
                                    const int subpel_y_qn, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf, int bd) {
+                                   int bd) {
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst_stride;
@@ -1044,8 +1104,8 @@
   const InterpFilterParams *filter_params_y =
       need_filter_params_y ? interp_filters[1] : NULL;
 
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   if (scaled) {
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     if (conv_params->is_compound) {
       assert(conv_params->dst != NULL);
     }
@@ -1053,13 +1113,14 @@
                                  filter_params_x, filter_params_y, subpel_x_qn,
                                  x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                  bd);
-  } else {
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-
-    sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn !=
-                                          0][conv_params->is_compound](
+  } else if (conv_params->is_compound) {
+    highbd_convolve_2d_facade_compound(
         src, src_stride, dst, dst_stride, w, h, filter_params_x,
         filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
+  } else {
+    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
+                                     filter_params_x, filter_params_y,
+                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
   }
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 04df86c..490d778 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -26,7 +26,6 @@
   int round_1;
   int plane;
   int is_compound;
-  int compound_index;  // 0: the first single in compound mode, 1: the second.
   int use_dist_wtd_comp_avg;
   int fwd_offset;
   int bck_offset;
@@ -59,15 +58,13 @@
                             const InterpFilterParams *interp_filters[2],
                             const int subpel_x_qn, int x_step_q4,
                             const int subpel_y_qn, int y_step_q4, int scaled,
-                            ConvolveParams *conv_params,
-                            const struct scale_factors *sf);
+                            ConvolveParams *conv_params);
 
 static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane,
                                                       CONV_BUF_TYPE *dst,
                                                       int dst_stride,
                                                       int is_compound, int bd) {
   ConvolveParams conv_params;
-  conv_params.compound_index = cmp_index;
   assert(IMPLIES(cmp_index, is_compound));
 
   conv_params.is_compound = is_compound;
@@ -122,7 +119,7 @@
                                    const int subpel_x_qn, int x_step_q4,
                                    const int subpel_y_qn, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf, int bd);
+                                   int bd);
 
 // TODO(sarahparker) This will need to be integerized and optimized
 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
diff --git a/av1/common/debugmodes.c b/av1/common/debugmodes.c
index ff02ddd..7e6160f 100644
--- a/av1/common/debugmodes.c
+++ b/av1/common/debugmodes.c
@@ -17,7 +17,7 @@
 
 static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
   fprintf(f, "%s", str);
-  fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
+  fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
           cm->show_frame, cm->quant_params.base_qindex);
 }
 /* This function dereferences a pointer to the mbmi structure
@@ -52,7 +52,7 @@
   const int rows = mi_params->mi_rows;
   const int cols = mi_params->mi_cols;
 
-  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, bsize));
   print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
   print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
   print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
@@ -63,7 +63,7 @@
   for (int mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "S ");
     for (int mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[0]->skip);
+      fprintf(mvs, "%2d ", mi[0]->skip_txfm);
       mi++;
     }
     fprintf(mvs, "\n");
diff --git a/av1/common/entropy.c b/av1/common/entropy.c
index 1f7a0ef..eda7a36 100644
--- a/av1/common/entropy.c
+++ b/av1/common/entropy.c
@@ -130,7 +130,7 @@
   RESET_CDF_COUNTER(fc->compound_index_cdf, 2);
   RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2);
   RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2);
-  RESET_CDF_COUNTER(fc->skip_cdfs, 2);
+  RESET_CDF_COUNTER(fc->skip_txfm_cdfs, 2);
   RESET_CDF_COUNTER(fc->intra_inter_cdf, 2);
   reset_nmv_counter(&fc->nmvc);
   reset_nmv_counter(&fc->ndvc);
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 5f061be..daee332 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -793,7 +793,7 @@
       { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) }
     };
 
-static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
+static const aom_cdf_prob default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
   { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) }
 };
 
@@ -973,6 +973,113 @@
   assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
   return color_index_ctx;
 }
+
+int av1_fast_palette_color_index_context(const uint8_t *color_map, int stride,
+                                         int r, int c, int *color_idx) {
+  assert(r > 0 || c > 0);
+
+  // This goes in the order of left, top, and top-left. This has the advantage
+  // that unless anything here are not distinct or invalid, this will already
+  // be in sorted order. Furthermore, if either of the first two are not
+  // invalid, we know the last one is also invalid.
+  int color_neighbors[NUM_PALETTE_NEIGHBORS];
+  color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
+  color_neighbors[1] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
+  color_neighbors[2] =
+      (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
+
+  // Since our array is so small, using a couple if statements is faster
+  int scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 };
+  if (color_neighbors[0] == color_neighbors[1]) {
+    scores[0] += scores[1];
+    color_neighbors[1] = -1;
+
+    if (color_neighbors[0] == color_neighbors[2]) {
+      scores[0] += scores[2];
+      color_neighbors[2] = -1;
+    }
+  } else if (color_neighbors[0] == color_neighbors[2]) {
+    scores[0] += scores[2];
+    color_neighbors[2] = -1;
+  } else if (color_neighbors[1] == color_neighbors[2]) {
+    scores[1] += scores[2];
+    color_neighbors[2] = -1;
+  }
+
+  int color_rank[NUM_PALETTE_NEIGHBORS] = { -1, -1, -1 };
+  int score_rank[NUM_PALETTE_NEIGHBORS] = { 0, 0, 0 };
+  int num_valid_colors = 0;
+  for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; idx++) {
+    if (color_neighbors[idx] != -1) {
+      score_rank[num_valid_colors] = scores[idx];
+      color_rank[num_valid_colors] = color_neighbors[idx];
+      num_valid_colors++;
+    }
+  }
+
+  // Sort everything
+  // We need to swap the first two elements if they have the same score but
+  // the color indices are not in the right order
+  if (score_rank[0] < score_rank[1] ||
+      (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) {
+    const int tmp_score = score_rank[0];
+    const int tmp_color = color_rank[0];
+    score_rank[0] = score_rank[1];
+    color_rank[0] = color_rank[1];
+    score_rank[1] = tmp_score;
+    color_rank[1] = tmp_color;
+  }
+  if (score_rank[0] < score_rank[2]) {
+    const int tmp_score = score_rank[0];
+    const int tmp_color = color_rank[0];
+    score_rank[0] = score_rank[2];
+    color_rank[0] = color_rank[2];
+    score_rank[2] = tmp_score;
+    color_rank[2] = tmp_color;
+  }
+  if (score_rank[1] < score_rank[2]) {
+    const int tmp_score = score_rank[1];
+    const int tmp_color = color_rank[1];
+    score_rank[1] = score_rank[2];
+    color_rank[1] = color_rank[2];
+    score_rank[2] = tmp_score;
+    color_rank[2] = tmp_color;
+  }
+
+  if (color_idx != NULL) {
+    // If any of the neighbor color has higher index than current color index,
+    // then we move up by 1 unless the current color is the same as one of the
+    // neighbor
+    const int current_color = *color_idx = color_map[r * stride + c];
+    int same_neighbor = -1;
+    for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; idx++) {
+      if (color_rank[idx] > current_color) {
+        (*color_idx)++;
+      } else if (color_rank[idx] == current_color) {
+        same_neighbor = idx;
+      }
+    }
+    if (same_neighbor != -1) {
+      *color_idx = same_neighbor;
+    }
+  }
+
+  // Get hash value of context.
+  int color_index_ctx_hash = 0;
+  static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+  for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; ++idx) {
+    color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx];
+  }
+  assert(color_index_ctx_hash > 0);
+  assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+  // Lookup context from hash.
+  const int color_index_ctx =
+      palette_color_index_context_lookup[color_index_ctx_hash];
+  assert(color_index_ctx >= 0);
+  assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+  return color_index_ctx;
+}
 #undef NUM_PALETTE_NEIGHBORS
 #undef MAX_COLOR_CONTEXT_HASH
 
@@ -1020,7 +1127,7 @@
   av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
   av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
   av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
-  av1_copy(fc->skip_cdfs, default_skip_cdfs);
+  av1_copy(fc->skip_txfm_cdfs, default_skip_txfm_cdfs);
   av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf);
   for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++)
     av1_copy(fc->seg.spatial_pred_seg_cdf[i],
@@ -1086,9 +1193,10 @@
   // Features disabled, 0, with delta coding (Default state).
   av1_clearall_segfeatures(&cm->seg);
 
-  if (cm->cur_frame->seg_map)
+  if (cm->cur_frame->seg_map) {
     memset(cm->cur_frame->seg_map, 0,
-           (cm->mi_params.mi_rows * cm->mi_params.mi_cols));
+           (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
+  }
 
   // reset mode ref deltas
   av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index bbbf55d..59f249b 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -121,8 +121,8 @@
   aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob skip_mode_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
+  aom_cdf_prob skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)];
+  aom_cdf_prob skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)];
   nmv_context nmvc;
   nmv_context ndvc;
@@ -205,6 +205,11 @@
                                         int r, int c, int palette_size,
                                         uint8_t *color_order, int *color_idx);
 
+// A faster version of av1_get_palette_color_index_context used by the encoder
+// exploiting the fact that the encoder does not need to maintain a color order.
+int av1_fast_palette_color_index_context(const uint8_t *color_map, int stride,
+                                         int r, int c, int *color_idx);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 0c09a1b..9c2976b 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -22,6 +22,10 @@
 extern "C" {
 #endif
 
+/*! @file */
+
+/*!\cond */
+
 #undef MAX_SB_SIZE
 
 // Max superblock size
@@ -408,6 +412,7 @@
   GLOBAL_GLOBALMV,
   NEW_NEWMV,
   MB_MODE_COUNT,
+  PRED_MODE_INVALID = MB_MODE_COUNT,
   INTRA_MODE_START = DC_PRED,
   INTRA_MODE_END = NEARESTMV,
   DIR_MODE_START = V_PRED,
@@ -636,15 +641,21 @@
 // NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum.
 typedef int8_t MV_REFERENCE_FRAME;
 
-enum {
-  RESTORE_NONE,
-  RESTORE_WIENER,
-  RESTORE_SGRPROJ,
-  RESTORE_SWITCHABLE,
-  RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
-  RESTORE_TYPES = 4,
-} UENUM1BYTE(RestorationType);
+/*!\endcond */
 
+/*!\enum RestorationType
+ * \brief This enumeration defines various restoration types supported
+ */
+typedef enum {
+  RESTORE_NONE,       /**< No restoration */
+  RESTORE_WIENER,     /**< Separable Wiener restoration */
+  RESTORE_SGRPROJ,    /**< Selfguided restoration */
+  RESTORE_SWITCHABLE, /**< Switchable restoration */
+  RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, /**< Num Switchable types */
+  RESTORE_TYPES = 4,                             /**< Num Restore types */
+} RestorationType;
+
+/*!\cond */
 // Picture prediction structures (0-12 are predefined) in scalability metadata.
 enum {
   SCALABILITY_L1T2 = 0,
@@ -671,6 +682,8 @@
 #define MAX_EXTERNAL_REFERENCES 128
 #define MAX_TILES 512
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/common/filter.h b/av1/common/filter.h
index 91791d3..b73421c 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -32,6 +32,9 @@
   EIGHTTAP_SMOOTH,
   MULTITAP_SHARP,
   BILINEAR,
+  // Encoder side only filters
+  MULTITAP_SHARP2,
+
   INTERP_FILTERS_ALL,
   SWITCHABLE_FILTERS = BILINEAR,
   SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
@@ -102,7 +105,6 @@
 typedef struct InterpFilterParams {
   const int16_t *filter_ptr;
   uint16_t taps;
-  uint16_t subpel_shifts;
   InterpFilter interp_filter;
 } InterpFilterParams;
 
@@ -154,16 +156,38 @@
   { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
 };
 
+DECLARE_ALIGNED(256, static const int16_t,
+                av1_sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = {
+  { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
+  { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 },
+  { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 },
+  { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 },
+  { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 },
+  { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 },
+  { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 },
+  { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2 },
+  { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2 },
+  { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2 },
+  { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 },
+  { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 },
+  { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 },
+  { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 },
+  { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 },
+  { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 }
+};
+
 static const InterpFilterParams
-    av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
-      { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_REGULAR },
+    av1_interp_filter_params_list[INTERP_FILTERS_ALL] = {
+      { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, EIGHTTAP_REGULAR },
       { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
-        SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
-      { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+        EIGHTTAP_SMOOTH },
+      { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS,
         MULTITAP_SHARP },
-      { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        BILINEAR }
+      { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR },
+
+      // The following filters are for encoder only, and now they are used in
+      // temporal filtering. The predictor block size >= 16 in temporal filter.
+      { (const int16_t *)av1_sub_pel_filters_12sharp, 12, MULTITAP_SHARP2 },
     };
 
 // A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel
@@ -175,7 +199,7 @@
 };
 
 static const InterpFilterParams av1_intrabc_filter_params = {
-  av1_intrabc_bilinear_filter, 2, 0, BILINEAR
+  av1_intrabc_bilinear_filter, 2, BILINEAR
 };
 
 DECLARE_ALIGNED(256, static const InterpKernel,
@@ -213,14 +237,11 @@
 
 // For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
 static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
-  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    EIGHTTAP_REGULAR },
-  { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS,
     EIGHTTAP_SMOOTH },
-  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    EIGHTTAP_REGULAR },
-  { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    BILINEAR },
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR },
 };
 
 static INLINE const InterpFilterParams *
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index db3098c..04e050a 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -160,7 +160,7 @@
 
   for (int i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
-    const int candidate_bsize = candidate->sb_type;
+    const int candidate_bsize = candidate->bsize;
     const int n4_w = mi_size_wide[candidate_bsize];
     int len = AOMMIN(xd->width, n4_w);
     if (use_step_16)
@@ -207,7 +207,7 @@
   for (i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate =
         xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
-    const int candidate_bsize = candidate->sb_type;
+    const int candidate_bsize = candidate->bsize;
     const int n4_h = mi_size_high[candidate_bsize];
     int len = AOMMIN(xd->height, n4_h);
     if (use_step_16)
@@ -285,15 +285,17 @@
     bs <<= 1;
   }
 
-  // The left hand of two vertical rectangles always has a top right (as the
-  // block above will have been decoded)
-  if (xd->width < xd->height)
-    if (!xd->is_sec_rect) has_tr = 1;
+  // In a VERTICAL or VERTICAL_4 partition, all partition before the last one
+  // always have a top right (as the block above will have been decoded).
+  if (xd->width < xd->height) {
+    if (!xd->is_last_vertical_rect) has_tr = 1;
+  }
 
-  // The bottom of two horizontal rectangles never has a top right (as the block
-  // to the right won't have been decoded)
-  if (xd->width > xd->height)
-    if (xd->is_sec_rect) has_tr = 0;
+  // In a HORIZONTAL or HORIZONTAL_4 partition, partitions after the first one
+  // never have a top right (as the block to the right won't have been decoded).
+  if (xd->width > xd->height) {
+    if (!xd->is_first_horizontal_rect) has_tr = 0;
+  }
 
   // The bottom left square of a Vertical A (in the old format) does
   // not have a top right as it is decoded before the right hand
@@ -686,14 +688,14 @@
         const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
         process_compound_ref_mv_candidate(
             candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
-        idx += mi_size_wide[candidate->sb_type];
+        idx += mi_size_wide[candidate->bsize];
       }
 
       for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
         const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
         process_compound_ref_mv_candidate(
             candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
-        idx += mi_size_high[candidate->sb_type];
+        idx += mi_size_high[candidate->bsize];
       }
 
       // Build up the compound mv predictor
@@ -750,7 +752,7 @@
       const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
       process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
                                       ref_mv_stack, ref_mv_weight);
-      idx += mi_size_wide[candidate->sb_type];
+      idx += mi_size_wide[candidate->bsize];
     }
 
     for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
@@ -758,7 +760,7 @@
       const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
       process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
                                       ref_mv_stack, ref_mv_weight);
-      idx += mi_size_high[candidate->sb_type];
+      idx += mi_size_high[candidate->bsize];
     }
 
     for (int idx = 0; idx < *refmv_count; ++idx) {
@@ -795,7 +797,7 @@
       global_mvs[ref_frame].as_int = INVALID_MV;
     }
   } else {
-    const BLOCK_SIZE bsize = mi->sb_type;
+    const BLOCK_SIZE bsize = mi->bsize;
     const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
     const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
     if (ref_frame < REF_FRAMES) {
@@ -1050,15 +1052,15 @@
 static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts,
                                   int *pts_inref, int row_offset, int sign_r,
                                   int col_offset, int sign_c) {
-  int bw = block_size_wide[mbmi->sb_type];
-  int bh = block_size_high[mbmi->sb_type];
-  int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1;
-  int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1;
+  const int bw = block_size_wide[mbmi->bsize];
+  const int bh = block_size_high[mbmi->bsize];
+  const int x = col_offset * MI_SIZE + sign_c * bw / 2 - 1;
+  const int y = row_offset * MI_SIZE + sign_r * bh / 2 - 1;
 
   pts[0] = GET_MV_SUBPEL(x);
   pts[1] = GET_MV_SUBPEL(y);
-  pts_inref[0] = GET_MV_SUBPEL(x) + mbmi->mv[0].as_mv.col;
-  pts_inref[1] = GET_MV_SUBPEL(y) + mbmi->mv[0].as_mv.row;
+  pts_inref[0] = pts[0] + mbmi->mv[0].as_mv.col;
+  pts_inref[1] = pts[1] + mbmi->mv[0].as_mv.row;
 }
 
 // Select samples according to the motion vector difference.
@@ -1067,44 +1069,22 @@
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
-  int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 };
-  int i, j, k, l = len;
   uint8_t ret = 0;
   assert(len <= LEAST_SQUARES_SAMPLES_MAX);
 
-  // Obtain the motion vector difference.
-  for (i = 0; i < len; ++i) {
-    pts_mvd[i] = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
-                 abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
-
-    if (pts_mvd[i] > thresh)
-      pts_mvd[i] = -1;
-    else
-      ret++;
+  // Only keep the samples with MV differences within threshold.
+  for (int i = 0; i < len; ++i) {
+    const int diff = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
+                     abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
+    if (diff > thresh) continue;
+    if (ret != i) {
+      memcpy(pts + 2 * ret, pts + 2 * i, 2 * sizeof(pts[0]));
+      memcpy(pts_inref + 2 * ret, pts_inref + 2 * i, 2 * sizeof(pts_inref[0]));
+    }
+    ++ret;
   }
-
   // Keep at least 1 sample.
-  if (!ret) return 1;
-
-  i = 0;
-  j = l - 1;
-  for (k = 0; k < l - ret; k++) {
-    while (pts_mvd[i] != -1) i++;
-    while (pts_mvd[j] == -1) j--;
-    assert(i != j);
-    if (i > j) break;
-
-    // Replace the discarded samples;
-    pts_mvd[i] = pts_mvd[j];
-    pts[2 * i] = pts[2 * j];
-    pts[2 * i + 1] = pts[2 * j + 1];
-    pts_inref[2 * i] = pts_inref[2 * j];
-    pts_inref[2 * i + 1] = pts_inref[2 * j + 1];
-    i++;
-    j--;
-  }
-
-  return ret;
+  return AOMMAX(ret, 1);
 }
 
 // Note: Samples returned are at 1/8-pel precision
@@ -1116,7 +1096,6 @@
   const int ref_frame = mbmi0->ref_frame[0];
   const int up_available = xd->up_available;
   const int left_available = xd->left_available;
-  int i, mi_step;
   uint8_t np = 0;
   int do_tl = 1;
   int do_tr = 1;
@@ -1128,7 +1107,7 @@
   if (up_available) {
     const int mi_row_offset = -1;
     const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
-    uint8_t superblock_width = mi_size_wide[mbmi->sb_type];
+    uint8_t superblock_width = mi_size_wide[mbmi->bsize];
 
     if (xd->width <= superblock_width) {
       // Handle "current block width <= above block width" case.
@@ -1141,24 +1120,22 @@
         record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
         pts += 2;
         pts_inref += 2;
-        np++;
-        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     } else {
       // Handle "current block width > above block width" case.
-      for (i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
-           i += mi_step) {
+      for (int i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+           i += superblock_width) {
         mbmi = xd->mi[i + mi_row_offset * mi_stride];
-        superblock_width = mi_size_wide[mbmi->sb_type];
-        mi_step = AOMMIN(xd->width, superblock_width);
+        superblock_width = mi_size_wide[mbmi->bsize];
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
           record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
           pts += 2;
           pts_inref += 2;
-          np++;
-          if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
+            return LEAST_SQUARES_SAMPLES_MAX;
         }
       }
     }
@@ -1169,7 +1146,7 @@
   if (left_available) {
     const int mi_col_offset = -1;
     const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
-    uint8_t superblock_height = mi_size_high[mbmi->sb_type];
+    uint8_t superblock_height = mi_size_high[mbmi->bsize];
 
     if (xd->height <= superblock_height) {
       // Handle "current block height <= above block height" case.
@@ -1186,19 +1163,18 @@
       }
     } else {
       // Handle "current block height > above block height" case.
-      for (i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
-           i += mi_step) {
+      for (int i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
+           i += superblock_height) {
         mbmi = xd->mi[mi_col_offset + i * mi_stride];
-        superblock_height = mi_size_high[mbmi->sb_type];
-        mi_step = AOMMIN(xd->height, superblock_height);
+        superblock_height = mi_size_high[mbmi->bsize];
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
           record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
           pts += 2;
           pts_inref += 2;
-          np++;
-          if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
+            return LEAST_SQUARES_SAMPLES_MAX;
         }
       }
     }
@@ -1215,8 +1191,7 @@
       record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
       pts += 2;
       pts_inref += 2;
-      np++;
-      if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
     }
   }
   assert(np <= LEAST_SQUARES_SAMPLES_MAX);
@@ -1234,8 +1209,7 @@
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
         record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1);
-        np++;
-        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     }
   }
diff --git a/av1/common/obmc.h b/av1/common/obmc.h
index cc97b6b..b840345 100644
--- a/av1/common/obmc.h
+++ b/av1/common/obmc.h
@@ -35,7 +35,7 @@
        above_mi_col += mi_step) {
     MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
     mi_step =
-        AOMMIN(mi_size_wide[above_mi[0]->sb_type], mi_size_wide[BLOCK_64X64]);
+        AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]);
     // If we're considering a block with width 4, it should be treated as
     // half of a pair of blocks with chroma information in the second. Move
     // above_mi_col back to the start of the pair if needed, set above_mbmi
@@ -72,7 +72,7 @@
        left_mi_row += mi_step) {
     MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
     mi_step =
-        AOMMIN(mi_size_high[left_mi[0]->sb_type], mi_size_high[BLOCK_64X64]);
+        AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]);
     if (mi_step == 1) {
       left_mi_row &= ~1;
       left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
diff --git a/av1/common/obu_util.c b/av1/common/obu_util.c
index 7d2694b..cfca03b 100644
--- a/av1/common/obu_util.c
+++ b/av1/common/obu_util.c
@@ -14,24 +14,6 @@
 
 #include "aom_dsp/bitreader_buffer.h"
 
-// Returns 1 when OBU type is valid, and 0 otherwise.
-static int valid_obu_type(int obu_type) {
-  int valid_type = 0;
-  switch (obu_type) {
-    case OBU_SEQUENCE_HEADER:
-    case OBU_TEMPORAL_DELIMITER:
-    case OBU_FRAME_HEADER:
-    case OBU_TILE_GROUP:
-    case OBU_METADATA:
-    case OBU_FRAME:
-    case OBU_REDUNDANT_FRAME_HEADER:
-    case OBU_TILE_LIST:
-    case OBU_PADDING: valid_type = 1; break;
-    default: break;
-  }
-  return valid_type;
-}
-
 static aom_codec_err_t read_obu_size(const uint8_t *data,
                                      size_t bytes_available,
                                      size_t *const obu_size,
@@ -63,9 +45,6 @@
   }
 
   header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
-
-  if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
-
   header->has_extension = aom_rb_read_bit(rb);
   header->has_size_field = aom_rb_read_bit(rb);
 
@@ -74,10 +53,8 @@
     return AOM_CODEC_UNSUP_BITSTREAM;
   }
 
-  if (aom_rb_read_bit(rb) != 0) {
-    // obu_reserved_1bit must be set to 0.
-    return AOM_CODEC_CORRUPT_FRAME;
-  }
+  // obu_reserved_1bit must be set to 0. The value is ignored by a decoder.
+  aom_rb_read_bit(rb);
 
   if (header->has_extension) {
     if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
@@ -85,10 +62,12 @@
     header->size += 1;
     header->temporal_layer_id = aom_rb_read_literal(rb, 3);
     header->spatial_layer_id = aom_rb_read_literal(rb, 2);
-    if (aom_rb_read_literal(rb, 3) != 0) {
-      // extension_header_reserved_3bits must be set to 0.
-      return AOM_CODEC_CORRUPT_FRAME;
-    }
+    // extension_header_reserved_3bits must be set to 0. The value is ignored by
+    // a decoder.
+    aom_rb_read_literal(rb, 3);
+  } else {
+    header->temporal_layer_id = 0;
+    header->spatial_layer_id = 0;
   }
 
   return AOM_CODEC_OK;
diff --git a/av1/common/obu_util.h b/av1/common/obu_util.h
index 7c56904..adf3568 100644
--- a/av1/common/obu_util.h
+++ b/av1/common/obu_util.h
@@ -22,9 +22,9 @@
                 // optional OBU extension header) in the bitstream.
   OBU_TYPE type;
   int has_size_field;
-  int has_extension;
-  // The following fields come from the OBU extension header and therefore are
-  // only used if has_extension is true.
+  int has_extension;  // Whether the optional OBU extension header is present.
+  // The following fields come from the OBU extension header. They are set to 0
+  // if has_extension is false.
   int temporal_layer_id;
   int spatial_layer_id;
 } ObuHeader;
diff --git a/av1/common/pred_common.h b/av1/common/pred_common.h
index d1dab97..12bcce8 100644
--- a/av1/common/pred_common.h
+++ b/av1/common/pred_common.h
@@ -169,12 +169,12 @@
   return above_skip_mode + left_skip_mode;
 }
 
-static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
+static INLINE int av1_get_skip_txfm_context(const MACROBLOCKD *xd) {
   const MB_MODE_INFO *const above_mi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mi = xd->left_mbmi;
-  const int above_skip = above_mi ? above_mi->skip : 0;
-  const int left_skip = left_mi ? left_mi->skip : 0;
-  return above_skip + left_skip;
+  const int above_skip_txfm = above_mi ? above_mi->skip_txfm : 0;
+  const int left_skip_txfm = left_mi ? left_mi->skip_txfm : 0;
+  return above_skip_txfm + left_skip_txfm;
 }
 
 int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
@@ -208,8 +208,8 @@
   return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)];
 }
 
-static INLINE aom_cdf_prob *av1_get_skip_cdf(const MACROBLOCKD *xd) {
-  return xd->tile_ctx->skip_cdfs[av1_get_skip_context(xd)];
+static INLINE aom_cdf_prob *av1_get_skip_txfm_cdf(const MACROBLOCKD *xd) {
+  return xd->tile_ctx->skip_txfm_cdfs[av1_get_skip_txfm_context(xd)];
 }
 
 int av1_get_comp_reference_type_context(const MACROBLOCKD *xd);
@@ -340,7 +340,7 @@
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->sb_type];
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->bsize];
   const int max_tx_wide = tx_size_wide[max_tx_size];
   const int max_tx_high = tx_size_high[max_tx_size];
   const int has_above = xd->up_available;
@@ -351,11 +351,11 @@
 
   if (has_above)
     if (is_inter_block(above_mbmi))
-      above = block_size_wide[above_mbmi->sb_type] >= max_tx_wide;
+      above = block_size_wide[above_mbmi->bsize] >= max_tx_wide;
 
   if (has_left)
     if (is_inter_block(left_mbmi))
-      left = block_size_high[left_mbmi->sb_type] >= max_tx_high;
+      left = block_size_high[left_mbmi->bsize] >= max_tx_high;
 
   if (has_above && has_left)
     return (above + left);
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 287addd..449a00f 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -108,17 +108,6 @@
     inter_pred_params->mode = WARP_PRED;
 }
 
-void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
-                        const INTERINTER_COMPOUND_DATA *mask_comp) {
-  inter_pred_params->sb_type = bsize;
-  inter_pred_params->mask_comp = *mask_comp;
-
-  if (inter_pred_params->conv_params.compound_index == 1) {
-    inter_pred_params->conv_params.do_average = 0;
-    inter_pred_params->comp_mode = MASK_COMP;
-  }
-}
-
 void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride,
                               InterPredParams *inter_pred_params,
@@ -126,8 +115,33 @@
   assert(IMPLIES(inter_pred_params->conv_params.is_compound,
                  inter_pred_params->conv_params.dst != NULL));
 
+  if (inter_pred_params->mode == TRANSLATION_PRED) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (inter_pred_params->use_hbd_buf) {
+      highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                             inter_pred_params->block_width,
+                             inter_pred_params->block_height,
+                             &inter_pred_params->conv_params,
+                             inter_pred_params->interp_filter_params,
+                             inter_pred_params->bit_depth);
+    } else {
+      inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                      inter_pred_params->block_width,
+                      inter_pred_params->block_height,
+                      &inter_pred_params->conv_params,
+                      inter_pred_params->interp_filter_params);
+    }
+#else
+    inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                    inter_pred_params->block_width,
+                    inter_pred_params->block_height,
+                    &inter_pred_params->conv_params,
+                    inter_pred_params->interp_filter_params);
+#endif
+  }
+#if !CONFIG_REALTIME_ONLY
   // TODO(jingning): av1_warp_plane() can be further cleaned up.
-  if (inter_pred_params->mode == WARP_PRED) {
+  else if (inter_pred_params->mode == WARP_PRED) {
     av1_warp_plane(
         &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
         inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
@@ -138,30 +152,8 @@
         inter_pred_params->block_width, inter_pred_params->block_height,
         dst_stride, inter_pred_params->subsampling_x,
         inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
-  } else if (inter_pred_params->mode == TRANSLATION_PRED) {
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (inter_pred_params->use_hbd_buf) {
-      highbd_inter_predictor(
-          src, src_stride, dst, dst_stride, subpel_params,
-          inter_pred_params->scale_factors, inter_pred_params->block_width,
-          inter_pred_params->block_height, &inter_pred_params->conv_params,
-          inter_pred_params->interp_filter_params,
-          inter_pred_params->bit_depth);
-    } else {
-      inter_predictor(
-          src, src_stride, dst, dst_stride, subpel_params,
-          inter_pred_params->scale_factors, inter_pred_params->block_width,
-          inter_pred_params->block_height, &inter_pred_params->conv_params,
-          inter_pred_params->interp_filter_params);
-    }
-#else
-    inter_predictor(
-        src, src_stride, dst, dst_stride, subpel_params,
-        inter_pred_params->scale_factors, inter_pred_params->block_width,
-        inter_pred_params->block_height, &inter_pred_params->conv_params,
-        inter_pred_params->interp_filter_params);
-#endif
   }
+#endif
 }
 
 static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
@@ -328,14 +320,12 @@
 
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
-  assert(is_masked_compound_type(comp_data->type));
   (void)sb_type;
   switch (comp_data->type) {
     case COMPOUND_WEDGE:
       return av1_get_contiguous_soft_mask(comp_data->wedge_index,
                                           comp_data->wedge_sign, sb_type);
-    case COMPOUND_DIFFWTD: return comp_data->seg_mask;
-    default: assert(0); return NULL;
+    default: return comp_data->seg_mask;
   }
 }
 
@@ -373,21 +363,6 @@
   }
 }
 
-static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse,
-                                    int mask_base, const uint8_t *src0,
-                                    int src0_stride, const uint8_t *src1,
-                                    int src1_stride, int h, int w) {
-  int i, j, m, diff;
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      diff =
-          abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
-      m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
-      mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
-    }
-  }
-}
-
 void av1_build_compound_diffwtd_mask_c(uint8_t *mask,
                                        DIFFWTD_MASK_TYPE mask_type,
                                        const uint8_t *src0, int src0_stride,
@@ -395,90 +370,29 @@
                                        int h, int w) {
   switch (mask_type) {
     case DIFFWTD_38:
-      diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
+      av1_diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
       break;
     case DIFFWTD_38_INV:
-      diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
+      av1_diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
       break;
     default: assert(0);
   }
 }
 
-static AOM_FORCE_INLINE void diffwtd_mask_highbd(
-    uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0,
-    int src0_stride, const uint16_t *src1, int src1_stride, int h, int w,
-    const unsigned int bd) {
-  assert(bd >= 8);
-  if (bd == 8) {
-    if (which_inverse) {
-      for (int i = 0; i < h; ++i) {
-        for (int j = 0; j < w; ++j) {
-          int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
-          unsigned int m = negative_to_zero(mask_base + diff);
-          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
-          mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
-        }
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += w;
-      }
-    } else {
-      for (int i = 0; i < h; ++i) {
-        for (int j = 0; j < w; ++j) {
-          int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
-          unsigned int m = negative_to_zero(mask_base + diff);
-          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
-          mask[j] = m;
-        }
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += w;
-      }
-    }
-  } else {
-    const unsigned int bd_shift = bd - 8;
-    if (which_inverse) {
-      for (int i = 0; i < h; ++i) {
-        for (int j = 0; j < w; ++j) {
-          int diff =
-              (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
-          unsigned int m = negative_to_zero(mask_base + diff);
-          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
-          mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
-        }
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += w;
-      }
-    } else {
-      for (int i = 0; i < h; ++i) {
-        for (int j = 0; j < w; ++j) {
-          int diff =
-              (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
-          unsigned int m = negative_to_zero(mask_base + diff);
-          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
-          mask[j] = m;
-        }
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += w;
-      }
-    }
-  }
-}
-
 void av1_build_compound_diffwtd_mask_highbd_c(
     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
     int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
     int bd) {
   switch (mask_type) {
     case DIFFWTD_38:
-      diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
-                          CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
+      av1_diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0),
+                              src0_stride, CONVERT_TO_SHORTPTR(src1),
+                              src1_stride, h, w, bd);
       break;
     case DIFFWTD_38_INV:
-      diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
-                          CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
+      av1_diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0),
+                              src0_stride, CONVERT_TO_SHORTPTR(src1),
+                              src1_stride, h, w, bd);
       break;
     default: assert(0);
   }
@@ -543,13 +457,13 @@
     int w;
     for (w = 0; w < wtypes; ++w) {
       mask = get_wedge_mask_inplace(w, 0, bsize);
-      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
                         bh);
       wedge_params->masks[0][w] = dst;
       dst += bw * bh;
 
       mask = get_wedge_mask_inplace(w, 1, bsize);
-      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
                         bh);
       wedge_params->masks[1][w] = dst;
       dst += bw * bh;
@@ -662,10 +576,10 @@
 #endif
 }
 
-void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
-                                     uint8_t *dst, int dst_stride,
-                                     InterPredParams *inter_pred_params,
-                                     const SubpelParams *subpel_params) {
+static void make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+                                        uint8_t *dst, int dst_stride,
+                                        InterPredParams *inter_pred_params,
+                                        const SubpelParams *subpel_params) {
   const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp;
   BLOCK_SIZE sb_type = inter_pred_params->sb_type;
 
@@ -705,208 +619,20 @@
 void av1_build_one_inter_predictor(
     uint8_t *dst, int dst_stride, const MV *const src_mv,
     InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
-    int ref, CalcSubpelParamsFunc calc_subpel_params_func) {
+    int ref, uint8_t **mc_buf, CalcSubpelParamsFunc calc_subpel_params_func) {
   SubpelParams subpel_params;
   uint8_t *src;
   int src_stride;
-  calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, &src,
-                          &subpel_params, &src_stride);
+  calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref,
+                          mc_buf, &src, &subpel_params, &src_stride);
 
   if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
       inter_pred_params->comp_mode == UNIFORM_COMP) {
     av1_make_inter_predictor(src, src_stride, dst, dst_stride,
                              inter_pred_params, &subpel_params);
   } else {
-    av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride,
-                                    inter_pred_params, &subpel_params);
-  }
-}
-
-// True if the following hold:
-//  1. Not intrabc and not build_for_obmc
-//  2. A U or V plane
-//  3. If the block size differs from the base block size
-//  4. If sub-sampled, none of the previous blocks around the sub-sample
-//     are intrabc or inter-blocks
-static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
-                            int is_intrabc, int build_for_obmc) {
-  if (is_intrabc || build_for_obmc) {
-    return false;
-  }
-
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int ss_x = pd->subsampling_x;
-  const int ss_y = pd->subsampling_y;
-  if ((block_size_wide[bsize] >= 8 || !ss_x) &&
-      (block_size_high[bsize] >= 8 || !ss_y)) {
-    return false;
-  }
-
-  // For sub8x8 chroma blocks, we may be covering more than one luma block's
-  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
-  // the top-left corner of the prediction source - the correct top-left corner
-  // is at (pre_x, pre_y).
-  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
-  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
-
-  for (int row = row_start; row <= 0; ++row) {
-    for (int col = col_start; col <= 0; ++col) {
-      const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-      if (!is_inter_block(this_mbmi)) return false;
-      if (is_intrabc_block(this_mbmi)) return false;
-    }
-  }
-  return true;
-}
-
-static void build_inter_predictors_sub8x8(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
-    int bw, int bh, int mi_x, int mi_y,
-    CalcSubpelParamsFunc calc_subpel_params_func) {
-  const BLOCK_SIZE bsize = mi->sb_type;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const bool ss_x = pd->subsampling_x;
-  const bool ss_y = pd->subsampling_y;
-  const int b4_w = block_size_wide[bsize] >> ss_x;
-  const int b4_h = block_size_high[bsize] >> ss_y;
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
-  const int b8_w = block_size_wide[plane_bsize];
-  const int b8_h = block_size_high[plane_bsize];
-  const int is_compound = has_second_ref(mi);
-  assert(!is_compound);
-  assert(!is_intrabc_block(mi));
-
-  // For sub8x8 chroma blocks, we may be covering more than one luma block's
-  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
-  // the top-left corner of the prediction source - the correct top-left corner
-  // is at (pre_x, pre_y).
-  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
-  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
-  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
-  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
-  int row = row_start;
-  for (int y = 0; y < b8_h; y += b4_h) {
-    int col = col_start;
-    for (int x = 0; x < b8_w; x += b4_w) {
-      MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-      int tmp_dst_stride = 8;
-      assert(bw < 8 || bh < 8);
-      (void)bw;
-      (void)bh;
-      struct buf_2d *const dst_buf = &pd->dst;
-      uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
-      int ref = 0;
-      const RefCntBuffer *ref_buf =
-          get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
-      const struct scale_factors *ref_scale_factors =
-          get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
-      const struct scale_factors *const sf = ref_scale_factors;
-      const struct buf_2d pre_buf = {
-        NULL,
-        (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,
-        ref_buf->buf.uv_crop_width,
-        ref_buf->buf.uv_crop_height,
-        ref_buf->buf.uv_stride,
-      };
-
-      const MV mv = this_mbmi->mv[ref].as_mv;
-
-      InterPredParams inter_pred_params;
-      av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
-                            pre_x + x, pd->subsampling_x, pd->subsampling_y,
-                            xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
-                            &pre_buf, this_mbmi->interp_filters);
-      inter_pred_params.conv_params = get_conv_params_no_round(
-          ref, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
-      inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0;
-
-      av1_build_one_inter_predictor(dst, dst_buf->stride, &mv,
-                                    &inter_pred_params, xd, mi_x + x, mi_y + y,
-                                    ref, calc_subpel_params_func);
-
-      ++col;
-    }
-    ++row;
-  }
-}
-
-static void build_inter_predictors_8x8_and_bigger(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
-    int build_for_obmc, int bw, int bh, int mi_x, int mi_y,
-    CalcSubpelParamsFunc calc_subpel_params_func) {
-  const int is_compound = has_second_ref(mi);
-  const int is_intrabc = is_intrabc_block(mi);
-  assert(IMPLIES(is_intrabc, !is_compound));
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  struct buf_2d *const dst_buf = &pd->dst;
-  uint8_t *const dst = dst_buf->buf;
-
-  int is_global[2] = { 0, 0 };
-  for (int ref = 0; ref < 1 + is_compound; ++ref) {
-    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
-  }
-
-  const BLOCK_SIZE bsize = mi->sb_type;
-  const int ss_x = pd->subsampling_x;
-  const int ss_y = pd->subsampling_y;
-  const int row_start =
-      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
-  const int col_start =
-      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
-  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
-  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
-  for (int ref = 0; ref < 1 + is_compound; ++ref) {
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
-    struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-    const MV mv = mi->mv[ref].as_mv;
-    const WarpTypesAllowed warp_types = { is_global[ref],
-                                          mi->motion_mode == WARPED_CAUSAL };
-
-    InterPredParams inter_pred_params;
-    av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
-                          pd->subsampling_x, pd->subsampling_y, xd->bd,
-                          is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
-                          mi->interp_filters);
-    if (is_compound) av1_init_comp_mode(&inter_pred_params);
-    inter_pred_params.conv_params = get_conv_params_no_round(
-        ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
-
-    av1_dist_wtd_comp_weight_assign(
-        cm, mi, 0, &inter_pred_params.conv_params.fwd_offset,
-        &inter_pred_params.conv_params.bck_offset,
-        &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
-
-    if (!build_for_obmc)
-      av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
-
-    if (is_masked_compound_type(mi->interinter_comp.type)) {
-      av1_init_mask_comp(&inter_pred_params, mi->sb_type, &mi->interinter_comp);
-      // Assign physical buffer.
-      inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
-    }
-
-    av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
-                                  xd, mi_x, mi_y, ref, calc_subpel_params_func);
-  }
-}
-
-void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                int plane, const MB_MODE_INFO *mi,
-                                int build_for_obmc, int bw, int bh, int mi_x,
-                                int mi_y,
-                                CalcSubpelParamsFunc calc_subpel_params_func) {
-  if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi),
-                      build_for_obmc)) {
-    build_inter_predictors_sub8x8(cm, xd, plane, mi, bw, bh, mi_x, mi_y,
-                                  calc_subpel_params_func);
-  } else {
-    build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
-                                          bh, mi_x, mi_y,
-                                          calc_subpel_params_func);
+    make_masked_inter_predictor(src, src_stride, dst, dst_stride,
+                                inter_pred_params, &subpel_params);
   }
 }
 
@@ -917,6 +643,8 @@
                                      int is_compound) {
   assert(fwd_offset != NULL && bck_offset != NULL);
   if (!is_compound || mbmi->compound_idx) {
+    *fwd_offset = 8;
+    *bck_offset = 8;
     *use_dist_wtd_comp_avg = 0;
     return;
   }
@@ -958,6 +686,195 @@
   *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
 }
 
+// True if the following hold:
+//  1. Not intrabc and not build_for_obmc
+//  2. At least one dimension is size 4 with subsampling
+//  3. If sub-sampled, none of the previous blocks around the sub-sample
+//     are intrabc or inter-blocks
+static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
+                            int is_intrabc, int build_for_obmc) {
+  if (is_intrabc || build_for_obmc) {
+    return false;
+  }
+
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
+  const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
+  if (!is_sub4_x && !is_sub4_y) {
+    return false;
+  }
+
+  // For sub8x8 chroma blocks, we may be covering more than one luma block's
+  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+  // the top-left corner of the prediction source - the correct top-left corner
+  // is at (pre_x, pre_y).
+  const int row_start = is_sub4_y ? -1 : 0;
+  const int col_start = is_sub4_x ? -1 : 0;
+
+  for (int row = row_start; row <= 0; ++row) {
+    for (int col = col_start; col <= 0; ++col) {
+      const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+      if (!is_inter_block(this_mbmi)) return false;
+      if (is_intrabc_block(this_mbmi)) return false;
+    }
+  }
+  return true;
+}
+
+static void build_inter_predictors_sub8x8(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+    int mi_x, int mi_y, uint8_t **mc_buf,
+    CalcSubpelParamsFunc calc_subpel_params_func) {
+  const BLOCK_SIZE bsize = mi->bsize;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const bool ss_x = pd->subsampling_x;
+  const bool ss_y = pd->subsampling_y;
+  const int b4_w = block_size_wide[bsize] >> ss_x;
+  const int b4_h = block_size_high[bsize] >> ss_y;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  const int b8_w = block_size_wide[plane_bsize];
+  const int b8_h = block_size_high[plane_bsize];
+  const int is_compound = has_second_ref(mi);
+  assert(!is_compound);
+  assert(!is_intrabc_block(mi));
+
+  // For sub8x8 chroma blocks, we may be covering more than one luma block's
+  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+  // the top-left corner of the prediction source - the correct top-left corner
+  // is at (pre_x, pre_y).
+  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
+  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+  int row = row_start;
+  for (int y = 0; y < b8_h; y += b4_h) {
+    int col = col_start;
+    for (int x = 0; x < b8_w; x += b4_w) {
+      MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+      struct buf_2d *const dst_buf = &pd->dst;
+      uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+      int ref = 0;
+      const RefCntBuffer *ref_buf =
+          get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+      const struct scale_factors *ref_scale_factors =
+          get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
+      const struct scale_factors *const sf = ref_scale_factors;
+      const struct buf_2d pre_buf = {
+        NULL,
+        (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,
+        ref_buf->buf.uv_crop_width,
+        ref_buf->buf.uv_crop_height,
+        ref_buf->buf.uv_stride,
+      };
+
+      const MV mv = this_mbmi->mv[ref].as_mv;
+
+      InterPredParams inter_pred_params;
+      av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
+                            pre_x + x, pd->subsampling_x, pd->subsampling_y,
+                            xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
+                            &pre_buf, this_mbmi->interp_filters);
+      inter_pred_params.conv_params =
+          get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd);
+
+      av1_build_one_inter_predictor(dst, dst_buf->stride, &mv,
+                                    &inter_pred_params, xd, mi_x + x, mi_y + y,
+                                    ref, mc_buf, calc_subpel_params_func);
+
+      ++col;
+    }
+    ++row;
+  }
+}
+
+static void build_inter_predictors_8x8_and_bigger(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+    int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf,
+    CalcSubpelParamsFunc calc_subpel_params_func) {
+  const int is_compound = has_second_ref(mi);
+  const int is_intrabc = is_intrabc_block(mi);
+  assert(IMPLIES(is_intrabc, !is_compound));
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf;
+
+  int is_global[2] = { 0, 0 };
+  for (int ref = 0; ref < 1 + is_compound; ++ref) {
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+  }
+
+  const BLOCK_SIZE bsize = mi->bsize;
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const int row_start =
+      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+  const int col_start =
+      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+  for (int ref = 0; ref < 1 + is_compound; ++ref) {
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
+    struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+    const MV mv = mi->mv[ref].as_mv;
+    const WarpTypesAllowed warp_types = { is_global[ref],
+                                          mi->motion_mode == WARPED_CAUSAL };
+
+    InterPredParams inter_pred_params;
+    av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
+                          pd->subsampling_x, pd->subsampling_y, xd->bd,
+                          is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
+                          mi->interp_filters);
+    if (is_compound) av1_init_comp_mode(&inter_pred_params);
+    inter_pred_params.conv_params = get_conv_params_no_round(
+        ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+    av1_dist_wtd_comp_weight_assign(
+        cm, mi, 0, &inter_pred_params.conv_params.fwd_offset,
+        &inter_pred_params.conv_params.bck_offset,
+        &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
+
+    if (!build_for_obmc)
+      av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+    if (is_masked_compound_type(mi->interinter_comp.type)) {
+      inter_pred_params.sb_type = mi->bsize;
+      inter_pred_params.mask_comp = mi->interinter_comp;
+      if (ref == 1) {
+        inter_pred_params.conv_params.do_average = 0;
+        inter_pred_params.comp_mode = MASK_COMP;
+      }
+      // Assign physical buffer.
+      inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
+    }
+
+    av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
+                                  xd, mi_x, mi_y, ref, mc_buf,
+                                  calc_subpel_params_func);
+  }
+}
+
+void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                int plane, const MB_MODE_INFO *mi,
+                                int build_for_obmc, int bw, int bh, int mi_x,
+                                int mi_y, uint8_t **mc_buf,
+                                CalcSubpelParamsFunc calc_subpel_params_func) {
+  if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
+                      build_for_obmc)) {
+    assert(bw < 8 || bh < 8);
+    build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf,
+                                  calc_subpel_params_func);
+  } else {
+    build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
+                                          bh, mi_x, mi_y, mc_buf,
+                                          calc_subpel_params_func);
+  }
+}
 void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const int plane_start, const int plane_end) {
@@ -982,7 +899,7 @@
     for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
       struct macroblockd_plane *const pd = &xd->plane[i];
       const int is_uv = i > 0;
-      setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[i],
+      setup_pred_plane(&pd->pre[idx], xd->mi[0]->bsize, src->buffers[i],
                        src->crop_widths[is_uv], src->crop_heights[is_uv],
                        src->strides[is_uv], mi_row, mi_col, sf,
                        pd->subsampling_x, pd->subsampling_y);
@@ -1043,15 +960,15 @@
 void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) {
   MB_MODE_INFO *mbmi = xd->mi[0];
 
-  mbmi->overlappable_neighbors[0] = 0;
-  mbmi->overlappable_neighbors[1] = 0;
+  mbmi->overlappable_neighbors = 0;
 
-  if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return;
+  if (!is_motion_variation_allowed_bsize(mbmi->bsize)) return;
 
   foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr,
-                                &mbmi->overlappable_neighbors[0]);
+                                &mbmi->overlappable_neighbors);
+  if (mbmi->overlappable_neighbors) return;
   foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr,
-                               &mbmi->overlappable_neighbors[1]);
+                               &mbmi->overlappable_neighbors);
 }
 
 // HW does not support < 4x4 prediction. To limit the bandwidth requirement, if
@@ -1098,7 +1015,7 @@
   (void)rel_mi_row;
   (void)dir;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int overlap =
       AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
 
@@ -1137,7 +1054,7 @@
   (void)rel_mi_col;
   (void)dir;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int overlap =
       AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
 
@@ -1179,7 +1096,7 @@
                                      int above_stride[MAX_MB_PLANE],
                                      uint8_t *left[MAX_MB_PLANE],
                                      int left_stride[MAX_MB_PLANE]) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 
   // handle above row
   struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
@@ -1194,42 +1111,39 @@
                                build_obmc_inter_pred_left, &ctxt_left);
 }
 
-void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
-                                int mi_col_offset, MB_MODE_INFO *ref_mbmi,
-                                struct build_prediction_ctxt *ctxt,
-                                const int num_planes) {
-  const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type);
-  const int ref_mi_row = xd->mi_row + mi_row_offset;
-  const int ref_mi_col = xd->mi_col + mi_col_offset;
-
-  for (int plane = 0; plane < num_planes; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
-                     ctxt->tmp_width[plane], ctxt->tmp_height[plane],
-                     ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
-                     NULL, pd->subsampling_x, pd->subsampling_y);
+void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1,
+                             uint8_t **dst_buf2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    int len = sizeof(uint16_t);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+    dst_buf1[1] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+    dst_buf1[2] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+    dst_buf2[1] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+    dst_buf2[2] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+  } else {
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+    dst_buf1[0] = xd->tmp_obmc_bufs[0];
+    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+    dst_buf2[0] = xd->tmp_obmc_bufs[1];
+    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
+#if CONFIG_AV1_HIGHBITDEPTH
   }
-
-  const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
-
-  const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
-  const struct scale_factors *const sf =
-      get_ref_scale_factors_const(ctxt->cm, frame);
-
-  xd->block_ref_scale_factors[0] = sf;
-  if ((!av1_is_valid_scale(sf)))
-    aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                       "Reference frame has invalid dimensions");
-
-  av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
-                       num_planes);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 }
 
 void av1_setup_build_prediction_by_above_pred(
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
     MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
     const int num_planes) {
-  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
+  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->bsize);
   const int above_mi_col = xd->mi_col + rel_mi_col;
 
   av1_modify_neighbor_predictor_for_obmc(above_mbmi);
@@ -1268,7 +1182,7 @@
                                              MB_MODE_INFO *left_mbmi,
                                              struct build_prediction_ctxt *ctxt,
                                              const int num_planes) {
-  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
+  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->bsize);
   const int left_mi_row = xd->mi_row + rel_mi_row;
 
   av1_modify_neighbor_predictor_for_obmc(left_mbmi);
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index fe3c6a6..46d7d2f 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -17,6 +17,7 @@
 #include "av1/common/filter.h"
 #include "av1/common/warped_motion.h"
 #include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
 
 // Work out how many pixels off the edge of a reference frame we're allowed
 // to go when forming an inter prediction.
@@ -88,6 +89,7 @@
   int *tmp_height;
   int *tmp_stride;
   int mb_to_far_edge;
+  void *dcb;  // Decoder-only coding block.
 };
 
 typedef enum InterPredMode {
@@ -136,9 +138,6 @@
                           const WarpTypesAllowed *warp_types, int ref,
                           const MACROBLOCKD *xd, const MB_MODE_INFO *mi);
 
-void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
-                        const INTERINTER_COMPOUND_DATA *mask_comp);
-
 static INLINE int has_scale(int xs, int ys) {
   return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
 }
@@ -156,45 +155,42 @@
 
 static INLINE void inter_predictor(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
-    int h, ConvolveParams *conv_params,
-    const InterpFilterParams *interp_filters[2]) {
+    const SubpelParams *subpel_params, int w, int h,
+    ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
-  assert(sf);
   const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
   if (is_scaled) {
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, subpel_params->subpel_x,
                            subpel_params->xs, subpel_params->subpel_y,
-                           subpel_params->ys, 1, conv_params, sf);
+                           subpel_params->ys, 1, conv_params);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
-                           sp.ys, 0, conv_params, sf);
+                           sp.ys, 0, conv_params);
   }
 }
 
 static INLINE void highbd_inter_predictor(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
-    int h, ConvolveParams *conv_params,
-    const InterpFilterParams *interp_filters[2], int bd) {
+    const SubpelParams *subpel_params, int w, int h,
+    ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2],
+    int bd) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
-  assert(sf);
   const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
   if (is_scaled) {
     av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                                   interp_filters, subpel_params->subpel_x,
                                   subpel_params->xs, subpel_params->subpel_y,
-                                  subpel_params->ys, 1, conv_params, sf, bd);
+                                  subpel_params->ys, 1, conv_params, bd);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
     av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                                   interp_filters, sp.subpel_x, sp.xs,
-                                  sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
+                                  sp.subpel_y, sp.ys, 0, conv_params, bd);
   }
 }
 
@@ -241,27 +237,22 @@
                               InterPredParams *inter_pred_params,
                               const SubpelParams *subpel_params);
 
-void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
-                                     uint8_t *dst, int dst_stride,
-                                     InterPredParams *inter_pred_params,
-                                     const SubpelParams *subpel_params);
-
 typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv,
                                      InterPredParams *const inter_pred_params,
                                      MACROBLOCKD *xd, int mi_x, int mi_y,
-                                     int ref, uint8_t **pre,
+                                     int ref, uint8_t **mc_buf, uint8_t **pre,
                                      SubpelParams *subpel_params,
                                      int *src_stride);
 
 void av1_build_one_inter_predictor(
     uint8_t *dst, int dst_stride, const MV *const src_mv,
     InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
-    int ref, CalcSubpelParamsFunc calc_subpel_params_func);
+    int ref, uint8_t **mc_buf, CalcSubpelParamsFunc calc_subpel_params_func);
 
 void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 int plane, const MB_MODE_INFO *mi,
                                 int build_for_obmc, int bw, int bh, int mi_x,
-                                int mi_y,
+                                int mi_y, uint8_t **mc_buf,
                                 CalcSubpelParamsFunc calc_subpel_params_func);
 
 // TODO(jkoleszar): yet another mv clamping function :-(
@@ -343,10 +334,10 @@
   return 1;
 }
 
-void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
-                                int mi_col_offset, MB_MODE_INFO *ref_mbmi,
-                                struct build_prediction_ctxt *ctxt,
-                                const int num_planes);
+// Sets up buffers 'dst_buf1' and 'dst_buf2' from relevant buffers in 'xd' for
+// subsequent use in OBMC prediction.
+void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1,
+                             uint8_t **dst_buf2);
 
 void av1_setup_build_prediction_by_above_pred(
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
@@ -371,12 +362,98 @@
 
 void av1_init_wedge_masks();
 
+static INLINE void av1_diffwtd_mask(uint8_t *mask, int which_inverse,
+                                    int mask_base, const uint8_t *src0,
+                                    int src0_stride, const uint8_t *src1,
+                                    int src1_stride, int h, int w) {
+  int i, j, m, diff;
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      diff =
+          abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
+      m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
+      mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+    }
+  }
+}
+
+static INLINE void av1_diffwtd_mask_highbd(uint8_t *mask, int which_inverse,
+                                           int mask_base, const uint16_t *src0,
+                                           int src0_stride,
+                                           const uint16_t *src1,
+                                           int src1_stride, int h, int w,
+                                           const unsigned int bd) {
+  assert(bd >= 8);
+  if (bd == 8) {
+    if (which_inverse) {
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+          unsigned int m = negative_to_zero(mask_base + diff);
+          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+          mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+        }
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += w;
+      }
+    } else {
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+          unsigned int m = negative_to_zero(mask_base + diff);
+          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+          mask[j] = m;
+        }
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += w;
+      }
+    }
+  } else {
+    const unsigned int bd_shift = bd - 8;
+    if (which_inverse) {
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int diff =
+              (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+          unsigned int m = negative_to_zero(mask_base + diff);
+          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+          mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+        }
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += w;
+      }
+    } else {
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int diff =
+              (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+          unsigned int m = negative_to_zero(mask_base + diff);
+          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+          mask[j] = m;
+        }
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += w;
+      }
+    }
+  }
+}
+
 static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index,
                                                           int8_t wedge_sign,
                                                           BLOCK_SIZE sb_type) {
   return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
 }
 
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+                                     const MB_MODE_INFO *mbmi, int order_idx,
+                                     int *fwd_offset, int *bck_offset,
+                                     int *use_dist_wtd_comp_avg,
+                                     int is_compound);
+
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
 
@@ -396,11 +473,6 @@
                             const uint8_t *inter_pred, int inter_stride,
                             const uint8_t *intra_pred, int intra_stride);
 
-void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
-                                     const MB_MODE_INFO *mbmi, int order_idx,
-                                     int *fwd_offset, int *bck_offset,
-                                     int *use_dist_wtd_comp_avg,
-                                     int is_compound);
 int av1_allow_warp(const MB_MODE_INFO *const mbmi,
                    const WarpTypesAllowed *const warp_types,
                    const WarpedMotionParams *const gm_params,
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index efb3794..90ad8b7 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -35,6 +35,7 @@
 #define INTRA_EDGE_FILT 3
 #define INTRA_EDGE_TAPS 5
 #define MAX_UPSAMPLE_SZ 16
+#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
 
 static const uint8_t extend_modes[INTRA_MODES] = {
   NEED_ABOVE | NEED_LEFT,                   // DC
@@ -463,6 +464,17 @@
 static void init_intra_predictors_internal(void) {
   assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
 
+#if CONFIG_REALTIME_ONLY
+#define INIT_RECTANGULAR(p, type)             \
+  p[TX_4X8] = aom_##type##_predictor_4x8;     \
+  p[TX_8X4] = aom_##type##_predictor_8x4;     \
+  p[TX_8X16] = aom_##type##_predictor_8x16;   \
+  p[TX_16X8] = aom_##type##_predictor_16x8;   \
+  p[TX_16X32] = aom_##type##_predictor_16x32; \
+  p[TX_32X16] = aom_##type##_predictor_32x16; \
+  p[TX_32X64] = aom_##type##_predictor_32x64; \
+  p[TX_64X32] = aom_##type##_predictor_64x32;
+#else
 #define INIT_RECTANGULAR(p, type)             \
   p[TX_4X8] = aom_##type##_predictor_4x8;     \
   p[TX_8X4] = aom_##type##_predictor_8x4;     \
@@ -478,6 +490,7 @@
   p[TX_32X8] = aom_##type##_predictor_32x8;   \
   p[TX_16X64] = aom_##type##_predictor_16x64; \
   p[TX_64X16] = aom_##type##_predictor_64x16;
+#endif
 
 #define INIT_NO_4X4(p, type)                  \
   p[TX_8X8] = aom_##type##_predictor_8x8;     \
@@ -854,10 +867,6 @@
 
   assert(bw <= 32 && bh <= 32);
 
-  // The initialization is just for silencing Jenkins static analysis warnings
-  for (r = 0; r < bh + 1; ++r)
-    memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
-
   for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
   memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
 
@@ -905,10 +914,6 @@
 
   assert(bw <= 32 && bh <= 32);
 
-  // The initialization is just for silencing Jenkins static analysis warnings
-  for (r = 0; r < bh + 1; ++r)
-    memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
-
   for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
   memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
 
@@ -1142,8 +1147,8 @@
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 32]);
-  DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 32]);
+  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
   uint16_t *const above_row = above_data + 16;
   uint16_t *const left_col = left_data + 16;
   const int txwpx = tx_size_wide[tx_size];
@@ -1157,6 +1162,12 @@
   const int is_dr_mode = av1_is_directional_mode(mode);
   const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
   int base = 128 << (xd->bd - 8);
+  // The left_data, above_data buffers must be zeroed to fix some intermittent
+  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
+  // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
+  // seen to be the potential reason for this issue.
+  aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+  aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
 
   // The default values if ref pixels are not available:
   // base   base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
@@ -1211,12 +1222,8 @@
       }
       if (i < num_left_pixels_needed)
         aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
-    } else {
-      if (n_top_px > 0) {
-        aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
-      } else {
-        aom_memset16(left_col, base + 1, num_left_pixels_needed);
-      }
+    } else if (n_top_px > 0) {
+      aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
     }
   }
 
@@ -1238,12 +1245,8 @@
       if (i < num_top_pixels_needed)
         aom_memset16(&above_row[i], above_row[i - 1],
                      num_top_pixels_needed - i);
-    } else {
-      if (n_left_px > 0) {
-        aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
-      } else {
-        aom_memset16(above_row, base - 1, num_top_pixels_needed);
-      }
+    } else if (n_left_px > 0) {
+      aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
     }
   }
 
@@ -1330,8 +1333,8 @@
   int i;
   const uint8_t *above_ref = ref - ref_stride;
   const uint8_t *left_ref = ref - 1;
-  DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]);
-  DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]);
+  DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
   uint8_t *const above_row = above_data + 16;
   uint8_t *const left_col = left_data + 16;
   const int txwpx = tx_size_wide[tx_size];
@@ -1342,6 +1345,12 @@
   int p_angle = 0;
   const int is_dr_mode = av1_is_directional_mode(mode);
   const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+  // The left_data, above_data buffers must be zeroed to fix some intermittent
+  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
+  // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to
+  // be the potential reason for this issue.
+  memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
+  memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
 
   // The default values if ref pixels are not available:
   // 128 127 127 .. 127 127 127 127 127 127
@@ -1386,10 +1395,7 @@
     int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
     if (use_filter_intra) need_bottom = 0;
     if (is_dr_mode) need_bottom = p_angle > 180;
-    // the avx2 dr_prediction_z2 may read at most 3 extra bytes,
-    // due to the avx2 mask load is with dword granularity.
-    // so we initialize 3 extra bytes to silence valgrind complain.
-    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3);
+    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
@@ -1400,12 +1406,8 @@
       }
       if (i < num_left_pixels_needed)
         memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
-    } else {
-      if (n_top_px > 0) {
-        memset(left_col, above_ref[0], num_left_pixels_needed);
-      } else {
-        memset(left_col, 129, num_left_pixels_needed);
-      }
+    } else if (n_top_px > 0) {
+      memset(left_col, above_ref[0], num_left_pixels_needed);
     }
   }
 
@@ -1425,12 +1427,8 @@
       }
       if (i < num_top_pixels_needed)
         memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
-    } else {
-      if (n_left_px > 0) {
-        memset(above_row, left_ref[0], num_top_pixels_needed);
-      } else {
-        memset(above_row, 127, num_top_pixels_needed);
-      }
+    } else if (n_left_px > 0) {
+      memset(above_row, left_ref[0], num_top_pixels_needed);
     }
   }
 
@@ -1602,17 +1600,13 @@
       col_off || (ss_x ? xd->chroma_left_available : xd->left_available);
   const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
   const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
-  const int xr_chr_offset = 0;
-  const int yd_chr_offset = 0;
 
   // Distance between the right edge of this prediction block to
   // the frame right edge
-  const int xr =
-      (xd->mb_to_right_edge >> (3 + ss_x)) + (wpx - x - txwpx) - xr_chr_offset;
+  const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx;
   // Distance between the bottom edge of this prediction block to
   // the frame bottom edge
-  const int yd =
-      (xd->mb_to_bottom_edge >> (3 + ss_y)) + (hpx - y - txhpx) - yd_chr_offset;
+  const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx;
   const int right_available =
       mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end;
   const int bottom_available =
@@ -1620,7 +1614,7 @@
 
   const PARTITION_TYPE partition = mbmi->partition;
 
-  BLOCK_SIZE bsize = mbmi->sb_type;
+  BLOCK_SIZE bsize = mbmi->bsize;
   // force 4x4 chroma component block size.
   if (ss_x || ss_y) {
     bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
@@ -1674,8 +1668,8 @@
   if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
 #if CONFIG_DEBUG
     assert(is_cfl_allowed(xd));
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(
-        mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     (void)plane_bsize;
     assert(plane_bsize < BLOCK_SIZES_ALL);
     if (!xd->lossless[mbmi->segment_id]) {
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index 9d20356..333802f 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -69,7 +69,7 @@
                                            const MB_MODE_INFO *mbmi) {
   return mbmi->mode == DC_PRED &&
          mbmi->palette_mode_info.palette_size[0] == 0 &&
-         av1_filter_intra_allowed_bsize(cm, mbmi->sb_type);
+         av1_filter_intra_allowed_bsize(cm, mbmi->bsize);
 }
 
 extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 98f28f7..725b94f 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -24,6 +24,7 @@
 #include "av1/common/common.h"
 #include "av1/common/resize.h"
 
+#include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
 
 // Filters for interpolation (0.5-band) - note this also filters integer pels.
@@ -1188,9 +1189,48 @@
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst, int bd,
-                                 const int num_planes) {
+void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst,
+                                   const InterpFilter filter,
+                                   const int phase_scaler,
+                                   const int num_planes) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+                                   src->v_buffer };
+  const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+  uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+  const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+  assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH ||
+         filter == EIGHTTAP_REGULAR);
+  const InterpKernel *const kernel =
+      filter == BILINEAR ? av1_bilinear_filters : av1_sub_pel_filters_8smooth;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int factor = (i == 0 || i == 3 ? 1 : 2);
+    const int src_stride = src_strides[i];
+    const int dst_stride = dst_strides[i];
+    for (int y = 0; y < dst_h; y += 16) {
+      const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+      for (int x = 0; x < dst_w; x += 16) {
+        const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
+        const uint8_t *src_ptr = srcs[i] +
+                                 (y / factor) * src_h / dst_h * src_stride +
+                                 (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+        aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                      x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                      16 * src_h / dst_h, 16 / factor, 16 / factor);
+      }
+    }
+  }
+}
+
+void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                              YV12_BUFFER_CONFIG *dst, int bd,
+                                              const int num_planes) {
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
 
   // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
@@ -1298,14 +1338,36 @@
   aom_extend_frame_borders(dst, num_planes);
 }
 
-YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
-                                          YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled) {
-  const int num_planes = av1_num_planes(cm);
-  if (cm->width != unscaled->y_crop_width ||
-      cm->height != unscaled->y_crop_height) {
-    av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
-                                num_planes);
+YV12_BUFFER_CONFIG *av1_scale_if_required(
+    AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    const InterpFilter filter, const int phase, const bool use_optimized_scaler,
+    const bool for_psnr) {
+  // If scaling is performed for the sole purpose of calculating PSNR, then our
+  // target dimensions are superres upscaled width/height. Otherwise our target
+  // dimensions are coded width/height.
+  const bool scaling_required =
+      for_psnr ? (cm->superres_upscaled_width != unscaled->y_crop_width ||
+                  cm->superres_upscaled_height != unscaled->y_crop_height)
+               : (cm->width != unscaled->y_crop_width ||
+                  cm->height != unscaled->y_crop_height);
+
+  if (scaling_required) {
+    const int num_planes = av1_num_planes(cm);
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (use_optimized_scaler && cm->seq_params.bit_depth == AOM_BITS_8) {
+      av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
+    } else {
+      av1_resize_and_extend_frame_nonnormative(
+          unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes);
+    }
+#else
+    if (use_optimized_scaler) {
+      av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
+    } else {
+      av1_resize_and_extend_frame_nonnormative(
+          unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes);
+    }
+#endif
     return scaled;
   } else {
     return unscaled;
diff --git a/av1/common/resize.h b/av1/common/resize.h
index 8ee859e..b08de80 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -63,9 +63,6 @@
                                 uint8_t *oy, int oy_stride, uint8_t *ou,
                                 uint8_t *ov, int ouv_stride, int oheight,
                                 int owidth, int bd);
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst, int bd,
-                                 const int num_planes);
 
 void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride,
@@ -74,9 +71,14 @@
                                             const YV12_BUFFER_CONFIG *src,
                                             YV12_BUFFER_CONFIG *dst);
 
-YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
-                                          YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled);
+YV12_BUFFER_CONFIG *av1_scale_if_required(
+    AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    const InterpFilter filter, const int phase, const bool use_optimized_scaler,
+    const bool for_psnr);
+
+void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                              YV12_BUFFER_CONFIG *dst, int bd,
+                                              const int num_planes);
 
 // Calculates the scaled dimensions from the given original dimensions and the
 // resize scale denominator.
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 3b80dd5..338c9e7c 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -22,6 +22,10 @@
 extern "C" {
 #endif
 
+/*! @file */
+
+/*!\cond */
+
 // Border for Loop restoration buffer
 #define AOM_RESTORATION_FRAME_BORDER 32
 #define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
@@ -183,13 +187,28 @@
   int r[2];  // radii
   int s[2];  // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
 } sgr_params_type;
+/*!\endcond */
 
+/*!\brief Parameters related to Restoration Unit Info */
 typedef struct {
+  /*!
+   * restoration type
+   */
   RestorationType restoration_type;
+
+  /*!
+   * Wiener filter parameters if restoration_type indicates Wiener
+   */
   WienerInfo wiener_info;
+
+  /*!
+   * Sgrproj filter parameters if restoration_type indicates Sgrproj
+   */
   SgrprojInfo sgrproj_info;
 } RestorationUnitInfo;
 
+/*!\cond */
+
 // A restoration line buffer needs space for two lines plus a horizontal filter
 // margin of RESTORATION_EXTRA_HORZ on each side.
 #define RESTORATION_LINEBUFFER_WIDTH \
@@ -207,33 +226,89 @@
   uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
   uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
 } RestorationLineBuffers;
+/*!\endcond */
 
+/*!\brief Parameters related to Restoration Stripe boundaries */
 typedef struct {
+  /*!
+   * stripe boundary above
+   */
   uint8_t *stripe_boundary_above;
+
+  /*!
+   * stripe boundary below
+   */
   uint8_t *stripe_boundary_below;
+
+  /*!
+   * strides for stripe boundaries above and below
+   */
   int stripe_boundary_stride;
+
+  /*!
+   * size of stripe boundaries above and below
+   */
   int stripe_boundary_size;
 } RestorationStripeBoundaries;
 
+/*!\brief Parameters related to Restoration Info */
 typedef struct {
+  /*!
+   * Restoration type for frame
+   */
   RestorationType frame_restoration_type;
+
+  /*!
+   * Restoration unit size
+   */
   int restoration_unit_size;
 
-  // Fields below here are allocated and initialised by
-  // av1_alloc_restoration_struct. (horz_)units_per_tile give the number of
-  // restoration units in (one row of) the largest tile in the frame. The data
-  // in unit_info is laid out with units_per_tile entries for each tile, which
-  // have stride horz_units_per_tile.
-  //
-  // Even if there are tiles of different sizes, the data in unit_info is laid
-  // out as if all tiles are of full size.
+  /**
+   * \name Fields allocated and initialised by av1_alloc_restoration_struct.
+   * (horz_)units_per_tile give the number of restoration units in
+   * (one row of) the largest tile in the frame.
+   */
+  /**@{*/
+  /*!
+   * Number of units per tile for the largest tile in the frame
+   */
   int units_per_tile;
-  int vert_units_per_tile, horz_units_per_tile;
+
+  /*!
+   * Number of vertical units per tile
+   */
+  int vert_units_per_tile;
+
+  /*!
+   * Number of horizontal units per tile for the largest tile in the frame
+   */
+  int horz_units_per_tile;
+  /**@}*/
+
+  /*!
+   * List of info for units in tile.
+   * The data in unit_info is laid out with units_per_tile entries for each
+   * tile, which have stride horz_units_per_tile.
+   * Even if there are tiles of different sizes, the data in unit_info is
+   * laid out as if all tiles are of full size.
+   */
   RestorationUnitInfo *unit_info;
+
+  /*!
+   * Restoration Stripe boundary info
+   */
   RestorationStripeBoundaries boundaries;
+
+  /*!
+   * Whether optimized lr can be used for speed.
+   * That includes cases of no cdef and no superres, or if fast trial runs
+   * are used on the encoder side.
+   */
   int optimized_lr;
 } RestorationInfo;
 
+/*!\cond */
+
 static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
   sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
   sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;
@@ -291,25 +366,39 @@
                       int border_horz, int border_vert, int highbd);
 void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
 
-// Filter a single loop restoration unit.
-//
-// limits is the limits of the unit. rui gives the mode to use for this unit
-// and its coefficients. If striped loop restoration is enabled, rsb contains
-// deblocked pixels to use for stripe boundaries; rlbs is just some space to
-// use as a scratch buffer. tile_rect gives the limits of the tile containing
-// this unit. tile_stripe0 is the index of the first stripe in this tile.
-//
-// ss_x and ss_y are flags which should be 1 if this is a plane with
-// horizontal/vertical subsampling, respectively. highbd is a flag which should
-// be 1 in high bit depth mode, in which case bit_depth is the bit depth.
-//
-// data8 is the frame data (pointing at the top-left corner of the frame, not
-// the restoration unit) and stride is its stride. dst8 is the buffer where the
-// results will be written and has stride dst_stride. Like data8, dst8 should
-// point at the top-left corner of the frame.
-//
-// Finally tmpbuf is a scratch buffer used by the sgrproj filter which should
-// be at least SGRPROJ_TMPBUF_SIZE big.
+/*!\endcond */
+
+/*!\brief Function for applying loop restoration filter to a single unit.
+ *
+ * \ingroup in_loop_restoration
+ * This function applies the loop restoration filter to a single
+ * loop restoration unit.
+ *
+ * \param[in]  limits        Limits of the unit
+ * \param[in]  rui           The parameters to use for this unit and its
+ *                           coefficients
+ * \param[in]  rsb           Deblocked pixels to use for stripe boundaries
+ * \param[in]  rlbs          Space to use as a scratch buffer
+ * \param[in]  tile_rect     Limits of the tile containing this unit
+ * \param[in]  tile_stripe0  Index of the first stripe in this tile
+ * \param[in]  ss_x          Horizontal subsampling for plane
+ * \param[in]  ss_y          Vertical subsampling for plane
+ * \param[in]  highbd        Whether high bitdepth pipeline is used
+ * \param[in]  bit_depth     Bit-depth of the video
+ * \param[in]  data8         Frame data (pointing at the top-left corner of
+ *                           the frame, not the restoration unit).
+ * \param[in]  stride        Stride of \c data8
+ * \param[out] dst8          Buffer where the results will be written. Like
+ *                           \c data8, \c dst8 should point at the top-left
+ *                           corner of the frame
+ * \param[in]  dst_stride    Stride of \c dst8
+ * \param[in]  tmpbuf        Scratch buffer used by the sgrproj filter which
+ *                           should be at least SGRPROJ_TMPBUF_SIZE big.
+ * \param[in]  optimized_lr  Whether to use fast optimized Loop Restoration
+ *
+ * \return Nothing is returned. Instead, the filtered unit is output in
+ * \c dst8 at the proper restoration unit offset.
+ */
 void av1_loop_restoration_filter_unit(
     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
@@ -317,9 +406,24 @@
     int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
     int dst_stride, int32_t *tmpbuf, int optimized_lr);
 
+/*!\brief Function for applying loop restoration filter to a frame
+ *
+ * \ingroup in_loop_restoration
+ * This function applies the loop restoration filter to a frame.
+ *
+ * \param[in, out]  frame         Compressed frame buffer
+ * \param[in, out]  cm            Pointer to top level common structure
+ * \param[in]       optimized_lr  Whether to use fast optimized Loop Restoration
+ * \param[in]       lr_ctxt       Loop restoration context
+ *
+ * \return Nothing is returned. Instead, the filtered frame is output in
+ * \c frame.
+ */
 void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                        struct AV1Common *cm, int optimized_lr,
                                        void *lr_ctxt);
+/*!\cond */
+
 void av1_loop_restoration_precal();
 
 typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
@@ -373,6 +477,9 @@
 void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
 void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                              const int sb_cols, int plane);
+
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/common/scale.c b/av1/common/scale.c
index 3b14c0a..5bcd8df 100644
--- a/av1/common/scale.c
+++ b/av1/common/scale.c
@@ -84,45 +84,4 @@
     sf->scale_value_x = unscaled_value;
     sf->scale_value_y = unscaled_value;
   }
-
-  // AV1 convolve functions
-  // Special case convolve functions should produce the same result as
-  // av1_convolve_2d.
-  // subpel_x_qn == 0 && subpel_y_qn == 0
-  sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
-  // subpel_x_qn == 0
-  sf->convolve[0][1][0] = av1_convolve_y_sr;
-  // subpel_y_qn == 0
-  sf->convolve[1][0][0] = av1_convolve_x_sr;
-  // subpel_x_qn != 0 && subpel_y_qn != 0
-  sf->convolve[1][1][0] = av1_convolve_2d_sr;
-  // subpel_x_qn == 0 && subpel_y_qn == 0
-  sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy;
-  // subpel_x_qn == 0
-  sf->convolve[0][1][1] = av1_dist_wtd_convolve_y;
-  // subpel_y_qn == 0
-  sf->convolve[1][0][1] = av1_dist_wtd_convolve_x;
-  // subpel_x_qn != 0 && subpel_y_qn != 0
-  sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d;
-#if CONFIG_AV1_HIGHBITDEPTH
-  // AV1 High BD convolve functions
-  // Special case convolve functions should produce the same result as
-  // av1_highbd_convolve_2d.
-  // subpel_x_qn == 0 && subpel_y_qn == 0
-  sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr;
-  // subpel_x_qn == 0
-  sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr;
-  // subpel_y_qn == 0
-  sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr;
-  // subpel_x_qn != 0 && subpel_y_qn != 0
-  sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
-  // subpel_x_qn == 0 && subpel_y_qn == 0
-  sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy;
-  // subpel_x_qn == 0
-  sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y;
-  // subpel_y_qn == 0
-  sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x;
-  // subpel_x_qn != 0 && subpel_y_qn != 0
-  sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d;
-#endif
 }
diff --git a/av1/common/scale.h b/av1/common/scale.h
index 16b40bd..fd30416 100644
--- a/av1/common/scale.h
+++ b/av1/common/scale.h
@@ -33,10 +33,6 @@
 
   int (*scale_value_x)(int val, const struct scale_factors *sf);
   int (*scale_value_y)(int val, const struct scale_factors *sf);
-
-  // convolve_fn_ptr[subpel_x != 0][subpel_y != 0][is_compound]
-  aom_convolve_fn_t convolve[2][2][2];
-  aom_highbd_convolve_fn_t highbd_convolve[2][2][2];
 };
 
 MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index f3c8795..914256f 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -33,6 +33,7 @@
     return 8;
 }
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE int get_lr_sync_range(int width) {
 #if 0
   // nsync numbers are picked by testing. For example, for 4k
@@ -50,6 +51,7 @@
   return 1;
 #endif
 }
+#endif
 
 // Allocate memory for lf row synchronization
 static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
@@ -528,6 +530,7 @@
 #endif
 }
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
 #if CONFIG_MULTITHREAD
   AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
@@ -928,3 +931,4 @@
   foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync,
                                  cm);
 }
+#endif
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index 7397f1c..97b8abc 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -108,12 +108,15 @@
 #endif
                               AVxWorker *workers, int num_workers,
                               AV1LfSync *lf_sync);
+
+#if !CONFIG_REALTIME_ONLY
 void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                                           struct AV1Common *cm,
                                           int optimized_lr, AVxWorker *workers,
                                           int num_workers, AV1LrSync *lr_sync,
                                           void *lr_ctxt);
 void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
index e19575d..211f258 100644
--- a/av1/common/x86/convolve_2d_avx2.c
+++ b/av1/common/x86/convolve_2d_avx2.c
@@ -27,8 +27,7 @@
                              const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
   const int bd = 8;
-  int im_stride = 8;
-  int i, is_horiz_4tap = 0, is_vert_4tap = 0;
+  int im_stride = 8, i;
   DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   const int bits =
       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
@@ -57,261 +56,57 @@
   prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
   prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
 
-  // Condition for checking valid horz_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0)))
-    is_horiz_4tap = 1;
+  const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
-  // Condition for checking valid vert_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0)))
-    is_vert_4tap = 1;
+  int horiz_tap = SUBPEL_TAPS;
+  int vert_tap = SUBPEL_TAPS;
 
-  // horz_filt as 4 tap and vert_filt as 8 tap
-  if (is_horiz_4tap) {
-    int im_h = h + filter_params_y->taps - 1;
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const int fo_horiz = 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+  if (!(filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]))
+    horiz_tap = 4;
+  else if (!(filter_x[0] | filter_x[7]))
+    horiz_tap = 6;
 
-    // horz-filter
-    for (int j = 0; j < w; j += 8) {
-      for (i = 0; i < (im_h - 2); i += 2) {
-        __m256i data = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+  if (!(filter_y[0] | filter_y[1] | filter_y[6] | filter_y[7]))
+    vert_tap = 4;
+  else if (!(filter_y[0] | filter_y[7]))
+    vert_tap = 6;
 
-        // Load the next line
-        data = _mm256_inserti128_si256(
-            data,
-            _mm_loadu_si128(
-                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
-            1);
-        __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt);
+  if (horiz_tap == 6)
+    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+  else
+    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
 
-        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
-                               round_shift_h);
-        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
-      }
+  if (vert_tap == 6)
+    prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
+  else
+    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
 
-      __m256i data_1 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+  int im_h = h + vert_tap - 1;
+  const int fo_vert = vert_tap / 2 - 1;
+  const int fo_horiz = horiz_tap / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
-      __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt);
-      res =
-          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
 
-      // vert filter
-      CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+  for (int j = 0; j < w; j += 8) {
+    if (horiz_tap == 4) {
+      CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
+    } else if (horiz_tap == 6) {
+      CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
+    } else {
+      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
     }
-  } else if (is_vert_4tap) {
-    int im_h = h + 3;
-    const int fo_vert = 1;
-    const int fo_horiz = filter_params_x->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
-    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-    for (int j = 0; j < w; j += 8) {
-      // horz_filter
-      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
-      // vert_filter
-      __m256i s[6];
-      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
-      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
-      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
-      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
-
-      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
-      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
-      s[3] = _mm256_unpackhi_epi16(src_0, src_1);
-      s[4] = _mm256_unpackhi_epi16(src_2, src_3);
-
-      for (i = 0; i < h; i += 2) {
-        const int16_t *data = &im_block[i * im_stride];
-
-        const __m256i s4 =
-            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
-        const __m256i s5 =
-            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
-
-        s[2] = _mm256_unpacklo_epi16(s4, s5);
-        s[5] = _mm256_unpackhi_epi16(s4, s5);
-
-        __m256i res_a = convolve_4tap(s, coeffs_v + 1);
-        __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);
-
-        // Combine V round and 2F-H-V round into a single rounding
-        res_a =
-            _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);
-        res_b =
-            _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);
-
-        const __m256i res_a_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
-        const __m256i res_b_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
-
-        /* rounding code */
-        // 16 bit conversion
-        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
-        // 8 bit conversion and saturation to uint8
-        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
-
-        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-
-        // Store values into the destination buffer
-        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
-        if (w - j > 4) {
-          _mm_storel_epi64(p_0, res_0);
-          _mm_storel_epi64(p_1, res_1);
-        } else if (w == 4) {
-          xx_storel_32(p_0, res_0);
-          xx_storel_32(p_1, res_1);
-        } else {
-          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
-        }
-
-        s[0] = s[1];
-        s[1] = s[2];
-        s[3] = s[4];
-        s[4] = s[5];
-      }
+    if (vert_tap == 4) {
+      CONVOLVE_SR_VERTICAL_FILTER_4TAP
+    } else if (vert_tap == 6) {
+      CONVOLVE_SR_VERTICAL_FILTER_6TAP
+    } else {
+      CONVOLVE_SR_VERTICAL_FILTER_8TAP
     }
-  } else {
-    int j;
-    int im_h = h + filter_params_y->taps - 1;
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const int fo_horiz = filter_params_x->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
-    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-    for (j = 0; j < w; j += 8) {
-      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
-
-      CONVOLVE_SR_VERTICAL_FILTER_8TAP;
-    }
-  }
-}
-
-static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
-  __m256i s[4];
-  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
-  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
-  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
-  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
-  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
-  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
-  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
-  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
-}
-
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-
-  if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
-    assert(!(dst_stride % 16));
-  }
-
-  if (w == 2) {
-    do {
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 4) {
-    do {
-      memmove(dst, src, 4 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 4 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 8) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      _mm_storel_epi64((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 16) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 32) {
-    do {
-      __m256i s[2];
-      s[0] = _mm256_loadu_si256((__m256i *)src);
-      src += src_stride;
-      s[1] = _mm256_loadu_si256((__m256i *)src);
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, s[0]);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 64) {
-    do {
-      __m256i s[4];
-      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
-      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
-      src += src_stride;
-      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
-      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
-      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
-      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else {
-    do {
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
   }
 }
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
index 5376ea7..1db9853 100644
--- a/av1/common/x86/convolve_2d_sse2.c
+++ b/av1/common/x86/convolve_2d_sse2.c
@@ -216,156 +216,12 @@
   }
 }
 
-static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
-  __m128i s[8];
-  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
-  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
-  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
-  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
-  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
-  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
-  _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
-  _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
-  _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
-  _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
-  _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
-  _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
-  _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
-  _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
-}
-
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-
-  if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
-    assert(!(dst_stride % 16));
-  }
-
-  if (w == 2) {
-    do {
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 4) {
-    do {
-      memmove(dst, src, 4 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 4 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 8) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      _mm_storel_epi64((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 16) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 32) {
-    do {
-      __m128i s[4];
-      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-      src += src_stride;
-      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-      src += src_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
-      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
-      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 64) {
-    do {
-      __m128i s[8];
-      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
-      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
-      src += src_stride;
-      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
-      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
-      src += src_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
-      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
-      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
-      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
-      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
-      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
-      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else {
-    do {
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  }
-}
-
-void av1_dist_wtd_convolve_2d_copy_sse2(
-    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+                                        uint8_t *dst0, int dst_stride0, int w,
+                                        int h, ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 1d5bc6f..1a9bf5e 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -19,11 +19,9 @@
 
 void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
-                            ConvolveParams *conv_params) {
-  int i, j, is_vert_4tap = 0;
+                            const int subpel_y_qn) {
+  int i, j, vert_tap = SUBPEL_TAPS;
   // right shift is F-1 because we are already dividing
   // filter co-efficients by 2
   const int right_shift_bits = (FILTER_BITS - 1);
@@ -31,24 +29,25 @@
   const __m256i right_shift_const =
       _mm256_set1_epi16((1 << right_shift_bits) >> 1);
 
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
   __m256i coeffs[4], s[8];
   __m128i d[6];
 
-  prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
-
   // Condition for checking valid vert_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
-    is_vert_4tap = 1;
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
+    vert_tap = 4;
+  } else if (!(filter[0] | filter[7])) {
+    vert_tap = 6;
+  }
+
+  if (vert_tap == 6)
+    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
+  else
+    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
 
   // vert_filt as 4 tap
-  if (is_vert_4tap) {
+  if (vert_tap == 4) {
     const int fo_vert = 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride;
     for (j = 0; j < w; j += 16) {
@@ -142,6 +141,108 @@
         s[4] = s[5];
       }
     }
+  } else if (vert_tap == 6) {
+    const int fo_vert = vert_tap / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+    for (j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src6;
+
+      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
+      const __m256i src_34a =
+          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        const __m256i src_45a = _mm256_permute2x128_si256(
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+        const __m256i src_56a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+            src6, 0x20);
+
+        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+
+        /* rounding code */
+        // shift by F - 1
+        const __m256i res_16b_lo = _mm256_sra_epi16(
+            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 8) {
+          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_hi = _mm256_sra_epi16(
+              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_a);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else if (w - j > 2) {
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+          } else {
+            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+            __m128i *const p_1 =
+                (__m128i *)&dst[i * dst_stride + j + dst_stride];
+            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+          }
+        }
+        s[0] = s[1];
+        s[1] = s[2];
+        s[3] = s[4];
+        s[4] = s[5];
+      }
+    }
   } else {
     const int fo_vert = filter_params_y->taps / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride;
@@ -263,8 +364,7 @@
 void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
+                            const int subpel_x_qn,
                             ConvolveParams *conv_params) {
   const int bits = FILTER_BITS - conv_params->round_0;
 
@@ -273,9 +373,7 @@
   const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
   const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(bits);
-  int i, is_horiz_4tap = 0;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
+  int i, horiz_tap = SUBPEL_TAPS;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -286,14 +384,21 @@
   filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
   filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
+    horiz_tap = 4;
+  } else if (!(filter[0] | filter[7])) {
+    horiz_tap = 6;
+  }
 
-  // Condition for checking valid horz_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
-    is_horiz_4tap = 1;
+  if (horiz_tap == 6)
+    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
+  else
+    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
 
   // horz_filt as 4 tap
-  if (is_horiz_4tap) {
+  if (horiz_tap == 4) {
     const int fo_horiz = 1;
     const uint8_t *const src_ptr = src - fo_horiz;
     if (w <= 8) {
@@ -363,6 +468,78 @@
         }
       }
     }
+  } else if (horiz_tap == 6) {
+    const int fo_horiz = horiz_tap / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    if (w <= 8) {
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+            _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+            0x20);
+
+        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                   round_0_shift);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                   round_shift);
+
+        /* rounding code */
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+        if (w > 4) {
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+        } else if (w > 2) {
+          xx_storel_32(&dst[i * dst_stride], res_0);
+          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+        } else {
+          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+        }
+      }
+    } else {
+      for (i = 0; i < h; ++i) {
+        for (int j = 0; j < w; j += 16) {
+          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+          // 18 19 20 21 22 23
+          const __m256i data = _mm256_inserti128_si256(
+              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+              1);
+
+          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
+
+          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                     round_0_shift);
+
+          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                     round_shift);
+
+          /* rounding code */
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+          // Store values into the destination buffer
+          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+          __m128i res = _mm256_castsi256_si128(res_8b);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        }
+      }
+    }
   } else {
     const int fo_horiz = filter_params_x->taps / 2 - 1;
     const uint8_t *const src_ptr = src - fo_horiz;
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 4323ac4..7273cde 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -77,24 +77,14 @@
 
 void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
-                            ConvolveParams *conv_params) {
+                            const int subpel_y_qn) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint8_t *src_ptr = src - fo_vert * src_stride;
   const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
   __m128i coeffs[4];
 
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
   prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
 
   if (w <= 4) {
@@ -239,8 +229,7 @@
 void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
+                            const int subpel_x_qn,
                             ConvolveParams *conv_params) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint8_t *src_ptr = src - fo_horiz;
@@ -252,9 +241,6 @@
   const __m128i round_shift = _mm_cvtsi32_si128(bits);
   __m128i coeffs[4];
 
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
diff --git a/av1/common/x86/filterintra_sse4.c b/av1/common/x86/filterintra_sse4.c
index c11edc1..d05bb0e 100644
--- a/av1/common/x86/filterintra_sse4.c
+++ b/av1/common/x86/filterintra_sse4.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <assert.h>
 #include <smmintrin.h>
+#include <string.h>
 
 #include "config/av1_rtcd.h"
 
@@ -17,59 +19,332 @@
 #include "av1/common/enums.h"
 #include "av1/common/reconintra.h"
 
+//------------------------------------------------------------------------------
+// filter_intra_predictor_sse4_1
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+#define DUPLICATE_FIRST_HALF 0x44
+
+// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th
+// at zero to preserve the sum.
+static INLINE void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride,
+                                     const __m128i *pixels,
+                                     const __m128i *taps_0_1,
+                                     const __m128i *taps_2_3,
+                                     const __m128i *taps_4_5,
+                                     const __m128i *taps_6_7) {
+  const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1);
+  const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3);
+  // |output_half| contains 8 partial sums.
+  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+  __m128i output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row0 =
+      _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
+                       /* arbitrary pack arg */ output);
+  xx_storel_32(dst, output_row0);
+  const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5);
+  const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7);
+  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+  output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row1 =
+      _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
+                       /* arbitrary pack arg */ output);
+  xx_storel_32(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because xx_loadl_64 goes out
+// of bounds and every block involves the left column. This implementation
+// loads TL from the top row for the first block, so it is not
+static INLINE void filter_4xh(uint8_t *dest, ptrdiff_t stride,
+                              const uint8_t *const top_ptr,
+                              const uint8_t *const left_ptr, int mode,
+                              const int height) {
+  const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
+  const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
+  const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
+  const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
+  __m128i top = xx_loadl_32(top_ptr - 1);
+  __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4);
+  __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr));
+  left = _mm_slli_si128(left, 5);
+
+  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+  // left[2], left[3], left[4], left[5], left[6], left[7]
+  pixels = _mm_or_si128(left, pixels);
+
+  // Duplicate first 8 bytes.
+  pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+  filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                    &taps_6_7);
+  dest += stride;  // Move to y = 1.
+  pixels = xx_loadl_32(dest);
+
+  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+  // left[0], left[1], ...
+  pixels = _mm_or_si128(left, pixels);
+
+  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+  // byte is an unused value, which shall be multiplied by 0 when we apply the
+  // filter.
+  const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  dest += stride;  // Move to y = 2.
+  filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                    &taps_6_7);
+  dest += stride;  // Move to y = 3.
+
+  // Compute the middle 8 rows before using common code for the final 4 rows.
+  // Because the common code below this block assumes that
+  if (height == 16) {
+    // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+    left = _mm_slli_si128(left, 1);
+    pixels = xx_loadl_32(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+    // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+    pixels = _mm_or_si128(left, pixels);
+
+    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
+    // last byte is an unused value, as above. The top-left was shifted to
+    // position nine to keep two empty spaces after the top pixels.
+    const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+    // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+    // the end.
+    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 4.
+
+    // First 4x2 in the if body.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+
+    // Clear all but final pixel in the first 8 of left column.
+    __m128i keep_top_left = _mm_srli_si128(left, 13);
+    dest += stride;  // Move to y = 5.
+    pixels = xx_loadl_32(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+    pixels = _mm_or_si128(left, pixels);
+    left = xx_loadl_64(left_ptr + 8);
+
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 6.
+
+    // Second 4x2 in the if body.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+
+    // Position TL value so we can use pixel_order1.
+    keep_top_left = _mm_slli_si128(keep_top_left, 6);
+    dest += stride;  // Move to y = 7.
+    pixels = xx_loadl_32(dest);
+    left = _mm_slli_si128(left, 7);
+    left = _mm_or_si128(left, keep_top_left);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 8.
+
+    // Third 4x2 in the if body.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+    dest += stride;  // Move to y = 9.
+
+    // Prepare final inputs.
+    pixels = xx_loadl_32(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 10.
+
+    // Fourth 4x2 in the if body.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+    dest += stride;  // Move to y = 11.
+  }
+
+  // In both the 8 and 16 case, we assume that the left vector has the next TL
+  // at position 8.
+  if (height > 4) {
+    // Erase prior left pixels by shifting TL to position 0.
+    left = _mm_srli_si128(left, 8);
+    left = _mm_slli_si128(left, 6);
+    pixels = xx_loadl_32(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 12 or 4.
+
+    // First of final two 4x2 blocks.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+    dest += stride;  // Move to y = 13 or 5.
+    pixels = xx_loadl_32(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 14 or 6.
+
+    // Last of final two 4x2 blocks.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+  }
+}
+
+static INLINE void filter_intra_predictor_sse4_1(void *const dest,
+                                                 ptrdiff_t stride,
+                                                 const void *const top_row,
+                                                 const void *const left_column,
+                                                 int mode, const int width,
+                                                 const int height) {
+  const uint8_t *const top_ptr = (const uint8_t *)top_row;
+  const uint8_t *const left_ptr = (const uint8_t *)left_column;
+  uint8_t *dst = (uint8_t *)dest;
+  if (width == 4) {
+    filter_4xh(dst, stride, top_ptr, left_ptr, mode, height);
+    return;
+  }
+
+  // There is one set of 7 taps for each of the 4x2 output pixels.
+  const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
+  const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
+  const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
+  const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
+
+  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+  // the end is an unused value, which shall be multiplied by 0 when we apply
+  // the filter.
+  const int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+  // Takes the "left section" and puts it right after p0-p4.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+  // byte is unused as above.
+  const int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+  // Shuffles the "top left" from the left section, to the front. Used when
+  // grabbing data from left_column and not top_row.
+  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+  // This first pass takes care of the cases where the top left pixel comes from
+  // top_row.
+  __m128i pixels = xx_loadl_64(top_ptr - 1);
+  __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8);
+  pixels = _mm_or_si128(pixels, left);
+
+  // Two sets of the same pixels to multiply with two sets of taps.
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                    &taps_6_7);
+  left = _mm_srli_si128(left, 1);
+
+  // Load
+  pixels = xx_loadl_32(dst + stride);
+
+  // Because of the above shift, this OR 'invades' the final of the first 8
+  // bytes of |pixels|. This is acceptable because the 8th filter tap is always
+  // a padded 0.
+  pixels = _mm_or_si128(pixels, left);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+  const ptrdiff_t stride2 = stride << 1;
+  const ptrdiff_t stride4 = stride << 2;
+  filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+                    &taps_4_5, &taps_6_7);
+  dst += 4;
+  for (int x = 3; x < width - 4; x += 4) {
+    pixels = xx_loadl_32(top_ptr + x);
+    pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+    filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+    pixels = xx_loadl_32(dst + stride - 1);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+    filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+                      &taps_4_5, &taps_6_7);
+    dst += 4;
+  }
+
+  // Now we handle heights that reference previous blocks rather than top_row.
+  for (int y = 4; y < height; y += 4) {
+    // Leftmost 4x4 block for this height.
+    dst -= width;
+    dst += stride4;
+
+    // Top Left is not available by offset in these leftmost blocks.
+    pixels = xx_loadl_32(dst - stride);
+    left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8);
+    left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+
+    // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+    left = _mm_srli_si128(left, 2);
+    pixels = xx_loadl_32(dst + stride);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+                      &taps_4_5, &taps_6_7);
+
+    dst += 4;
+
+    // Remaining 4x4 blocks for this height.
+    for (int x = 4; x < width; x += 4) {
+      pixels = xx_loadl_32(dst - stride - 1);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+      filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                        &taps_6_7);
+      pixels = xx_loadl_32(dst + stride - 1);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+      filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+                        &taps_4_5, &taps_6_7);
+      dst += 4;
+    }
+  }
+}
+
 void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        TX_SIZE tx_size, const uint8_t *above,
                                        const uint8_t *left, int mode) {
-  int r, c;
-  uint8_t buffer[33][33];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
-
-  assert(bw <= 32 && bh <= 32);
-
-  // The initialization is just for silencing Jenkins static analysis warnings
-  for (r = 0; r < bh + 1; ++r)
-    memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
-
-  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
-  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
-
-  const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]);
-  const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]);
-  const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]);
-  const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]);
-  const __m128i filter_intra_scale_bits =
-      _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS));
-
-  for (r = 1; r < bh + 1; r += 2) {
-    for (c = 1; c < bw + 1; c += 4) {
-      DECLARE_ALIGNED(16, uint8_t, p[8]);
-      memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
-      p[5] = buffer[r][c - 1];
-      p[6] = buffer[r + 1][c - 1];
-      p[7] = 0;
-      const __m128i p_b = xx_loadl_64(p);
-      const __m128i in = _mm_unpacklo_epi64(p_b, p_b);
-      const __m128i out_01 = _mm_maddubs_epi16(in, f1f0);
-      const __m128i out_23 = _mm_maddubs_epi16(in, f3f2);
-      const __m128i out_45 = _mm_maddubs_epi16(in, f5f4);
-      const __m128i out_67 = _mm_maddubs_epi16(in, f7f6);
-      const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23);
-      const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67);
-      const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567);
-      // Rounding
-      const __m128i round_w =
-          _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits);
-      const __m128i out_r = _mm_packus_epi16(round_w, round_w);
-      const __m128i out_r1 = _mm_srli_si128(out_r, 4);
-      // Storing
-      xx_storel_32(&buffer[r][c], out_r);
-      xx_storel_32(&buffer[r + 1][c], out_r1);
-    }
-  }
-
-  for (r = 0; r < bh; ++r) {
-    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
-    dst += stride;
-  }
+  filter_intra_predictor_sse4_1(dst, stride, above, left, mode, bw, bh);
 }
diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c
index 396aed0..12d1962 100644
--- a/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -185,142 +185,3 @@
     }
   }
 }
-
-static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
-  __m256i s[4];
-  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
-  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
-  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
-  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
-  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
-  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
-  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
-  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
-}
-
-static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
-  __m256i s[8];
-  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
-  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
-  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
-  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
-  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
-  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
-  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
-  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
-
-  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
-  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
-  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
-  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
-  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
-  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
-  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
-  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
-}
-
-void av1_highbd_convolve_2d_copy_sr_avx2(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-  (void)bd;
-
-  if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
-    assert(!(dst_stride % 16));
-  }
-
-  if (w == 2) {
-    do {
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 4) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      _mm_storel_epi64((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 8) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 16) {
-    do {
-      __m256i s[2];
-      s[0] = _mm256_loadu_si256((__m256i *)src);
-      src += src_stride;
-      s[1] = _mm256_loadu_si256((__m256i *)src);
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, s[0]);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 32) {
-    do {
-      __m256i s[4];
-      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
-      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
-      src += src_stride;
-      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
-      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
-      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
-      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 64) {
-    do {
-      copy_64(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_64(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else {
-    do {
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  }
-}
diff --git a/av1/common/x86/highbd_convolve_2d_sse2.c b/av1/common/x86/highbd_convolve_2d_sse2.c
deleted file mode 100644
index f758775..0000000
--- a/av1/common/x86/highbd_convolve_2d_sse2.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <emmintrin.h>
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_filter.h"
-
-static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
-  __m128i s[8];
-  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
-  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
-  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
-  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
-  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
-  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
-  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
-  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
-  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
-  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
-  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
-  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
-  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
-  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
-  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
-  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
-}
-
-static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
-  __m128i s[16];
-  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
-  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
-  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
-  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
-  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
-  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
-  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
-  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
-  s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
-  s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
-  s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
-  s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
-  s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
-  s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
-  s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
-  s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
-  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
-  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
-  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
-  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
-  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
-  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
-  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
-  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
-  _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
-  _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
-  _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
-  _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
-  _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
-  _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
-  _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
-  _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
-}
-
-void av1_highbd_convolve_2d_copy_sr_sse2(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-  (void)bd;
-  if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
-    assert(!(dst_stride % 16));
-  }
-
-  if (w == 2) {
-    do {
-      __m128i s = _mm_loadl_epi64((__m128i *)src);
-      *(uint32_t *)dst = _mm_cvtsi128_si32(s);
-      src += src_stride;
-      dst += dst_stride;
-      s = _mm_loadl_epi64((__m128i *)src);
-      *(uint32_t *)dst = _mm_cvtsi128_si32(s);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 4) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      _mm_storel_epi64((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 8) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 16) {
-    do {
-      __m128i s[4];
-      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
-      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
-      src += src_stride;
-      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
-      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
-      src += src_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
-      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
-      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 32) {
-    do {
-      __m128i s[8];
-      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
-      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
-      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
-      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
-      src += src_stride;
-      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
-      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
-      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
-      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
-      src += src_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
-      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
-      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
-      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
-      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
-      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
-      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 64) {
-    do {
-      copy_64(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_64(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else {
-    do {
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  }
-}
diff --git a/av1/common/x86/highbd_convolve_2d_sse4.c b/av1/common/x86/highbd_convolve_2d_sse4.c
index d2ff47c..b2c39cd 100644
--- a/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -21,17 +21,13 @@
 #include "aom_dsp/x86/convolve_sse4_1.h"
 #include "av1/common/convolve.h"
 
-void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(
-    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(const uint16_t *src,
+                                                 int src_stride, uint16_t *dst0,
+                                                 int dst_stride0, int w, int h,
+                                                 ConvolveParams *conv_params,
+                                                 int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c
index 70f1ec7..9cedd44 100644
--- a/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -22,17 +22,13 @@
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
-void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
-    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(const uint16_t *src,
+                                               int src_stride, uint16_t *dst0,
+                                               int dst_stride0, int w, int h,
+                                               ConvolveParams *conv_params,
+                                               int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
@@ -466,16 +462,13 @@
 
 void av1_highbd_dist_wtd_convolve_x_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_1;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
 
   int i, j;
   __m256i s[4], coeffs_x[4];
@@ -635,16 +628,13 @@
 
 void av1_highbd_dist_wtd_convolve_y_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
   const int bits = FILTER_BITS - conv_params->round_0;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
 
   assert(bits >= 0);
   int i, j;
diff --git a/av1/common/x86/highbd_jnt_convolve_sse4.c b/av1/common/x86/highbd_jnt_convolve_sse4.c
index f033a6f..af45764 100644
--- a/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -19,16 +19,13 @@
 
 void av1_highbd_dist_wtd_convolve_y_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
   const int bits = FILTER_BITS - conv_params->round_0;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
 
   assert(bits >= 0);
   int i, j;
@@ -261,16 +258,13 @@
 
 void av1_highbd_dist_wtd_convolve_x_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_1;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
 
   int i, j;
   __m128i s[4], coeffs_x[4];
diff --git a/av1/common/x86/highbd_warp_affine_avx2.c b/av1/common/x86/highbd_warp_affine_avx2.c
new file mode 100644
index 0000000..9cb0bba
--- /dev/null
+++ b/av1/common/x86/highbd_warp_affine_avx2.c
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
+                                 int width, int height, int stride,
+                                 uint16_t *pred, int p_col, int p_row,
+                                 int p_width, int p_height, int p_stride,
+                                 int subsampling_x, int subsampling_y, int bd,
+                                 ConvolveParams *conv_params, int16_t alpha,
+                                 int16_t beta, int16_t gamma, int16_t delta) {
+  __m256i tmp[15];
+  const int reduce_bits_horiz =
+      conv_params->round_0 +
+      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  (void)max_bits_horiz;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+  const __m256i reduce_bits_vert_const =
+      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
+  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
+  const __m256i res_sub_const =
+      _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+                        (1 << (offset_bits - conv_params->round_1 - 1)));
+  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+  __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+
+  __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
+  __m256i v_zeros = _mm256_setzero_si256();
+  int ohoriz = 1 << offset_bits_horiz;
+  int mhoriz = 1 << max_bits_horiz;
+  (void)mhoriz;
+  int sx;
+
+  for (int i = 0; i < p_height; i += 8) {
+    for (int j = 0; j < p_width; j += 8) {
+      // Calculate the center of this 8x8 block,
+      // project to luma coordinates (if in a subsampled chroma plane),
+      // apply the affine transformation,
+      // then convert back to the original coordinates (if necessary)
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+      const int32_t x4 = dst_x >> subsampling_x;
+      const int32_t y4 = dst_y >> subsampling_y;
+
+      const int16_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      const int16_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Horizontal filter
+      if (ix4 <= -7) {
+        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
+              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
+        }
+      } else if (ix4 >= width + 6) {
+        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm256_cvtepi16_epi32(
+              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+                             ref[iy * stride + (width - 1)] *
+                                 (1 << (FILTER_BITS - reduce_bits_horiz))));
+        }
+      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+        int32_t tmp1[8];
+        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          const int iy = clamp(iy4 + k, 0, height - 1);
+
+          sx = sx4 + beta * (k + 4);
+          for (int l = -4; l < 4; ++l) {
+            int ix = ix4 + l - 3;
+            const int offs = sx >> WARPEDDIFF_PREC_BITS;
+            const int16_t *coeffs = av1_warped_filter[offs];
+
+            int32_t sum = 1 << offset_bits_horiz;
+            for (int m = 0; m < 8; ++m) {
+              const int sample_x = clamp(ix + m, 0, width - 1);
+              sum += ref[iy * stride + sample_x] * coeffs[m];
+            }
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+            tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
+            sx += alpha;
+          }
+          tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
+        }
+      } else {
+        if (beta == 0 && alpha == 0) {
+          sx = sx4;
+          __m128i v_01 = _mm_loadu_si128(
+              (__m128i *)
+                  av1_warped_filter[sx >>
+                                    WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
+          __m256i v_c01 = _mm256_broadcastd_epi32(v_01);     // A1A0A1A0A1A0A1A0
+          __m256i v_c23 = _mm256_broadcastd_epi32(
+              _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
+          __m256i v_c45 = _mm256_broadcastd_epi32(
+              _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
+          __m256i v_c67 = _mm256_broadcastd_epi32(
+              _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            if (iy < 0)
+              iy = 0;
+            else if (iy > height - 1)
+              iy = height - 1;
+            iy = iy * stride;
+
+            __m256i v_refl = _mm256_inserti128_si256(
+                _mm256_set1_epi16(0),
+                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+                1);  // R15 .. R0
+
+            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+            __m256i v_refu =
+                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+            v_refu = _mm256_inserti128_si256(
+                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+            __m256i v_sum = _mm256_set1_epi32(ohoriz);
+            __m256i parsum = _mm256_madd_epi16(
+                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
+                                          0));  // R8R7R6..R1R7R6R5..R1R0
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+            parsum = _mm256_madd_epi16(
+                v_c23,
+                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
+            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+            parsum = _mm256_madd_epi16(
+                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
+                                          8));  // R12R11..R5R11R10..R5R4
+            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+            parsum = _mm256_madd_epi16(
+                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
+                                          12));  // R14R13..R7R13R12..R7R6
+            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+                                           reduce_bits_horiz);
+          }
+        } else if (alpha == 0) {
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            if (iy < 0)
+              iy = 0;
+            else if (iy > height - 1)
+              iy = height - 1;
+            iy = iy * stride;
+
+            sx = sx4 + beta * (k + 4);
+
+            __m128i v_01 = _mm_loadu_si128(
+                (__m128i *)av1_warped_filter
+                    [sx >> WARPEDDIFF_PREC_BITS]);          // A7A6A5A4A3A2A1A0
+            __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
+            __m256i v_c23 = _mm256_broadcastd_epi32(
+                _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
+            __m256i v_c45 = _mm256_broadcastd_epi32(
+                _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
+            __m256i v_c67 = _mm256_broadcastd_epi32(
+                _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
+
+            __m256i v_refl = _mm256_inserti128_si256(
+                _mm256_set1_epi16(0),
+                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+                1);  // R15 .. R0
+
+            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+            __m256i v_refu =
+                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+            v_refu = _mm256_inserti128_si256(
+                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+            __m256i v_sum = _mm256_set1_epi32(ohoriz);
+            __m256i parsum =
+                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+            parsum =
+                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
+            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+            parsum =
+                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
+            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+            parsum = _mm256_madd_epi16(v_c67,
+                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
+            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+                                           reduce_bits_horiz);
+          }
+        } else if (beta == 0) {
+          sx = sx4;
+          __m256i v_coeff01 = _mm256_inserti128_si256(
+              v_zeros,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
+              0);
+          v_coeff01 = _mm256_inserti128_si256(
+              v_coeff01,
+              _mm_loadu_si128(
+                  (__m128i *)
+                      av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
+              1);  // B7B6..B1B0A7A6..A1A0
+          __m256i v_coeff23 = _mm256_inserti128_si256(
+              v_zeros,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              0);
+          v_coeff23 = _mm256_inserti128_si256(
+              v_coeff23,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              1);  // D7D6..D1D0C7C6..C1C0
+          __m256i v_coeff45 = _mm256_inserti128_si256(
+              v_zeros,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              0);
+          v_coeff45 = _mm256_inserti128_si256(
+              v_coeff45,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              1);  // F7F6..F1F0E7E6..E1E0
+          __m256i v_coeff67 = _mm256_inserti128_si256(
+              v_zeros,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              0);
+          v_coeff67 = _mm256_inserti128_si256(
+              v_coeff67,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              1);  // H7H6..H1H0G7G6..G1G0
+
+          __m256i v_c0123 = _mm256_unpacklo_epi32(
+              v_coeff01,
+              v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+          __m256i v_c0123u = _mm256_unpackhi_epi32(
+              v_coeff01,
+              v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+          __m256i v_c4567 = _mm256_unpacklo_epi32(
+              v_coeff45,
+              v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+          __m256i v_c4567u = _mm256_unpackhi_epi32(
+              v_coeff45,
+              v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+          __m256i v_c01 = _mm256_unpacklo_epi64(
+              v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+          __m256i v_c23 =
+              _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
+          __m256i v_c45 =
+              _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
+          __m256i v_c67 =
+              _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
+
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            if (iy < 0)
+              iy = 0;
+            else if (iy > height - 1)
+              iy = height - 1;
+            iy = iy * stride;
+
+            __m256i v_refl = _mm256_inserti128_si256(
+                _mm256_set1_epi16(0),
+                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+                1);  // R15 .. R0
+
+            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+            __m256i v_refu =
+                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+            v_refu = _mm256_inserti128_si256(
+                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+            __m256i v_sum = _mm256_set1_epi32(ohoriz);
+            __m256i parsum = _mm256_madd_epi16(
+                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
+                                          0));  // R8R7R6..R1R7R6R5..R1R0
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+            parsum = _mm256_madd_epi16(
+                v_c23,
+                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
+            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+            parsum = _mm256_madd_epi16(
+                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
+                                          8));  // R12R11..R5R11R10..R5R4
+            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+            parsum = _mm256_madd_epi16(
+                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
+                                          12));  // R14R13..R7R13R12..R7R6
+            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+                                           reduce_bits_horiz);
+          }
+
+        } else {
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            if (iy < 0)
+              iy = 0;
+            else if (iy > height - 1)
+              iy = height - 1;
+            iy = iy * stride;
+
+            sx = sx4 + beta * (k + 4);
+
+            __m256i v_coeff01 = _mm256_inserti128_si256(
+                v_zeros,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
+                0);
+            v_coeff01 = _mm256_inserti128_si256(
+                v_coeff01,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                1);  // B7B6..B1B0A7A6..A1A0
+            __m256i v_coeff23 = _mm256_inserti128_si256(
+                v_zeros,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                0);
+            v_coeff23 = _mm256_inserti128_si256(
+                v_coeff23,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                1);  // D7D6..D1D0C7C6..C1C0
+            __m256i v_coeff45 = _mm256_inserti128_si256(
+                v_zeros,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                0);
+            v_coeff45 = _mm256_inserti128_si256(
+                v_coeff45,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                1);  // F7F6..F1F0E7E6..E1E0
+            __m256i v_coeff67 = _mm256_inserti128_si256(
+                v_zeros,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                0);
+            v_coeff67 = _mm256_inserti128_si256(
+                v_coeff67,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                1);  // H7H6..H1H0G7G6..G1G0
+
+            __m256i v_c0123 = _mm256_unpacklo_epi32(
+                v_coeff01,
+                v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+            __m256i v_c0123u = _mm256_unpackhi_epi32(
+                v_coeff01,
+                v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+            __m256i v_c4567 = _mm256_unpacklo_epi32(
+                v_coeff45,
+                v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+            __m256i v_c4567u = _mm256_unpackhi_epi32(
+                v_coeff45,
+                v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+            __m256i v_c01 = _mm256_unpacklo_epi64(
+                v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+            __m256i v_c23 =
+                _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
+            __m256i v_c45 =
+                _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
+            __m256i v_c67 =
+                _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
+
+            __m256i v_refl = _mm256_inserti128_si256(
+                _mm256_set1_epi16(0),
+                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+                1);  // R15 .. R0
+
+            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+            __m256i v_refu =
+                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+            v_refu = _mm256_inserti128_si256(
+                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+            __m256i v_sum = _mm256_set1_epi32(ohoriz);
+            __m256i parsum =
+                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+            parsum =
+                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
+            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+            parsum =
+                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
+            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+            parsum = _mm256_madd_epi16(v_c67,
+                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
+            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+                                           reduce_bits_horiz);
+          }
+        }
+      }
+
+      // Vertical filter
+      for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+        const __m256i *src = tmp + (k + 4);
+
+        __m256i v_coeff01 = _mm256_inserti128_si256(
+            v_zeros,
+            _mm_loadu_si128(
+                (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
+            0);
+        v_coeff01 = _mm256_inserti128_si256(
+            v_coeff01,
+            _mm_loadu_si128(
+                (__m128i *)
+                    av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
+            1);
+        __m256i v_coeff23 = _mm256_inserti128_si256(
+            v_zeros,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            0);
+        v_coeff23 = _mm256_inserti128_si256(
+            v_coeff23,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            1);
+        __m256i v_coeff45 = _mm256_inserti128_si256(
+            v_zeros,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            0);
+        v_coeff45 = _mm256_inserti128_si256(
+            v_coeff45,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            1);
+        __m256i v_coeff67 = _mm256_inserti128_si256(
+            v_zeros,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            0);
+        v_coeff67 = _mm256_inserti128_si256(
+            v_coeff67,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            1);
+
+        __m256i v_c0123 = _mm256_unpacklo_epi32(
+            v_coeff01,
+            v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+        __m256i v_c0123u = _mm256_unpackhi_epi32(
+            v_coeff01,
+            v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+        __m256i v_c4567 = _mm256_unpacklo_epi32(
+            v_coeff45,
+            v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+        __m256i v_c4567u = _mm256_unpackhi_epi32(
+            v_coeff45,
+            v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+        __m256i v_c01 = _mm256_unpacklo_epi64(
+            v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+        __m256i v_c23 =
+            _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
+        __m256i v_c45 =
+            _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
+        __m256i v_c67 =
+            _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
+
+        __m256i v_src01l =
+            _mm256_unpacklo_epi32(src[0], src[1]);  // T13T03T11T01T12T02T10T00
+        __m256i v_src01u =
+            _mm256_unpackhi_epi32(src[0], src[1]);  // T17T07T15T05T16T06T14T04
+        __m256i v_sum =
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
+                              v_c01);  // S7S5S3S1S6S4S2S0
+
+        __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
+        __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
+        v_sum = _mm256_add_epi32(
+            v_sum,
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));
+
+        __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
+        __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
+        v_sum = _mm256_add_epi32(
+            v_sum,
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));
+
+        __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
+        __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
+        v_sum = _mm256_add_epi32(
+            v_sum,
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));
+
+        // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0
+
+        __m256i v_suml =
+            _mm256_permute4x64_epi64(v_sum, 0xD8);  // S7S5S6S4S3S1S2S0
+        __m256i v_sumh =
+            _mm256_permute4x64_epi64(v_sum, 0x32);      // S2S0S7S5S2S0S3S1
+        v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh);  // S7S6S5S4S3S2S1S0
+
+        if (conv_params->is_compound) {
+          __m128i *const p =
+              (__m128i *)&conv_params
+                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+          v_sum = _mm256_add_epi32(v_sum, res_add_const);
+          v_sum =
+              _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
+                               reduce_bits_vert_shift);
+          if (conv_params->do_average) {
+            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+            __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));
+
+            if (conv_params->use_dist_wtd_comp_avg) {
+              v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
+                                       _mm256_mullo_epi32(v_sum, wt1));
+              v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
+            } else {
+              v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
+            }
+
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
+            v_sum1 = _mm256_sra_epi32(
+                _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);
+
+            __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
+            v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
+            v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
+            _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
+          } else {
+            v_sum = _mm256_packus_epi32(v_sum, v_sum);
+            __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
+            _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
+          }
+        } else {
+          // Round and pack into 8 bits
+          const __m256i round_const =
+              _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                                ((1 << reduce_bits_vert) >> 1));
+
+          __m256i v_sum1 = _mm256_srai_epi32(
+              _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);
+
+          v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
+          __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
+          // Clamp res_16bit to the range [0, 2^bd - 1]
+          const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
+          const __m256i zero = _mm256_setzero_si256();
+          v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);
+
+          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+          _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
+        }
+      }
+    }
+  }
+}
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
index 6de61573..7a13d4a 100644
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -38,8 +38,7 @@
 void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_x_qn,
                                   ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -64,9 +63,6 @@
       _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
 
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-
   __m256i filt[4], coeffs[4];
 
   filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
@@ -189,9 +185,8 @@
 
 void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -222,10 +217,6 @@
 
   prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
 
-  (void)conv_params;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-
   // Condition for checking valid vert_filt taps
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
     is_vert_4tap = 1;
@@ -802,18 +793,12 @@
   }
 }
 
-void av1_dist_wtd_convolve_2d_copy_avx2(
-    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+                                        uint8_t *dst0, int dst_stride0, int w,
+                                        int h, ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index f8f640a..b8400c0 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -19,8 +19,7 @@
 void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_x_qn,
                                   ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -47,9 +46,6 @@
   const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
   __m128i coeffs[4];
 
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-
   prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
 
   if (w == 4) {
@@ -152,9 +148,8 @@
 
 void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -179,9 +174,6 @@
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
   __m128i coeffs[4];
 
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-
   prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
 
   if (w == 4) {
diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c
new file mode 100644
index 0000000..0578668
--- /dev/null
+++ b/av1/common/x86/resize_ssse3.c
@@ -0,0 +1,946 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>  // SSSE3
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "av1/common/resize.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+    const uint8_t *const src, const __m128i *const mask) {
+  const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
+  const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
+  const __m128i a_and = _mm_and_si128(a, *mask);
+  const __m128i b_and = _mm_and_si128(b, *mask);
+  return _mm_packus_epi16(a_and, b_and);
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+                                            __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+  // half of f[0] and f[4].
+  assert(filter[3] >= 0 && filter[3] < 256);
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+                                                    const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // compensate the subtracted 64 in f[1]. x4 is always non negative.
+  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+  // add and saturate the results together
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x4);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+                                                   const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  // compensate the subtracted 64 in f[2]. x5 is always non negative.
+  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+  __m128i temp;
+
+  // add and saturate the results together
+  temp = _mm_adds_epi16(x0, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x3);
+  temp = _mm_adds_epi16(temp, x4);
+  temp = _mm_adds_epi16(temp, x5);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int max_width = (dst_w + 15) & ~15;
+  const __m128i mask = _mm_set1_epi16(0x00FF);
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+      _mm_storeu_si128((__m128i *)dst, d);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int max_width = (dst_w + 15) & ~15;
+  const __m128i mask = _mm_set1_epi32(0x000000FF);
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
+      const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
+      const __m128i d2 = _mm_packus_epi16(d0, d1);
+      _mm_storeu_si128((__m128i *)dst, d2);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+                                                  const __m128i c0c1) {
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
+  const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
+  // round and shift by 7 bit each 16 bit
+  const __m128i t2 = _mm_adds_epi16(t0, k_64);
+  const __m128i t3 = _mm_adds_epi16(t1, k_64);
+  const __m128i t4 = _mm_srai_epi16(t2, 7);
+  const __m128i t5 = _mm_srai_epi16(t3, 7);
+  return _mm_packus_epi16(t4, t5);
+}
+
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      __m128i s[2], d[2];
+
+      // Horizontal
+      // Even rows
+      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // odd rows
+      s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // Vertical
+      s[0] = _mm_unpacklo_epi8(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      __m128i s[8], d[8];
+
+      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
+      //       Here we tried to not use shuffle instructions which would be slow
+      //       on some x86 CPUs.
+
+      // Horizontal
+      // 000 001 xx xx 004 005 xx xx  008 009 xx xx 00C 00D xx xx
+      // 010 011 xx xx 014 015 xx xx  018 019 xx xx 01C 01D xx xx
+      // 020 021 xx xx 024 025 xx xx  028 029 xx xx 02C 02D xx xx
+      // 030 031 xx xx 034 035 xx xx  038 039 xx xx 03C 03D xx xx
+      // 100 101 xx xx 104 105 xx xx  108 109 xx xx 10C 10D xx xx
+      // 110 111 xx xx 114 115 xx xx  118 119 xx xx 11C 11D xx xx
+      // 120 121 xx xx 124 125 xx xx  128 129 xx xx 12C 12D xx xx
+      // 130 131 xx xx 134 135 xx xx  138 139 xx xx 13C 13D xx xx
+      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
+      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+      // 000 001 100 101 xx xx xx xx  004 005 104 105 xx xx xx xx
+      // 008 009 108 109 xx xx xx xx  00C 00D 10C 10D xx xx xx xx
+      // 010 011 110 111 xx xx xx xx  014 015 114 115 xx xx xx xx
+      // 018 019 118 119 xx xx xx xx  01C 01D 11C 11D xx xx xx xx
+      // 020 021 120 121 xx xx xx xx  024 025 124 125 xx xx xx xx
+      // 028 029 128 129 xx xx xx xx  02C 02D 12C 12D xx xx xx xx
+      // 030 031 130 131 xx xx xx xx  034 035 134 135 xx xx xx xx
+      // 038 039 138 139 xx xx xx xx  03C 03D 13C 13D xx xx xx xx
+      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+      d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+      // 000 001 100 101 008 009 108 109  xx xx xx xx xx xx xx xx
+      // 004 005 104 105 00C 00D 10C 10D  xx xx xx xx xx xx xx xx
+      // 010 011 110 111 018 019 118 119  xx xx xx xx xx xx xx xx
+      // 014 015 114 115 01C 01D 11C 11D  xx xx xx xx xx xx xx xx
+      // 020 021 120 121 028 029 128 129  xx xx xx xx xx xx xx xx
+      // 024 025 124 125 02C 02D 12C 12D  xx xx xx xx xx xx xx xx
+      // 030 031 130 131 038 039 138 139  xx xx xx xx xx xx xx xx
+      // 034 035 134 135 03C 03D 13C 13D  xx xx xx xx xx xx xx xx
+      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+      s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+      // 000 001 100 101 004 005 104 105  008 009 108 109 00C 00D 10C 10D
+      // 010 011 110 111 014 015 114 115  018 019 118 119 01C 01D 11C 11D
+      // 020 021 120 121 024 025 124 125  028 029 128 129 02C 02D 12C 12D
+      // 030 031 130 131 034 035 134 135  038 039 138 139 03C 03D 13C 13D
+      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+      d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
+      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);
+
+      // Vertical
+      d[0] = scale_plane_bilinear_kernel(d, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 1) & ~1;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 1) & ~1;
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+  // horizontal 2x8
+  do {
+    load_8bit_8x8(src + 4, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75 (overlapped)
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[2]);
+      // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      transpose_16bit_4x8(&s[2], &s[2]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71
+
+      // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
+      d[0] = _mm_packus_epi16(d[0], d[0]);
+      d[1] = _mm_packus_epi16(d[1], d[1]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+
+      t += 4;
+      x -= 2;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor;
+    t += 6 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x2
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    t += 4 * width_hor;
+    y = height_ver;
+
+    do {
+      // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 77
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 77
+      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+
+      dst += 2 * dst_stride;
+      y -= 2;
+    } while (y);
+    t -= width_hor * (4 * height_ver + 4);
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 3) & ~3;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 3) & ~3;
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+  // horizontal 4x8
+  do {
+    load_8bit_8x8(src + 2, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[3]);
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      transpose_16bit_4x8(&s[3], &s[3]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      d[0] = _mm_packus_epi16(d[0], d[2]);
+      d[1] = _mm_packus_epi16(d[1], d[3]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+
+      t += 8;
+      x -= 4;
+    } while (x);
+    src += 8 * src_stride - 2 * width_hor;
+    t += 6 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x4
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+    t += 6 * width_hor;
+    y = height_ver;
+
+    do {
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 77
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 77
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 77
+      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[1] = _mm_packus_epi16(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+
+      dst += 4 * dst_stride;
+      y -= 4;
+    } while (y);
+    t -= width_hor * (2 * height_ver + 6);
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+                                     __m128i *const f);
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+                                   const __m128i *const f);
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase,
+                                       uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = 2 * width_hor + 4;  // store 4 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+  // above and (SUBPEL_TAPS / 2) extra rows below.
+  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[12], d[6], dd[4];
+  __m128i f0[4], f1[5], f2[5];
+  // The offset of the first row is always less than 1 pixel.
+  const int offset1_q4 = phase + 1 * step_q4;
+  const int offset2_q4 = phase + 2 * step_q4;
+  // offset_idxx indicates the pixel offset is even (0) or odd (1).
+  // It's used to choose the src offset and filter coefficient offset.
+  const int offset_idx1 = (offset1_q4 >> 4) & 1;
+  const int offset_idx2 = (offset2_q4 >> 4) & 1;
+  static const shuffle_filter_funcs shuffle_filter_func_list[2] = {
+    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+  };
+  static const convolve8_funcs convolve8_func_list[2] = {
+    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+  };
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0);
+  shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+  shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+  // Sub 64 to avoid overflow.
+  // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+  // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+  // When filter phase idx is 1, the two biggest coefficients are shuffled
+  // together, and the sum of them are always no less than 128. Sub 64 here.
+  // After the subtraction, when the sum of all positive coefficients are no
+  // larger than 128, and the sum of all negative coefficients are no
+  // less than -128, there will be no overflow in the convolve8 functions.
+  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+  // horizontal 6x8
+  do {
+    load_8bit_8x8(src, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[4]);
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // OC 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      // 0E 0F 1E 1F 2E 2F 3E 3F  4E 4F 5E 5F 6E 6F 7E 7F
+      transpose_16bit_4x8(&s[4], &s[4]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
+      // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
+      dd[0] = _mm_packus_epi16(d[0], d[2]);
+      dd[1] = _mm_packus_epi16(d[1], d[3]);
+      dd[2] = _mm_packus_epi16(d[4], d[4]);
+      dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      // 04 14 05 15 24 34 25 35  44 54 45 55 64 74 65 75
+      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      // 04 14 05 15 xx xx xx xx  24 34 25 35 xx xx xx xx
+      // 44 54 45 55 xx xx xx xx  64 74 65 75 xx xx xx xx
+      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+      // 00 10 01 11 02 12 03 13  04 14 05 15 xx xx xx xx
+      // 20 30 21 31 22 32 23 33  24 34 25 35 xx xx xx xx
+      // 40 50 41 51 42 52 43 53  44 54 45 55 xx xx xx xx
+      // 60 70 61 71 62 72 63 73  64 74 65 75 xx xx xx xx
+      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+      // store 4 extra pixels
+      storeu_8bit_16x4(d, t, stride_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      t += 12;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3;
+    t += 3 * stride_hor + 4;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+    loadu_8bit_16x4(t, stride_hor, s);
+    y = height_ver;
+
+    do {
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
+      // E0 F0 E1 F1 E2 F2 E3 F3  E4 F4 E5 F5 E6 F6 E7 F7
+      t += 4 * stride_hor;
+      loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[2] = _mm_packus_epi16(d[2], d[3]);
+      d[4] = _mm_packus_epi16(d[4], d[5]);
+
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * 2 * height_ver / 3;
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
+                                                  const __m128i *const f) {
+  __m128i ss[4], temp;
+
+  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+  temp = convolve8_8_ssse3(ss, f);
+  return _mm_packus_epi16(temp, temp);
+}
+
+// Only calculate odd columns since even columns are just src pixels' copies.
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
+                                     const int w, const __m128i *const f) {
+  int x = w;
+
+  do {
+    __m128i s[8], temp;
+    s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
+    s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
+    s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
+    s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
+    s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
+    s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
+    s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
+    s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
+    temp = scale_1_to_2_phase_0_kernel(s, f);
+    _mm_storel_epi64((__m128i *)dst, temp);
+    src += 8;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_1_to_2_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int src_w, const int src_h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  int max_width;
+  int y;
+  uint8_t *tmp[9];
+  __m128i f[4];
+
+  max_width = (src_w + 7) & ~7;
+  tmp[0] = temp_buffer + 0 * max_width;
+  tmp[1] = temp_buffer + 1 * max_width;
+  tmp[2] = temp_buffer + 2 * max_width;
+  tmp[3] = temp_buffer + 3 * max_width;
+  tmp[4] = temp_buffer + 4 * max_width;
+  tmp[5] = temp_buffer + 5 * max_width;
+  tmp[6] = temp_buffer + 6 * max_width;
+  tmp[7] = temp_buffer + 7 * max_width;
+
+  shuffle_filter_ssse3(coef, f);
+
+  scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
+  scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
+  scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
+  scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
+  scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
+  scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
+  scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);
+
+  y = src_h;
+  do {
+    int x;
+    scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
+    for (x = 0; x < max_width; x += 8) {
+      __m128i s[8], C, D, CD;
+
+      // Even rows
+      const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
+      const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+      const __m128i ab = _mm_unpacklo_epi8(a, b);
+      _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);
+
+      // Odd rows
+      // Even columns
+      load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
+      C = scale_1_to_2_phase_0_kernel(s, f);
+
+      // Odd columns
+      s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
+      s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
+      s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
+      s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+      s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
+      s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
+      s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
+      s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
+      D = scale_1_to_2_phase_0_kernel(s, f);
+
+      CD = _mm_unpacklo_epi8(C, D);
+      _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
+    }
+
+    src += src_stride;
+    dst += 2 * dst_stride;
+    tmp[8] = tmp[0];
+    tmp[0] = tmp[1];
+    tmp[1] = tmp[2];
+    tmp[2] = tmp[3];
+    tmp[3] = tmp[4];
+    tmp[4] = tmp[5];
+    tmp[5] = tmp[6];
+    tmp[6] = tmp[7];
+    tmp[7] = tmp[8];
+  } while (--y);
+}
+
+void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+                                       YV12_BUFFER_CONFIG *dst,
+                                       const InterpFilter filter,
+                                       const int phase, const int num_planes) {
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  int scaled = 0;
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;
+    const int src_w = src->crop_widths[is_uv];
+    const int src_h = src->crop_heights[is_uv];
+    const int src_y_w = (src->crop_widths[0] + 1) & ~1;
+    const int dst_w = dst->crop_widths[is_uv];
+    const int dst_h = dst->crop_heights[is_uv];
+    const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
+    const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
+
+    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+      // 2 to 1
+      scaled = 1;
+      if (phase == 0) {
+        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 3) & ~3;
+        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc(buffer_stride * buffer_height);
+        if (temp_buffer) {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel[phase],
+                                     temp_buffer);
+          free(temp_buffer);
+        } else {
+          scaled = 0;
+        }
+      }
+    } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+      // 4 to 1
+      scaled = 1;
+      if (phase == 0) {
+        scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+        scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 1) & ~1;
+        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        // When dst_w is 1 or 2, we need extra padding to avoid heap read
+        // overflow
+        const int extra_padding = 16;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+        if (temp_buffer) {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel[phase],
+                                     temp_buffer);
+          free(temp_buffer);
+        } else {
+          scaled = 0;
+        }
+      }
+    } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+      // 4 to 3
+      const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+      const int buffer_stride_ver = (dst_y_w + 7) & ~7;
+      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+      // When the vertical filter reads more pixels than the horizontal filter
+      // generated in each row, we need extra padding to avoid heap read
+      // overflow. For example, the horizontal filter generates 18 pixels but
+      // the vertical filter reads 24 pixels in a row. The difference is
+      // multiplied by 2 since two rows are interlaced together in the
+      // optimization.
+      const int extra_padding =
+          (buffer_stride_ver > buffer_stride_hor)
+              ? 2 * (buffer_stride_ver - buffer_stride_hor)
+              : 0;
+      const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+      uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+      if (temp_buffer) {
+        scaled = 1;
+        const InterpKernel *interp_kernel =
+            (const InterpKernel *)av1_interp_filter_params_list[filter]
+                .filter_ptr;
+        scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h, interp_kernel, phase, temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
+      // 1 to 2
+      uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7));
+      if (temp_buffer) {
+        scaled = 1;
+        const InterpKernel *interp_kernel =
+            (const InterpKernel *)av1_interp_filter_params_list[filter]
+                .filter_ptr;
+        scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], src_w,
+                                   src_h, interp_kernel[8], temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
+  }
+  if (!scaled) {
+    av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+  } else {
+    aom_extend_frame_borders(dst, num_planes);
+  }
+}
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 7abfac4..f85375b 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -117,9 +117,11 @@
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void loop_restoration_read_sb_coeffs(
     const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane,
     int runit_idx);
+#endif
 
 static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
   return len != 0 && len <= (size_t)(end - start);
@@ -140,31 +142,30 @@
   }
 }
 
-static AOM_INLINE void inverse_transform_block(MACROBLOCKD *xd, int plane,
-                                               const TX_TYPE tx_type,
+static AOM_INLINE void inverse_transform_block(DecoderCodingBlock *dcb,
+                                               int plane, const TX_TYPE tx_type,
                                                const TX_SIZE tx_size,
                                                uint8_t *dst, int stride,
                                                int reduced_tx_set) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = pd->dqcoeff_block + xd->cb_offset[plane];
-  eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+  tran_low_t *const dqcoeff = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
+  eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
   uint16_t scan_line = eob_data->max_scan_line;
   uint16_t eob = eob_data->eob;
-  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride,
-                              eob, reduced_tx_set);
+  av1_inverse_transform_block(&dcb->xd, dqcoeff, plane, tx_type, tx_size, dst,
+                              stride, eob, reduced_tx_set);
   memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
 }
 
 static AOM_INLINE void read_coeffs_tx_intra_block(
-    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int row, const int col, const TX_SIZE tx_size) {
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  if (!mbmi->skip) {
+  MB_MODE_INFO *mbmi = dcb->xd.mi[0];
+  if (!mbmi->skip_txfm) {
 #if TXCOEFF_TIMER
     struct aom_usec_timer timer;
     aom_usec_timer_start(&timer);
 #endif
-    av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size);
+    av1_read_coeffs_txb_facade(cm, dcb, r, plane, row, col, tx_size);
 #if TXCOEFF_TIMER
     aom_usec_timer_mark(&timer);
     const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
@@ -175,12 +176,12 @@
 }
 
 static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm,
-                                         MACROBLOCKD *const xd,
+                                         DecoderCodingBlock *dcb,
                                          aom_reader *const r, const int plane,
                                          const int row, const int col,
                                          const TX_SIZE tx_size) {
   (void)cm;
-  (void)xd;
+  (void)dcb;
   (void)r;
   (void)plane;
   (void)row;
@@ -189,10 +190,10 @@
 }
 
 static AOM_INLINE void predict_inter_block_void(AV1_COMMON *const cm,
-                                                MACROBLOCKD *const xd,
+                                                DecoderCodingBlock *dcb,
                                                 BLOCK_SIZE bsize) {
   (void)cm;
-  (void)xd;
+  (void)dcb;
   (void)bsize;
 }
 
@@ -203,37 +204,39 @@
 }
 
 static AOM_INLINE void predict_and_reconstruct_intra_block(
-    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int row, const int col, const TX_SIZE tx_size) {
   (void)r;
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   PLANE_TYPE plane_type = get_plane_type(plane);
 
   av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
 
-  if (!mbmi->skip) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+  if (!mbmi->skip_txfm) {
+    eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
     if (eob_data->eob) {
       const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
       // tx_type was read out in av1_read_coeffs_txb.
       const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
                                               reduced_tx_set_used);
+      struct macroblockd_plane *const pd = &xd->plane[plane];
       uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
-      inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+      inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
                               reduced_tx_set_used);
     }
   }
   if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
-    cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type);
+    cfl_store_tx(xd, row, col, tx_size, mbmi->bsize);
   }
 }
 
 static AOM_INLINE void inverse_transform_inter_block(
-    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int blk_row, const int blk_col,
     const TX_SIZE tx_size) {
   (void)r;
+  MACROBLOCKD *const xd = &dcb->xd;
   PLANE_TYPE plane_type = get_plane_type(plane);
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
@@ -243,7 +246,7 @@
 
   uint8_t *dst =
       &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
-  inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+  inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
                           reduced_tx_set_used);
 #if CONFIG_MISMATCH_DEBUG
   int pixel_c, pixel_r;
@@ -260,21 +263,22 @@
 #endif
 }
 
-static AOM_INLINE void set_cb_buffer_offsets(MACROBLOCKD *const xd,
+static AOM_INLINE void set_cb_buffer_offsets(DecoderCodingBlock *dcb,
                                              TX_SIZE tx_size, int plane) {
-  xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
-  xd->txb_offset[plane] =
-      xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  dcb->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
+  dcb->txb_offset[plane] =
+      dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
 }
 
 static AOM_INLINE void decode_reconstruct_tx(
     AV1_COMMON *cm, ThreadData *const td, aom_reader *r,
     MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row,
     int blk_col, int block, TX_SIZE tx_size, int *eob_total) {
-  MACROBLOCKD *const xd = &td->xd;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
@@ -285,14 +289,14 @@
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   if (tx_size == plane_tx_size || plane) {
-    td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+    td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
                                          tx_size);
 
-    td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+    td->inverse_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
                                      tx_size);
-    eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+    eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
     *eob_total += eob_data->eob;
-    set_cb_buffer_offsets(xd, tx_size, plane);
+    set_cb_buffer_offsets(dcb, tx_size, plane);
   } else {
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
     assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
@@ -326,7 +330,7 @@
   const TileInfo *const tile = &xd->tile;
 
   set_mi_offsets(mi_params, xd, mi_row, mi_col);
-  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->bsize = bsize;
 #if CONFIG_RD_DEBUG
   xd->mi[0]->mi_row = mi_row;
   xd->mi[0]->mi_col = mi_col;
@@ -353,7 +357,7 @@
 }
 
 static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi,
-                                         MACROBLOCKD *const xd, int mi_row,
+                                         DecoderCodingBlock *dcb, int mi_row,
                                          int mi_col, aom_reader *r,
                                          PARTITION_TYPE partition,
                                          BLOCK_SIZE bsize) {
@@ -363,13 +367,14 @@
   const int bh = mi_size_high[bsize];
   const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col);
   const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row);
+  MACROBLOCKD *const xd = &dcb->xd;
 
 #if CONFIG_ACCOUNTING
   aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
 #endif
   set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
   xd->mi[0]->partition = partition;
-  av1_read_mode_info(pbi, xd, r, x_mis, y_mis);
+  av1_read_mode_info(pbi, dcb, r, x_mis, y_mis);
   if (bsize >= BLOCK_8X8 &&
       (seq_params->subsampling_x || seq_params->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -629,8 +634,8 @@
 
 static void dec_calc_subpel_params_and_extend(
     const MV *const src_mv, InterPredParams *const inter_pred_params,
-    MACROBLOCKD *xd, int mi_x, int mi_y, int ref, uint8_t **pre,
-    SubpelParams *subpel_params, int *src_stride) {
+    MACROBLOCKD *const xd, int mi_x, int mi_y, int ref, uint8_t **mc_buf,
+    uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
   PadBlock block;
   MV32 scaled_mv;
   int subpel_x_mv, subpel_y_mv;
@@ -641,26 +646,30 @@
       inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf,
       scaled_mv, block, subpel_x_mv, subpel_y_mv,
       inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc,
-      inter_pred_params->use_hbd_buf, xd->mc_buf[ref], pre, src_stride);
+      inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride);
 }
 
-static void dec_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                       int plane, const MB_MODE_INFO *mi,
+static void dec_build_inter_predictors(const AV1_COMMON *cm,
+                                       DecoderCodingBlock *dcb, int plane,
+                                       const MB_MODE_INFO *mi,
                                        int build_for_obmc, int bw, int bh,
                                        int mi_x, int mi_y) {
-  av1_build_inter_predictors(cm, xd, plane, mi, build_for_obmc, bw, bh, mi_x,
-                             mi_y, dec_calc_subpel_params_and_extend);
+  av1_build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh,
+                             mi_x, mi_y, dcb->mc_buf,
+                             dec_calc_subpel_params_and_extend);
 }
 
 static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm,
-                                                 MACROBLOCKD *xd, int mi_row,
-                                                 int mi_col, BLOCK_SIZE bsize) {
+                                                 DecoderCodingBlock *dcb,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &dcb->xd;
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; ++plane) {
     if (plane && !xd->is_chroma_ref) break;
     const int mi_x = mi_col * MI_SIZE;
     const int mi_y = mi_row * MI_SIZE;
-    dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0,
+    dec_build_inter_predictors(cm, dcb, plane, xd->mi[0], 0,
                                xd->plane[plane].width, xd->plane[plane].height,
                                mi_x, mi_y);
     if (is_interintra_pred(xd->mi[0])) {
@@ -676,7 +685,7 @@
 }
 
 static INLINE void dec_build_prediction_by_above_pred(
-    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
     int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
   const int above_mi_col = xd->mi_col + rel_mi_col;
@@ -691,7 +700,7 @@
   mi_x = above_mi_col << MI_SIZE_LOG2;
   mi_y = xd->mi_row << MI_SIZE_LOG2;
 
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 
   for (int j = 0; j < num_planes; ++j) {
     const struct macroblockd_plane *pd = &xd->plane[j];
@@ -700,15 +709,16 @@
                    block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
 
     if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
-    dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
-                               mi_y);
+    dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
+                               &backup_mbmi, 1, bw, bh, mi_x, mi_y);
   }
 }
 
 static AOM_INLINE void dec_build_prediction_by_above_preds(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE],
-    int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE],
-    int tmp_stride[MAX_MB_PLANE]) {
+    const AV1_COMMON *cm, DecoderCodingBlock *dcb,
+    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+  MACROBLOCKD *const xd = &dcb->xd;
   if (!xd->up_available) return;
 
   // Adjust mb_to_bottom_edge to have the correct value for the OBMC
@@ -717,10 +727,10 @@
   const int this_height = xd->height * MI_SIZE;
   const int pred_height = AOMMIN(this_height / 2, 32);
   xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height);
-  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_right_edge };
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct build_prediction_ctxt ctxt = {
+    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, dcb
+  };
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   foreach_overlappable_nb_above(cm, xd,
                                 max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                 dec_build_prediction_by_above_pred, &ctxt);
@@ -731,7 +741,7 @@
 }
 
 static INLINE void dec_build_prediction_by_left_pred(
-    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
     int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
   const int left_mi_row = xd->mi_row + rel_mi_row;
@@ -745,7 +755,7 @@
                                           &backup_mbmi, ctxt, num_planes);
   mi_x = xd->mi_col << MI_SIZE_LOG2;
   mi_y = left_mi_row << MI_SIZE_LOG2;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 
   for (int j = 0; j < num_planes; ++j) {
     const struct macroblockd_plane *pd = &xd->plane[j];
@@ -754,15 +764,16 @@
     int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
 
     if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
-    dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
-                               mi_y);
+    dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
+                               &backup_mbmi, 1, bw, bh, mi_x, mi_y);
   }
 }
 
 static AOM_INLINE void dec_build_prediction_by_left_preds(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE],
-    int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE],
-    int tmp_stride[MAX_MB_PLANE]) {
+    const AV1_COMMON *cm, DecoderCodingBlock *dcb,
+    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+  MACROBLOCKD *const xd = &dcb->xd;
   if (!xd->left_available) return;
 
   // Adjust mb_to_right_edge to have the correct value for the OBMC
@@ -772,10 +783,10 @@
   const int pred_width = AOMMIN(this_width / 2, 32);
   xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width);
 
-  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_bottom_edge };
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct build_prediction_ctxt ctxt = {
+    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, dcb
+  };
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   foreach_overlappable_nb_left(cm, xd,
                                max_neighbor_obmc[mi_size_high_log2[bsize]],
                                dec_build_prediction_by_left_pred, &ctxt);
@@ -785,33 +796,8 @@
   xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
 }
 
-static void set_dst_buf(MACROBLOCKD *xd, uint8_t **dst_buf1,
-                        uint8_t **dst_buf2) {
-  dst_buf1[0] = xd->tmp_obmc_bufs[0];
-  dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
-  dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
-  dst_buf2[0] = xd->tmp_obmc_bufs[1];
-  dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
-  dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static void set_dst_buf_highbd(MACROBLOCKD *xd, uint8_t **dst_buf1,
-                               uint8_t **dst_buf2) {
-  int len = sizeof(uint16_t);
-  dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
-  dst_buf1[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
-  dst_buf1[2] =
-      CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
-  dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
-  dst_buf2[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
-  dst_buf2[2] =
-      CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
-}
-#endif
-
-static AOM_INLINE void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
-                                                          MACROBLOCKD *xd) {
+static AOM_INLINE void dec_build_obmc_inter_predictors_sb(
+    const AV1_COMMON *cm, DecoderCodingBlock *dcb) {
   const int num_planes = av1_num_planes(cm);
   uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
   int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -821,24 +807,17 @@
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-#if CONFIG_AV1_HIGHBITDEPTH
-  if (is_cur_buf_hbd(xd)) {
-    set_dst_buf_highbd(xd, dst_buf1, dst_buf2);
-  } else {
-    set_dst_buf(xd, dst_buf1, dst_buf2);
-  }
-#else
-  set_dst_buf(xd, dst_buf1, dst_buf2);
-#endif
+  MACROBLOCKD *const xd = &dcb->xd;
+  av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
 
-  dec_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1,
-                                      dst_stride1);
-  dec_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
+  dec_build_prediction_by_above_preds(cm, dcb, dst_buf1, dst_width1,
+                                      dst_height1, dst_stride1);
+  dec_build_prediction_by_left_preds(cm, dcb, dst_buf2, dst_width2, dst_height2,
                                      dst_stride2);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf,
-                       mi_row, mi_col, 0, num_planes);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
+                       mi_col, 0, num_planes);
   av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
                                   dst_stride2);
 }
@@ -847,13 +826,14 @@
                                              MACROBLOCKD *const xd) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   if (store_cfl_required(cm, xd)) {
-    cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
+    cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
   }
 }
 
 static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm,
-                                           MACROBLOCKD *const xd,
+                                           DecoderCodingBlock *dcb,
                                            BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   const int num_planes = av1_num_planes(cm);
   const int mi_row = xd->mi_row;
@@ -875,9 +855,9 @@
     }
   }
 
-  dec_build_inter_predictor(cm, xd, mi_row, mi_col, bsize);
+  dec_build_inter_predictor(cm, dcb, mi_row, mi_col, bsize);
   if (mbmi->motion_mode == OBMC_CAUSAL) {
-    dec_build_obmc_inter_predictors_sb(cm, xd);
+    dec_build_obmc_inter_predictors_sb(cm, dcb);
   }
 #if CONFIG_MISMATCH_DEBUG
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -901,7 +881,7 @@
   (void)r;
   Av1ColorMapParam params;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
                            &params.plane_height, NULL, NULL);
   xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
 }
@@ -911,7 +891,8 @@
                                                 aom_reader *r,
                                                 BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd = &td->xd;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
   const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *mbmi = xd->mi[0];
 
@@ -945,20 +926,20 @@
                blk_row += stepr) {
             for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
                  blk_col += stepc) {
-              td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row,
+              td->read_coeffs_tx_intra_block_visit(cm, dcb, r, plane, blk_row,
                                                    blk_col, tx_size);
-              td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row,
-                                                      blk_col, tx_size);
-              set_cb_buffer_offsets(xd, tx_size, plane);
+              td->predict_and_recon_intra_block_visit(
+                  cm, dcb, r, plane, blk_row, blk_col, tx_size);
+              set_cb_buffer_offsets(dcb, tx_size, plane);
             }
           }
         }
       }
     }
   } else {
-    td->predict_inter_block_visit(cm, xd, bsize);
+    td->predict_inter_block_visit(cm, dcb, bsize);
     // Reconstruction
-    if (!mbmi->skip) {
+    if (!mbmi->skip_txfm) {
       int eobtotal = 0;
 
       const int max_blocks_wide = max_block_wide(xd, bsize, 0);
@@ -1042,7 +1023,7 @@
                                           aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   int is_split = 0;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int max_blocks_high = max_block_high(xd, bsize, 0);
   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
@@ -1066,7 +1047,7 @@
 
   const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
                                          xd->left_txfm_context + blk_row,
-                                         mbmi->sb_type, tx_size);
+                                         mbmi->bsize, tx_size);
   is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);
 
   if (is_split) {
@@ -1127,7 +1108,7 @@
                                      aom_reader *r) {
   // TODO(debargha): Clean up the logic here. This function should only
   // be called for intra.
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
   const int max_depths = bsize_to_max_depth(bsize);
   const int ctx = get_tx_size_context(xd);
@@ -1142,7 +1123,7 @@
 static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode,
                             int is_inter, int allow_select_inter,
                             aom_reader *r) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
 
   if (block_signals_txsize(bsize)) {
@@ -1163,8 +1144,9 @@
                                           int mi_col, aom_reader *r,
                                           PARTITION_TYPE partition,
                                           BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &td->xd;
-  decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize);
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
+  decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize);
 
   av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens);
 
@@ -1173,7 +1155,7 @@
   MB_MODE_INFO *mbmi = xd->mi[0];
   int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
   if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
-      !mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
+      !mbmi->skip_txfm && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
     const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
     const int bh = tx_size_high_unit[max_tx_size];
     const int bw = tx_size_wide_unit[max_tx_size];
@@ -1188,12 +1170,12 @@
 #endif
                            idy, idx, r);
   } else {
-    mbmi->tx_size =
-        read_tx_size(xd, cm->features.tx_mode, inter_block_tx, !mbmi->skip, r);
+    mbmi->tx_size = read_tx_size(xd, cm->features.tx_mode, inter_block_tx,
+                                 !mbmi->skip_txfm, r);
     if (inter_block_tx)
       memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
     set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
-                  mbmi->skip && is_inter_block(mbmi), xd);
+                  mbmi->skip_txfm && is_inter_block(mbmi), xd);
 #if CONFIG_LPF_MASK
     const int w = mi_size_wide[bsize];
     const int h = mi_size_high[bsize];
@@ -1227,7 +1209,7 @@
   if (cm->delta_q_info.delta_q_present_flag) {
     for (int i = 0; i < MAX_SEGMENTS; i++) {
       const int current_qindex =
-          av1_get_qindex(&cm->seg, i, xd->current_qindex);
+          av1_get_qindex(&cm->seg, i, xd->current_base_qindex);
       const CommonQuantParams *const quant_params = &cm->quant_params;
       for (int j = 0; j < num_planes; ++j) {
         const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q
@@ -1243,7 +1225,7 @@
       }
     }
   }
-  if (mbmi->skip) av1_reset_entropy_context(xd, bsize, num_planes);
+  if (mbmi->skip_txfm) av1_reset_entropy_context(xd, bsize, num_planes);
 
   decode_token_recon_block(pbi, td, r, bsize);
 }
@@ -1254,7 +1236,8 @@
                                                       BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  MACROBLOCKD *const xd = &td->xd;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int num_planes = av1_num_planes(cm);
@@ -1324,7 +1307,8 @@
                                         int parse_decode_flag) {
   assert(bsize < BLOCK_SIZES_ALL);
   AV1_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd = &td->xd;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
   const int bw = mi_size_wide[bsize];
   const int hbs = bw >> 1;
   PARTITION_TYPE partition;
@@ -1346,6 +1330,7 @@
                                                      parse_decode_block };
 
   if (parse_decode_flag & 1) {
+#if !CONFIG_REALTIME_ONLY
     const int num_planes = av1_num_planes(cm);
     for (int plane = 0; plane < num_planes; ++plane) {
       int rcol0, rcol1, rrow0, rrow1;
@@ -1360,6 +1345,7 @@
         }
       }
     }
+#endif
 
     partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
                                     : read_partition(xd, mi_row, mi_col, reader,
@@ -1482,9 +1468,10 @@
 
   seg->enabled = aom_rb_read_bit(rb);
   if (!seg->enabled) {
-    if (cm->cur_frame->seg_map)
+    if (cm->cur_frame->seg_map) {
       memset(cm->cur_frame->seg_map, 0,
-             (cm->mi_params.mi_rows * cm->mi_params.mi_cols));
+             (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
+    }
 
     memset(seg, 0, sizeof(*seg));
     segfeatures_copy(&cm->cur_frame->seg, seg);
@@ -1602,6 +1589,7 @@
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void read_wiener_filter(int wiener_win,
                                           WienerInfo *wiener_info,
                                           WienerInfo *ref_wiener_info,
@@ -1705,7 +1693,7 @@
     int runit_idx) {
   const RestorationInfo *rsi = &cm->rst_info[plane];
   RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
-  if (rsi->frame_restoration_type == RESTORE_NONE) return;
+  assert(rsi->frame_restoration_type != RESTORE_NONE);
 
   assert(!cm->features.all_lossless);
 
@@ -1742,6 +1730,7 @@
     }
   }
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
                                         struct aom_read_bit_buffer *rb) {
@@ -2465,7 +2454,7 @@
   }
 }
 
-static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd,
+static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb,
                                      CB_BUFFER *cb_buffer_base,
                                      const int num_planes, int mi_row,
                                      int mi_col) {
@@ -2476,11 +2465,12 @@
   CB_BUFFER *cb_buffer = cb_buffer_base + offset;
 
   for (int plane = 0; plane < num_planes; ++plane) {
-    xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane];
-    xd->plane[plane].eob_data = cb_buffer->eob_data[plane];
-    xd->cb_offset[plane] = 0;
-    xd->txb_offset[plane] = 0;
+    dcb->dqcoeff_block[plane] = cb_buffer->dqcoeff[plane];
+    dcb->eob_data[plane] = cb_buffer->eob_data[plane];
+    dcb->cb_offset[plane] = 0;
+    dcb->txb_offset[plane] = 0;
   }
+  MACROBLOCKD *const xd = &dcb->xd;
   xd->plane[0].color_index_map = cb_buffer->color_index_map[0];
   xd->plane[1].color_index_map = cb_buffer->color_index_map[1];
   xd->color_index_map_offset[0] = 0;
@@ -2643,7 +2633,7 @@
 
   for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
        mi_col += cm->seq_params.mib_size, sb_col_in_tile++) {
-    set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+    set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row,
                   mi_col);
 
     sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile);
@@ -2711,25 +2701,28 @@
 
   av1_tile_set_row(&tile_info, cm, tile_row);
   av1_tile_set_col(&tile_info, cm, tile_col);
-  av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
-                         tile_info.mi_col_end, tile_row);
-  av1_reset_loop_filter_delta(&td->xd, num_planes);
-  av1_reset_loop_restoration(&td->xd, num_planes);
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
+
+  av1_zero_above_context(cm, xd, tile_info.mi_col_start, tile_info.mi_col_end,
+                         tile_row);
+  av1_reset_loop_filter_delta(xd, num_planes);
+  av1_reset_loop_restoration(xd, num_planes);
 
   for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
        mi_row += cm->seq_params.mib_size) {
-    av1_zero_left_context(&td->xd);
+    av1_zero_left_context(xd);
 
     for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
          mi_col += cm->seq_params.mib_size) {
-      set_cb_buffer(pbi, &td->xd, &td->cb_buffer_base, num_planes, 0, 0);
+      set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0);
 
       // Bit-stream parsing and decoding of the superblock
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
                        cm->seq_params.sb_size, 0x3);
 
       if (aom_reader_has_overflowed(td->bit_reader)) {
-        aom_merge_corrupted_flag(&td->xd.corrupted, 1);
+        aom_merge_corrupted_flag(&dcb->corrupted, 1);
         return;
       }
     }
@@ -2737,7 +2730,7 @@
 
   int corrupted =
       (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
-  aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
+  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
 }
 
 static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
@@ -2816,13 +2809,14 @@
   set_decode_func_pointers(&pbi->td, 0x3);
 
   // Load all tile information into thread_data.
-  td->xd = pbi->mb;
-  td->xd.corrupted = 0;
-  td->xd.mc_buf[0] = td->mc_buf[0];
-  td->xd.mc_buf[1] = td->mc_buf[1];
-  td->xd.tmp_conv_dst = td->tmp_conv_dst;
+  td->dcb = pbi->dcb;
+
+  td->dcb.corrupted = 0;
+  td->dcb.mc_buf[0] = td->mc_buf[0];
+  td->dcb.mc_buf[1] = td->mc_buf[1];
+  td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst;
   for (int j = 0; j < 2; ++j) {
-    td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
+    td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
   }
 
   for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
@@ -2839,8 +2833,8 @@
 
       td->bit_reader = &tile_data->bit_reader;
       av1_zero(td->cb_buffer_base.dqcoeff);
-      av1_tile_init(&td->xd.tile, cm, row, col);
-      td->xd.current_qindex = cm->quant_params.base_qindex;
+      av1_tile_init(&td->dcb.xd.tile, cm, row, col);
+      td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex;
       setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
                          &cm->error, td->bit_reader, allow_update_cdf);
 #if CONFIG_ACCOUNTING
@@ -2852,18 +2846,18 @@
         td->bit_reader->accounting = NULL;
       }
 #endif
-      av1_init_macroblockd(cm, &td->xd, NULL);
+      av1_init_macroblockd(cm, &td->dcb.xd);
       av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row,
-                             &td->xd);
+                             &td->dcb.xd);
 
       // Initialise the tile context from the frame context
       tile_data->tctx = *cm->fc;
-      td->xd.tile_ctx = &tile_data->tctx;
+      td->dcb.xd.tile_ctx = &tile_data->tctx;
 
       // decode tile
       decode_tile(pbi, td, row, col);
-      aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
-      if (pbi->mb.corrupted)
+      aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted);
+      if (pbi->dcb.corrupted)
         aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                            "Failed to decode tile data");
     }
@@ -2910,8 +2904,10 @@
 
   td->bit_reader = &tile_data->bit_reader;
   av1_zero(td->cb_buffer_base.dqcoeff);
-  av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
-  td->xd.current_qindex = cm->quant_params.base_qindex;
+
+  MACROBLOCKD *const xd = &td->dcb.xd;
+  av1_tile_init(&xd->tile, cm, tile_row, tile_col);
+  xd->current_base_qindex = cm->quant_params.base_qindex;
   setup_bool_decoder(tile_buffer->data, thread_data->data_end,
                      tile_buffer->size, &thread_data->error_info,
                      td->bit_reader, allow_update_cdf);
@@ -2924,14 +2920,13 @@
     td->bit_reader->accounting = NULL;
   }
 #endif
-  av1_init_macroblockd(cm, &td->xd, NULL);
-  td->xd.error_info = &thread_data->error_info;
-  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
-                         &td->xd);
+  av1_init_macroblockd(cm, xd);
+  xd->error_info = &thread_data->error_info;
+  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, xd);
 
   // Initialise the tile context from the frame context
   tile_data->tctx = *cm->fc;
-  td->xd.tile_ctx = &tile_data->tctx;
+  xd->tile_ctx = &tile_data->tctx;
 #if CONFIG_ACCOUNTING
   if (pbi->acct_enabled) {
     tile_data->bit_reader.accounting->last_tell_frac =
@@ -2952,7 +2947,7 @@
   // before it returns.
   if (setjmp(thread_data->error_info.jmp)) {
     thread_data->error_info.setjmp = 0;
-    thread_data->td->xd.corrupted = 1;
+    thread_data->td->dcb.corrupted = 1;
     return 0;
   }
   thread_data->error_info.setjmp = 1;
@@ -2963,7 +2958,7 @@
   set_decode_func_pointers(td, 0x3);
 
   assert(cm->tiles.cols > 0);
-  while (!td->xd.corrupted) {
+  while (!td->dcb.corrupted) {
     TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 
     if (cur_job_info != NULL) {
@@ -2980,7 +2975,7 @@
     }
   }
   thread_data->error_info.setjmp = 0;
-  return !td->xd.corrupted;
+  return !td->dcb.corrupted;
 }
 
 static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm,
@@ -3143,27 +3138,28 @@
   const int num_planes = av1_num_planes(cm);
   TileInfo tile_info = tile_data->tile_info;
   int tile_row = tile_info.tile_row;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
 
-  av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
-                         tile_info.mi_col_end, tile_row);
-  av1_reset_loop_filter_delta(&td->xd, num_planes);
-  av1_reset_loop_restoration(&td->xd, num_planes);
+  av1_zero_above_context(cm, xd, tile_info.mi_col_start, tile_info.mi_col_end,
+                         tile_row);
+  av1_reset_loop_filter_delta(xd, num_planes);
+  av1_reset_loop_restoration(xd, num_planes);
 
   for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
        mi_row += cm->seq_params.mib_size) {
-    av1_zero_left_context(&td->xd);
+    av1_zero_left_context(xd);
 
     for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
          mi_col += cm->seq_params.mib_size) {
-      set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
-                    mi_col);
+      set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col);
 
       // Bit-stream parsing of the superblock
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
                        cm->seq_params.sb_size, 0x1);
 
       if (aom_reader_has_overflowed(td->bit_reader)) {
-        aom_merge_corrupted_flag(&td->xd.corrupted, 1);
+        aom_merge_corrupted_flag(&dcb->corrupted, 1);
         return;
       }
     }
@@ -3172,7 +3168,7 @@
 
   int corrupted =
       (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
-  aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
+  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
 }
 
 static int row_mt_worker_hook(void *arg1, void *arg2) {
@@ -3182,14 +3178,14 @@
   ThreadData *const td = thread_data->td;
   uint8_t allow_update_cdf;
   AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
-  td->xd.corrupted = 0;
+  td->dcb.corrupted = 0;
 
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
   if (setjmp(thread_data->error_info.jmp)) {
     thread_data->error_info.setjmp = 0;
-    thread_data->td->xd.corrupted = 1;
+    thread_data->td->dcb.corrupted = 1;
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(pbi->row_mt_mutex_);
 #endif
@@ -3208,7 +3204,7 @@
   set_decode_func_pointers(td, 0x1);
 
   assert(cm->tiles.cols > 0);
-  while (!td->xd.corrupted) {
+  while (!td->dcb.corrupted) {
     TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 
     if (cur_job_info != NULL) {
@@ -3237,7 +3233,7 @@
     }
   }
 
-  if (td->xd.corrupted) {
+  if (td->dcb.corrupted) {
     thread_data->error_info.setjmp = 0;
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(pbi->row_mt_mutex_);
@@ -3279,9 +3275,9 @@
     AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
     TileInfo tile_info = tile_data->tile_info;
 
-    av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
-    av1_init_macroblockd(cm, &td->xd, NULL);
-    td->xd.error_info = &thread_data->error_info;
+    av1_tile_init(&td->dcb.xd.tile, cm, tile_row, tile_col);
+    av1_init_macroblockd(cm, &td->dcb.xd);
+    td->dcb.xd.error_info = &thread_data->error_info;
 
     decode_tile_sb_row(pbi, td, tile_info, mi_row);
 
@@ -3294,7 +3290,7 @@
 #endif
   }
   thread_data->error_info.setjmp = 0;
-  return !td->xd.corrupted;
+  return !td->dcb.corrupted;
 }
 
 // sorts in descending order
@@ -3370,13 +3366,20 @@
                                            ThreadData *thread_data,
                                            int buf_size, int use_highbd) {
   for (int ref = 0; ref < 2; ref++) {
+    // The mc_buf/hbd_mc_buf must be zeroed to fix a intermittent valgrind error
+    // 'Conditional jump or move depends on uninitialised value' from the loop
+    // filter. Uninitialized reads in convolve function (e.g. horiz_4tap path in
+    // av1_convolve_2d_sr_avx2()) from mc_buf/hbd_mc_buf are seen to be the
+    // potential reason for this issue.
     if (use_highbd) {
       uint16_t *hbd_mc_buf;
       CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
+      memset(hbd_mc_buf, 0, buf_size);
       thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
     } else {
       CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref],
                       (uint8_t *)aom_memalign(16, buf_size));
+      memset(thread_data->mc_buf[ref], 0, buf_size);
     }
   }
   thread_data->mc_buf_size = buf_size;
@@ -3402,13 +3405,14 @@
   for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
     AVxWorker *const worker = &pbi->tile_workers[worker_idx];
     DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
-    thread_data->td->xd = pbi->mb;
-    thread_data->td->xd.corrupted = 0;
-    thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0];
-    thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1];
-    thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+    thread_data->td->dcb = pbi->dcb;
+    thread_data->td->dcb.corrupted = 0;
+    thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0];
+    thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1];
+    thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
     for (int j = 0; j < 2; ++j) {
-      thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j];
+      thread_data->td->dcb.xd.tmp_obmc_bufs[j] =
+          thread_data->td->tmp_obmc_bufs[j];
     }
     winterface->sync(worker);
 
@@ -3452,7 +3456,7 @@
     aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
   }
 
-  pbi->mb.corrupted = corrupted;
+  pbi->dcb.corrupted = corrupted;
 }
 
 static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
@@ -3600,7 +3604,7 @@
   launch_dec_workers(pbi, data_end, num_workers);
   sync_dec_workers(pbi, num_workers);
 
-  if (pbi->mb.corrupted)
+  if (pbi->dcb.corrupted)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Failed to decode tile data");
 
@@ -3805,7 +3809,7 @@
   launch_dec_workers(pbi, data_end, num_workers);
   sync_dec_workers(pbi, num_workers);
 
-  if (pbi->mb.corrupted)
+  if (pbi->dcb.corrupted)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Failed to decode tile data");
 
@@ -4310,10 +4314,14 @@
                        trans_dec_factor;
   }
 
+#if !CONFIG_REALTIME_ONLY
+  // For realtime only build, warped motion is disabled, so this section is not
+  // needed.
   if (params->wmtype <= AFFINE) {
     int good_shear_params = av1_get_shear_params(params);
     if (!good_shear_params) return 0;
   }
+#endif
 
   return 1;
 }
@@ -4440,9 +4448,12 @@
   const SequenceHeader *const seq_params = &cm->seq_params;
   CurrentFrame *const current_frame = &cm->current_frame;
   FeatureFlags *const features = &cm->features;
-  MACROBLOCKD *const xd = &pbi->mb;
+  MACROBLOCKD *const xd = &pbi->dcb.xd;
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
+  aom_s_frame_info *sframe_info = &pbi->sframe_info;
+  sframe_info->is_s_frame = 0;
+  sframe_info->is_s_frame_at_altref = 0;
 
   if (!pbi->sequence_header_ready) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -4548,6 +4559,13 @@
     }
 
     cm->show_frame = aom_rb_read_bit(rb);
+    if (cm->show_frame == 0) pbi->is_arf_frame_present = 1;
+    if (cm->show_frame == 0 && cm->current_frame.frame_type == KEY_FRAME)
+      pbi->is_fwd_kf_present = 1;
+    if (cm->current_frame.frame_type == S_FRAME) {
+      sframe_info->is_s_frame = 1;
+      sframe_info->is_s_frame_at_altref = cm->show_frame ? 0 : 1;
+    }
     if (seq_params->still_picture &&
         (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) {
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -4989,7 +5007,7 @@
   cm->delta_q_info.delta_q_present_flag =
       quant_params->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
   if (cm->delta_q_info.delta_q_present_flag) {
-    xd->current_qindex = quant_params->base_qindex;
+    xd->current_base_qindex = quant_params->base_qindex;
     cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2);
     if (!features->allow_intrabc)
       cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb);
@@ -5092,6 +5110,7 @@
   return (BITSTREAM_PROFILE)profile;
 }
 
+#if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void superres_post_decode(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
@@ -5101,6 +5120,7 @@
 
   av1_superres_upscale(cm, pool);
 }
+#endif
 
 uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
                                             struct aom_read_bit_buffer *rb,
@@ -5109,10 +5129,10 @@
                                             int trailing_bits_present) {
   AV1_COMMON *const cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &pbi->mb;
+  MACROBLOCKD *const xd = &pbi->dcb.xd;
 
 #if CONFIG_BITSTREAM_DEBUG
-  aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number * 2 +
+  aom_bitstream_queue_set_frame_read(cm->current_frame.order_hint * 2 +
                                      cm->show_frame);
 #endif
 #if CONFIG_MISMATCH_DEBUG
@@ -5174,7 +5194,7 @@
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Uninitialized entropy context.");
 
-  xd->corrupted = 0;
+  pbi->dcb.corrupted = 0;
   return uncomp_hdr_size;
 }
 
@@ -5182,11 +5202,13 @@
 static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
 
+#if !CONFIG_REALTIME_ONLY
   if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
       cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
       cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
     av1_alloc_restoration_buffers(cm);
   }
+#endif
   const int use_highbd = cm->seq_params.use_highbitdepth;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
   if (pbi->td.mc_buf_size != buf_size) {
@@ -5201,7 +5223,7 @@
                                     int end_tile, int initialize_flag) {
   AV1_COMMON *const cm = &pbi->common;
   CommonTileParams *const tiles = &cm->tiles;
-  MACROBLOCKD *const xd = &pbi->mb;
+  MACROBLOCKD *const xd = &pbi->dcb.xd;
   const int tile_count_tg = end_tile - start_tile + 1;
 
   if (initialize_flag) setup_frame_info(pbi);
@@ -5233,13 +5255,13 @@
     if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
       if (pbi->num_workers > 1) {
         av1_loop_filter_frame_mt(
-            &cm->cur_frame->buf, cm, &pbi->mb, 0, num_planes, 0,
+            &cm->cur_frame->buf, cm, &pbi->dcb.xd, 0, num_planes, 0,
 #if CONFIG_LPF_MASK
             1,
 #endif
             pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync);
       } else {
-        av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->mb,
+        av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->dcb.xd,
 #if CONFIG_LPF_MASK
                               1,
 #endif
@@ -5247,10 +5269,6 @@
       }
     }
 
-    const int do_loop_restoration =
-        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
-        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
     const int do_cdef =
         !pbi->skip_loop_filter && !cm->features.coded_lossless &&
         (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] ||
@@ -5258,12 +5276,19 @@
     const int do_superres = av1_superres_scaled(cm);
     const int optimized_loop_restoration = !do_cdef && !do_superres;
 
+#if !CONFIG_REALTIME_ONLY
+    const int do_loop_restoration =
+        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
     if (!optimized_loop_restoration) {
       if (do_loop_restoration)
         av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf,
                                                  cm, 0);
 
-      if (do_cdef) av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->mb);
+      if (do_cdef) {
+        av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd);
+      }
 
       superres_post_decode(pbi);
 
@@ -5297,12 +5322,19 @@
         }
       }
     }
+#else
+    if (!optimized_loop_restoration) {
+      if (do_cdef) {
+        av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd);
+      }
+    }
+#endif  // !CONFIG_REALTIME_ONLY
   }
 #if CONFIG_LPF_MASK
   av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
 #endif
 
-  if (!xd->corrupted) {
+  if (!pbi->dcb.corrupted) {
     if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
       assert(pbi->context_update_tile_id < pbi->allocated_tiles);
       *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx;
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index e97cec4..412be86 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -37,7 +37,7 @@
 }
 
 static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
-  const int skip = xd->mi[0]->skip;
+  const int skip_txfm = xd->mi[0]->skip_txfm;
   if (cm->features.coded_lossless) return;
   if (cm->features.allow_intrabc) {
     assert(cm->cdef_info.cdef_bits == 0);
@@ -66,7 +66,7 @@
                         : 0;
 
   // Read CDEF strength from the first non-skip coding block in this CDEF unit.
-  if (!xd->cdef_transmitted[index] && !skip) {
+  if (!xd->cdef_transmitted[index] && !skip_txfm) {
     // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO
     // of the 1st block in this CDEF unit.
     const int first_block_mask = ~(cdef_size - 1);
@@ -84,13 +84,13 @@
 static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
                              aom_reader *r, MB_MODE_INFO *const mbmi) {
   int sign, abs, reduced_delta_qindex = 0;
-  BLOCK_SIZE bsize = mbmi->sb_type;
+  BLOCK_SIZE bsize = mbmi->bsize;
   const int b_col = xd->mi_col & (cm->seq_params.mib_size - 1);
   const int b_row = xd->mi_row & (cm->seq_params.mib_size - 1);
   const int read_delta_q_flag = (b_col == 0 && b_row == 0);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-  if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
+  if ((bsize != cm->seq_params.sb_size || mbmi->skip_txfm == 0) &&
       read_delta_q_flag) {
     abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
     const int smallval = (abs < DELTA_Q_SMALL);
@@ -116,12 +116,12 @@
                               const MB_MODE_INFO *const mbmi, int mi_col,
                               int mi_row) {
   int reduced_delta_lflevel = 0;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int b_col = mi_col & (cm->seq_params.mib_size - 1);
   const int b_row = mi_row & (cm->seq_params.mib_size - 1);
   const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
 
-  if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
+  if ((bsize != cm->seq_params.sb_size || mbmi->skip_txfm == 0) &&
       read_delta_lf_flag) {
     int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
     const int smallval = (abs < DELTA_LF_SMALL);
@@ -193,13 +193,14 @@
     return NEARMV;
 }
 
-static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd,
+static void read_drl_idx(FRAME_CONTEXT *ec_ctx, DecoderCodingBlock *dcb,
                          MB_MODE_INFO *mbmi, aom_reader *r) {
+  MACROBLOCKD *const xd = &dcb->xd;
   uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   mbmi->ref_mv_idx = 0;
   if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
     for (int idx = 0; idx < 2; ++idx) {
-      if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+      if (dcb->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
         int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
         mbmi->ref_mv_idx = idx + drl_idx;
@@ -212,7 +213,7 @@
     // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
     // mode is factored in.
     for (int idx = 1; idx < 3; ++idx) {
-      if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+      if (dcb->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
         int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
         mbmi->ref_mv_idx = idx + drl_idx - 1;
@@ -235,12 +236,11 @@
 
   if (last_motion_mode_allowed == OBMC_CAUSAL) {
     motion_mode =
-        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR);
+        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->bsize], 2, ACCT_STR);
     return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
   } else {
-    motion_mode =
-        aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
-                        MOTION_MODES, ACCT_STR);
+    motion_mode = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
+                                  MOTION_MODES, ACCT_STR);
     return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
   }
 }
@@ -367,8 +367,8 @@
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
-  const int bw = mi_size_wide[mbmi->sb_type];
-  const int bh = mi_size_high[mbmi->sb_type];
+  const int bw = mi_size_wide[mbmi->bsize];
+  const int bh = mi_size_high[mbmi->bsize];
 
   // TODO(slavarnway): move x_mis, y_mis into xd ?????
   const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
@@ -386,7 +386,7 @@
   if (preskip) {
     if (!seg->segid_preskip) return 0;
   } else {
-    if (mbmi->skip) {
+    if (mbmi->skip_txfm) {
       if (seg->temporal_update) {
         mbmi->seg_id_predicted = 0;
       }
@@ -422,7 +422,7 @@
     return 0;
   }
 
-  if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return 0;
+  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return 0;
 
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
       segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
@@ -439,15 +439,16 @@
   return skip_mode;
 }
 
-static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
-                     aom_reader *r) {
+static int read_skip_txfm(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+                          aom_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int ctx = av1_get_skip_context(xd);
+    const int ctx = av1_get_skip_txfm_context(xd);
     FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-    const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR);
-    return skip;
+    const int skip_txfm =
+        aom_read_symbol(r, ec_ctx->skip_txfm_cdfs[ctx], 2, ACCT_STR);
+    return skip_txfm;
   }
 }
 
@@ -563,7 +564,7 @@
                                    aom_reader *r) {
   const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
@@ -608,7 +609,7 @@
 
   if (av1_filter_intra_allowed(cm, mbmi)) {
     filter_intra_mode_info->use_filter_intra = aom_read_symbol(
-        r, xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2, ACCT_STR);
+        r, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2, ACCT_STR);
     if (filter_intra_mode_info->use_filter_intra) {
       filter_intra_mode_info->filter_intra_mode = aom_read_symbol(
           r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR);
@@ -626,7 +627,8 @@
   *tx_type = DCT_DCT;
 
   // No need to read transform type if block is skipped.
-  if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+  if (mbmi->skip_txfm ||
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
     return;
 
   // No need to read transform type for lossless mode(qindex==0).
@@ -684,13 +686,14 @@
   return valid;
 }
 
-static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+static void read_intrabc_info(AV1_COMMON *const cm, DecoderCodingBlock *dcb,
                               aom_reader *r) {
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
   if (mbmi->use_intrabc) {
-    BLOCK_SIZE bsize = mbmi->sb_type;
+    BLOCK_SIZE bsize = mbmi->bsize;
     mbmi->mode = DC_PRED;
     mbmi->uv_mode = UV_DC_PRED;
     mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
@@ -699,7 +702,7 @@
     int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
     int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
 
-    av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count,
+    av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count,
                      xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL,
                      inter_mode_ctx);
 
@@ -731,10 +734,10 @@
 
   if (delta_q_info->delta_q_present_flag) {
     MB_MODE_INFO *const mbmi = xd->mi[0];
-    xd->current_qindex +=
+    xd->current_base_qindex +=
         read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res;
     /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
-    xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+    xd->current_base_qindex = clamp(xd->current_base_qindex, 1, MAXQ);
     FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
     if (delta_q_info->delta_lf_present_flag) {
       const int mi_row = xd->mi_row;
@@ -764,11 +767,12 @@
 }
 
 static void read_intra_frame_mode_info(AV1_COMMON *const cm,
-                                       MACROBLOCKD *const xd, aom_reader *r) {
+                                       DecoderCodingBlock *dcb, aom_reader *r) {
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const MB_MODE_INFO *above_mi = xd->above_mbmi;
   const MB_MODE_INFO *left_mi = xd->left_mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   struct segmentation *const seg = &cm->seg;
 
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
@@ -776,16 +780,16 @@
   if (seg->segid_preskip)
     mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0);
 
-  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+  mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
 
   if (!seg->segid_preskip)
-    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip);
+    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip_txfm);
 
   read_cdef(cm, r, xd);
 
   read_delta_q_params(cm, xd, r);
 
-  mbmi->current_qindex = xd->current_qindex;
+  mbmi->current_qindex = xd->current_base_qindex;
 
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
@@ -800,7 +804,7 @@
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   if (av1_allow_intrabc(cm)) {
-    read_intrabc_info(cm, xd, r);
+    read_intrabc_info(cm, dcb, r);
     if (is_intrabc_block(mbmi)) return;
   }
 
@@ -896,7 +900,7 @@
 static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm,
                                                 const MACROBLOCKD *xd,
                                                 aom_reader *r) {
-  if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return SINGLE_REFERENCE;
+  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return SINGLE_REFERENCE;
   if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = av1_get_reference_mode_context(xd);
     const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol(
@@ -1058,7 +1062,7 @@
                                        MACROBLOCKD *const xd,
                                        MB_MODE_INFO *const mbmi,
                                        aom_reader *r) {
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int use_angle_delta = av1_use_angle_delta(bsize);
 
   mbmi->ref_frame[0] = INTRA_FRAME;
@@ -1111,7 +1115,7 @@
                             aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  BLOCK_SIZE bsize = mbmi->sb_type;
+  BLOCK_SIZE bsize = mbmi->bsize;
   FeatureFlags *const features = &cm->features;
   if (features->cur_frame_force_integer_mv) {
     allow_hp = MV_SUBPEL_NONE;
@@ -1261,17 +1265,18 @@
 #endif  // DEC_MISMATCH_DEBUG
 
 static void read_inter_block_mode_info(AV1Decoder *const pbi,
-                                       MACROBLOCKD *const xd,
+                                       DecoderCodingBlock *dcb,
                                        MB_MODE_INFO *const mbmi,
                                        aom_reader *r) {
   AV1_COMMON *const cm = &pbi->common;
   FeatureFlags *const features = &cm->features;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int allow_hp = features->allow_high_precision_mv;
   int_mv nearestmv[2], nearmv[2];
   int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
   int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
   int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+  MACROBLOCKD *const xd = &dcb->xd;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   mbmi->uv_mode = UV_DC_PRED;
@@ -1284,7 +1289,7 @@
   const int is_compound = has_second_ref(mbmi);
 
   const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
-  av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack,
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack,
                    xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx);
 
   mbmi->ref_mv_idx = 0;
@@ -1305,7 +1310,7 @@
         mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx);
       if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
           have_nearmv_in_inter_mode(mbmi->mode))
-        read_drl_idx(ec_ctx, xd, mbmi, r);
+        read_drl_idx(ec_ctx, dcb, mbmi, r);
     }
   }
 
@@ -1357,7 +1362,7 @@
       ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
   } else {
     if (mbmi->mode == NEWMV) {
-      if (xd->ref_mv_count[ref_frame] > 1)
+      if (dcb->ref_mv_count[ref_frame] > 1)
         ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv;
     }
   }
@@ -1367,7 +1372,7 @@
   const int mv_corrupted_flag =
       !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
                  nearestmv, nearmv, is_compound, allow_hp, r);
-  aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag);
+  aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag);
 
   mbmi->use_wedge_interintra = 0;
   if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode &&
@@ -1401,7 +1406,7 @@
   }
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
-  if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
+  if (is_motion_variation_allowed_bsize(mbmi->bsize) && !mbmi->skip_mode &&
       !has_second_ref(mbmi)) {
     mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
   }
@@ -1470,10 +1475,10 @@
   read_mb_interp_filter(xd, features->interp_filter,
                         cm->seq_params.enable_dual_filter, mbmi, r);
 
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-
+#if !CONFIG_REALTIME_ONLY
   if (mbmi->motion_mode == WARPED_CAUSAL) {
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
     mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
     mbmi->wm_params.invalid = 0;
 
@@ -1491,6 +1496,7 @@
       mbmi->wm_params.invalid = 1;
     }
   }
+#endif
 
   xd->cfl.store_y = store_cfl_required(cm, xd);
 
@@ -1500,8 +1506,9 @@
 }
 
 static void read_inter_frame_mode_info(AV1Decoder *const pbi,
-                                       MACROBLOCKD *const xd, aom_reader *r) {
+                                       DecoderCodingBlock *dcb, aom_reader *r) {
   AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   int inter_block = 1;
 
@@ -1512,9 +1519,9 @@
   mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
 
   if (mbmi->skip_mode)
-    mbmi->skip = 1;
+    mbmi->skip_txfm = 1;
   else
-    mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+    mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
 
   if (!cm->seg.segid_preskip)
     mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r);
@@ -1526,7 +1533,7 @@
   if (!mbmi->skip_mode)
     inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
 
-  mbmi->current_qindex = xd->current_qindex;
+  mbmi->current_qindex = xd->current_base_qindex;
 
   xd->above_txfm_context =
       cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
@@ -1534,7 +1541,7 @@
       xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
 
   if (inter_block)
-    read_inter_block_mode_info(pbi, xd, mbmi, r);
+    read_inter_block_mode_info(pbi, dcb, mbmi, r);
   else
     read_intra_block_mode_info(cm, xd, mbmi, r);
 }
@@ -1557,19 +1564,20 @@
   }
 }
 
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r,
-                        int x_mis, int y_mis) {
+void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb,
+                        aom_reader *r, int x_mis, int y_mis) {
   AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mi = xd->mi[0];
   mi->use_intrabc = 0;
 
   if (frame_is_intra_only(cm)) {
-    read_intra_frame_mode_info(cm, xd, r);
-    if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs)
+    read_intra_frame_mode_info(cm, dcb, r);
+    if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
       intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis);
   } else {
-    read_inter_frame_mode_info(pbi, xd, r);
-    if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs)
+    read_inter_frame_mode_info(pbi, dcb, r);
+    if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
       av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis);
   }
 }
diff --git a/av1/decoder/decodemv.h b/av1/decoder/decodemv.h
index 289e66a..3d8629c 100644
--- a/av1/decoder/decodemv.h
+++ b/av1/decoder/decodemv.h
@@ -20,8 +20,8 @@
 extern "C" {
 #endif
 
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r,
-                        int x_mis, int y_mis);
+void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb,
+                        aom_reader *r, int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index fc5f2cd..b877961 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -138,7 +138,9 @@
   av1_loop_filter_init(cm);
 
   av1_qm_init(&cm->quant_params, av1_num_planes(cm));
+#if !CONFIG_REALTIME_ONLY
   av1_loop_restoration_precal();
+#endif
 #if CONFIG_ACCOUNTING
   pbi->acct_enabled = 1;
   aom_accounting_init(&pbi->accounting);
@@ -216,7 +218,9 @@
 
   if (pbi->num_workers > 0) {
     av1_loop_filter_dealloc(&pbi->lf_row_sync);
+#if !CONFIG_REALTIME_ONLY
     av1_loop_restoration_dealloc(&pbi->lr_row_sync, pbi->num_workers);
+#endif
     av1_dealloc_dec_jobs(&pbi->tile_mt_info);
   }
 
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 4580de2..b20e9c1 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -33,25 +33,88 @@
 extern "C" {
 #endif
 
+/*!
+ * \brief Contains coding block data required by the decoder.
+ *
+ * This includes:
+ * - Coding block info that is common between encoder and decoder.
+ * - Other coding block info only needed by the decoder.
+ * Contrast this with a similar struct MACROBLOCK on encoder side.
+ * This data is also common between ThreadData and AV1Decoder structs.
+ */
+typedef struct DecoderCodingBlock {
+  /*!
+   * Coding block info that is common between encoder and decoder.
+   */
+  DECLARE_ALIGNED(32, MACROBLOCKD, xd);
+  /*!
+   * True if the at least one of the coding blocks decoded was corrupted.
+   */
+  int corrupted;
+  /*!
+   * Pointer to 'mc_buf' inside 'pbi->td' (single-threaded decoding) or
+   * 'pbi->thread_data[i].td' (multi-threaded decoding).
+   */
+  uint8_t *mc_buf[2];
+  /*!
+   * Pointer to 'dqcoeff' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base'
+   * with appropriate offset for the current superblock, for each plane.
+   */
+  tran_low_t *dqcoeff_block[MAX_MB_PLANE];
+  /*!
+   * cb_offset[p] is the offset into the dqcoeff_block[p] for the current coding
+   * block, for each plane 'p'.
+   */
+  uint16_t cb_offset[MAX_MB_PLANE];
+  /*!
+   * Pointer to 'eob_data' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base'
+   * with appropriate offset for the current superblock, for each plane.
+   */
+  eob_info *eob_data[MAX_MB_PLANE];
+  /*!
+   * txb_offset[p] is the offset into the eob_data[p] for the current coding
+   * block, for each plane 'p'.
+   */
+  uint16_t txb_offset[MAX_MB_PLANE];
+  /*!
+   * ref_mv_count[i] specifies the number of number of motion vector candidates
+   * in xd->ref_mv_stack[i].
+   */
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+} DecoderCodingBlock;
+
+/*!\cond */
+
 typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm,
-                                          MACROBLOCKD *const xd,
+                                          DecoderCodingBlock *dcb,
                                           aom_reader *const r, const int plane,
                                           const int row, const int col,
                                           const TX_SIZE tx_size);
 
 typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
-                                                 MACROBLOCKD *const xd,
+                                                 DecoderCodingBlock *dcb,
                                                  BLOCK_SIZE bsize);
 
 typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
                                                    MACROBLOCKD *const xd);
 
 typedef struct ThreadData {
-  DECLARE_ALIGNED(32, MACROBLOCKD, xd);
+  DecoderCodingBlock dcb;
+
+  // Coding block buffer for the current superblock.
+  // Used only for single-threaded decoding and multi-threaded decoding with
+  // row_mt == 1 cases.
+  // See also: similar buffer in 'AV1Decoder'.
   CB_BUFFER cb_buffer_base;
+
   aom_reader *bit_reader;
+
+  // Motion compensation buffer used to get a prediction buffer with extended
+  // borders. One buffer for each of the two possible references.
   uint8_t *mc_buf[2];
+  // Allocated size of 'mc_buf'.
   int32_t mc_buf_size;
+  // If true, the pointers in 'mc_buf' were converted from highbd pointers.
   int mc_buf_use_highbd;  // Boolean: whether the byte pointers stored in
                           // mc_buf were converted from highbd pointers.
 
@@ -156,7 +219,7 @@
 } AV1DecTileMT;
 
 typedef struct AV1Decoder {
-  DECLARE_ALIGNED(32, MACROBLOCKD, mb);
+  DecoderCodingBlock dcb;
 
   DECLARE_ALIGNED(32, AV1_COMMON, common);
 
@@ -228,11 +291,24 @@
   int tile_count_minus_1;
   uint32_t coded_tile_data_size;
   unsigned int ext_tile_debug;  // for ext-tile software debug & testing
+
+  // Decoder has 3 modes of operation:
+  // (1) Single-threaded decoding.
+  // (2) Multi-threaded decoding with each tile decoded in parallel.
+  // (3) In addition to (2), each thread decodes 1 superblock row in parallel.
+  // row_mt = 1 triggers mode (3) above, while row_mt = 0, will trigger mode (1)
+  // or (2) depending on 'max_threads'.
   unsigned int row_mt;
+
   EXTERNAL_REFERENCES ext_refs;
   YV12_BUFFER_CONFIG tile_list_outbuf;
 
+  // Coding block buffer for the current frame.
+  // Allocated and used only for multi-threaded decoding with 'row_mt == 0'.
+  // See also: similar buffer in 'ThreadData' struct.
   CB_BUFFER *cb_buffer_base;
+  // Allocated size of 'cb_buffer_base'. Currently same as the number of
+  // superblocks in the coded frame.
   int cb_buffer_alloc_size;
 
   int allocated_row_mt_sync_rows;
@@ -250,6 +326,10 @@
   int skip_film_grain;
   int is_annexb;
   int valid_for_referencing[REF_FRAMES];
+  int is_fwd_kf_present;
+  int is_arf_frame_present;
+  int num_tile_groups;
+  aom_s_frame_info sframe_info;
 } AV1Decoder;
 
 // Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error
@@ -324,6 +404,8 @@
                                    int mi_row, int mi_col, aom_reader *r,
                                    PARTITION_TYPE partition, BLOCK_SIZE bsize);
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c
index 541f4c9..0ec1487 100644
--- a/av1/decoder/decodetxb.c
+++ b/av1/decoder/decodetxb.c
@@ -107,11 +107,12 @@
   }
 }
 
-uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, DecoderCodingBlock *dcb,
                             aom_reader *const r, const int blk_row,
                             const int blk_col, const int plane,
                             const TXB_CTX *const txb_ctx,
                             const TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &dcb->xd;
   FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
   const int32_t max_value = (1 << (7 + xd->bd)) - 1;
   const int32_t min_value = -(1 << (7 + xd->bd));
@@ -120,7 +121,7 @@
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
-  tran_low_t *const tcoeffs = pd->dqcoeff_block + xd->cb_offset[plane];
+  tran_low_t *const tcoeffs = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
   const int shift = av1_get_tx_scale(tx_size);
   const int bwl = get_txb_bwl(tx_size);
   const int width = get_txb_wide(tx_size);
@@ -131,7 +132,7 @@
   uint8_t *const levels = set_levels(levels_buf, width);
   const int all_zero = aom_read_symbol(
       r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR);
-  eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+  eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
   uint16_t *const eob = &(eob_data->eob);
   uint16_t *const max_scan_line = &(eob_data->max_scan_line);
   *max_scan_line = 0;
@@ -140,7 +141,7 @@
 #if CONFIG_INSPECTION
   if (plane == 0) {
     const int txk_type_idx =
-        av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+        av1_get_txk_type_index(mbmi->bsize, blk_row, blk_col);
     mbmi->tx_skip[txk_type_idx] = all_zero;
   }
 #endif
@@ -321,17 +322,18 @@
 }
 
 void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
-                                MACROBLOCKD *const xd, aom_reader *const r,
+                                DecoderCodingBlock *dcb, aom_reader *const r,
                                 const int plane, const int row, const int col,
                                 const TX_SIZE tx_size) {
 #if TXCOEFF_TIMER
   struct aom_usec_timer timer;
   aom_usec_timer_start(&timer);
 #endif
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[plane];
 
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(bsize < BLOCK_SIZES_ALL);
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
@@ -340,7 +342,7 @@
   get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col,
               pd->left_entropy_context + row, &txb_ctx);
   const uint8_t cul_level =
-      av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size);
+      av1_read_coeffs_txb(cm, dcb, r, row, col, plane, &txb_ctx, tx_size);
   av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col,
                            row);
 
diff --git a/av1/decoder/decodetxb.h b/av1/decoder/decodetxb.h
index 39bf0bf..fd34d40 100644
--- a/av1/decoder/decodetxb.h
+++ b/av1/decoder/decodetxb.h
@@ -12,21 +12,23 @@
 #ifndef AOM_AV1_DECODER_DECODETXB_H_
 #define AOM_AV1_DECODER_DECODETXB_H_
 
-#include "config/aom_config.h"
+#include "av1/common/enums.h"
 
-#include "av1/common/av1_common_int.h"
-#include "av1/common/blockd.h"
-#include "av1/common/txb_common.h"
-#include "aom_dsp/bitreader.h"
+struct aom_reader;
+struct AV1Common;
+struct DecoderCodingBlock;
+struct txb_ctx;
 
-uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                            aom_reader *const r, const int blk_row,
+uint8_t av1_read_coeffs_txb(const struct AV1Common *const cm,
+                            struct DecoderCodingBlock *dcb,
+                            struct aom_reader *const r, const int blk_row,
                             const int blk_col, const int plane,
-                            const TXB_CTX *const txb_ctx,
+                            const struct txb_ctx *const txb_ctx,
                             const TX_SIZE tx_size);
 
-void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
-                                MACROBLOCKD *const xd, aom_reader *const r,
-                                const int plane, const int row, const int col,
+void av1_read_coeffs_txb_facade(const struct AV1Common *const cm,
+                                struct DecoderCodingBlock *dcb,
+                                struct aom_reader *const r, const int plane,
+                                const int row, const int col,
                                 const TX_SIZE tx_size);
 #endif  // AOM_AV1_DECODER_DECODETXB_H_
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 9d54bd1..3c6a006 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -72,7 +72,7 @@
                          : xd->tile_ctx->palette_y_color_index_cdf;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   params.n_colors = mbmi->palette_mode_info.palette_size[plane];
-  av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
                            &params.plane_height, &params.rows, &params.cols);
   decode_color_map_tokens(&params, r);
 }
diff --git a/av1/decoder/inspection.c b/av1/decoder/inspection.c
index d121a70..b706c45 100644
--- a/av1/decoder/inspection.c
+++ b/av1/decoder/inspection.c
@@ -99,9 +99,9 @@
       mi->compound_type = mbmi->interinter_comp.type;
 
       // Block Size
-      mi->sb_type = mbmi->sb_type;
+      mi->bsize = mbmi->bsize;
       // Skip Flag
-      mi->skip = mbmi->skip;
+      mi->skip = mbmi->skip_txfm;
       mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0);
       mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1);
       mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1];
@@ -109,7 +109,7 @@
       // Transform
       // TODO(anyone): extract tx type info from mbmi->txk_type[].
 
-      const BLOCK_SIZE bsize = mbmi->sb_type;
+      const BLOCK_SIZE bsize = mbmi->bsize;
       const int c = i % mi_size_wide[bsize];
       const int r = j % mi_size_high[bsize];
       if (is_inter_block(mbmi) || is_intrabc_block(mbmi))
diff --git a/av1/decoder/inspection.h b/av1/decoder/inspection.h
index b963f6a..70b1c80 100644
--- a/av1/decoder/inspection.h
+++ b/av1/decoder/inspection.h
@@ -38,7 +38,7 @@
   int16_t ref_frame[2];
   int16_t mode;
   int16_t uv_mode;
-  int16_t sb_type;
+  int16_t bsize;
   int16_t skip;
   int16_t segment_id;
   int16_t dual_filter_type;
diff --git a/av1/decoder/obu.c b/av1/decoder/obu.c
index 791e596..de24adc 100644
--- a/av1/decoder/obu.c
+++ b/av1/decoder/obu.c
@@ -52,13 +52,13 @@
 }
 
 static int is_obu_in_current_operating_point(AV1Decoder *pbi,
-                                             ObuHeader obu_header) {
-  if (!pbi->current_operating_point) {
+                                             const ObuHeader *obu_header) {
+  if (!pbi->current_operating_point || !obu_header->has_extension) {
     return 1;
   }
 
-  if ((pbi->current_operating_point >> obu_header.temporal_layer_id) & 0x1 &&
-      (pbi->current_operating_point >> (obu_header.spatial_layer_id + 8)) &
+  if ((pbi->current_operating_point >> obu_header->temporal_layer_id) & 0x1 &&
+      (pbi->current_operating_point >> (obu_header->spatial_layer_id + 8)) &
           0x1) {
     return 1;
   }
@@ -576,102 +576,95 @@
                                 const uint8_t *data, size_t sz,
                                 aom_metadata_insert_flags_t insert_flag) {
   AV1_COMMON *const cm = &pbi->common;
+  if (!pbi->metadata) {
+    pbi->metadata = aom_img_metadata_array_alloc(0);
+    if (!pbi->metadata) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate metadata array");
+    }
+  }
   aom_metadata_t *metadata =
       aom_img_metadata_alloc(metadata_type, data, sz, insert_flag);
   if (!metadata) {
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Error allocating metadata");
   }
-  if (!pbi->metadata) {
-    pbi->metadata = aom_img_metadata_array_alloc(1);
-    if (!pbi->metadata) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                         "Failed to allocate metadata array");
-    }
-  } else {
-    aom_metadata_t **metadata_array =
-        (aom_metadata_t **)realloc(pbi->metadata->metadata_array,
-                                   (pbi->metadata->sz + 1) * sizeof(metadata));
-    if (!metadata_array) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                         "Error allocating metadata");
-    }
-    pbi->metadata->metadata_array = metadata_array;
-    pbi->metadata->sz++;
+  aom_metadata_t **metadata_array =
+      (aom_metadata_t **)realloc(pbi->metadata->metadata_array,
+                                 (pbi->metadata->sz + 1) * sizeof(metadata));
+  if (!metadata_array) {
+    aom_img_metadata_free(metadata);
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error growing metadata array");
   }
-  pbi->metadata->metadata_array[pbi->metadata->sz - 1] = metadata;
+  pbi->metadata->metadata_array = metadata_array;
+  pbi->metadata->metadata_array[pbi->metadata->sz] = metadata;
+  pbi->metadata->sz++;
 }
 
-// On success, returns the number of bytes read from 'data'. On failure, calls
-// aom_internal_error() and does not return.
-static size_t read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data,
-                                     size_t sz) {
-  const int kMinItuT35PayloadSize = 2;
+// On failure, calls aom_internal_error() and does not return.
+static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data,
+                                   size_t sz) {
   AV1_COMMON *const cm = &pbi->common;
   if (sz == 0) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "itu_t_t35_country_code is missing");
   }
-  int bytes_read = get_last_nonzero_byte_index(data, sz);
-  if (bytes_read < 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "No trailing bits found on metadata");
+  int country_code_size = 1;
+  if (*data == 0xFF) {
+    if (sz == 1) {
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "itu_t_t35_country_code_extension_byte is missing");
+    }
+    ++country_code_size;
   }
-  if (*data == 0xFF && bytes_read < kMinItuT35PayloadSize) {
+  int end_index = get_last_nonzero_byte_index(data, sz);
+  if (end_index < country_code_size) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "itu_t_t35_country_code_extension_byte is missing");
+                       "No trailing bits found in ITU-T T.35 metadata OBU");
   }
-  alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, (size_t)bytes_read,
+  // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says:
+  //   itu_t_t35_payload_bytes shall be bytes containing data registered as
+  //   specified in Recommendation ITU-T T.35.
+  // Therefore the first trailing byte should be 0x80.
+  if (data[end_index] != 0x80) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "The last nonzero byte of the ITU-T T.35 metadata OBU "
+                       "is 0x%02x, should be 0x80.",
+                       data[end_index]);
+  }
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, end_index,
                       AOM_MIF_ANY_FRAME);
-  return (size_t)bytes_read;
 }
 
 // On success, returns the number of bytes read from 'data'. On failure, calls
 // aom_internal_error() and does not return.
 static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data,
                                     size_t sz) {
-  const int kHdrCllPayloadSize = 4;
+  const size_t kHdrCllPayloadSize = 4;
   AV1_COMMON *const cm = &pbi->common;
-  if (sz == 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "HDR CLL metadata payload is missing");
-  }
-  int bytes_read = get_last_nonzero_byte_index(data, sz);
-  if (bytes_read < 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "No trailing bits found on metadata");
-  }
-  if (bytes_read != kHdrCllPayloadSize) {
+  if (sz < kHdrCllPayloadSize) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Incorrect HDR CLL metadata payload size");
   }
-  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, (size_t)bytes_read,
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize,
                       AOM_MIF_ANY_FRAME);
-  return (size_t)bytes_read;
+  return kHdrCllPayloadSize;
 }
 
 // On success, returns the number of bytes read from 'data'. On failure, calls
 // aom_internal_error() and does not return.
 static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data,
                                      size_t sz) {
-  const int kMdcvPayloadSize = 24;
+  const size_t kMdcvPayloadSize = 24;
   AV1_COMMON *const cm = &pbi->common;
-  if (sz == 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "HDR MDCV metadata payload is missing");
-  }
-  int bytes_read = get_last_nonzero_byte_index(data, sz);
-  if (bytes_read < 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "No trailing bits found on HDR MDCV metadata");
-  }
-  if (bytes_read != kMdcvPayloadSize) {
+  if (sz < kMdcvPayloadSize) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Incorrect HDR MDCV metadata payload size");
   }
-  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, (size_t)bytes_read,
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize,
                       AOM_MIF_ANY_FRAME);
-  return (size_t)bytes_read;
+  return kMdcvPayloadSize;
 }
 
 static void scalability_structure(struct aom_read_bit_buffer *rb) {
@@ -679,7 +672,9 @@
   const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
   const int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
   const int temporal_group_description_present_flag = aom_rb_read_bit(rb);
-  aom_rb_read_literal(rb, 3);  // reserved
+  // scalability_structure_reserved_3bits must be set to zero and be ignored by
+  // decoders.
+  aom_rb_read_literal(rb, 3);
 
   if (spatial_layer_dimensions_present_flag) {
     for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) {
@@ -781,28 +776,21 @@
     // If metadata_type is reserved for future use or a user private value,
     // ignore the entire OBU and just check trailing bits.
     if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) {
-      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
   }
   if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) {
-    size_t bytes_read =
-        type_length +
-        read_metadata_itut_t35(pbi, data + type_length, sz - type_length);
-    // itu_t_t35_payload_bytes is byte aligned and the first
-    // trailing byte should be 0x80.
-    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
-      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
-      return 0;
-    }
+    // read_metadata_itut_t35() checks trailing bits.
+    read_metadata_itut_t35(pbi, data + type_length, sz - type_length);
     return sz;
   } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) {
     size_t bytes_read =
         type_length +
         read_metadata_hdr_cll(pbi, data + type_length, sz - type_length);
     if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
-      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
@@ -811,7 +799,7 @@
         type_length +
         read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length);
     if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
-      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
@@ -860,11 +848,22 @@
   AV1_COMMON *const cm = &pbi->common;
   int frame_decoding_finished = 0;
   int is_first_tg_obu_received = 1;
+  // Whenever pbi->seen_frame_header is set to 1, frame_header is set to the
+  // beginning of the frame_header_obu and frame_header_size is set to its
+  // size. This allows us to check if a redundant frame_header_obu is a copy
+  // of the previous frame_header_obu.
+  //
+  // Initialize frame_header to a dummy nonnull pointer, otherwise the Clang
+  // Static Analyzer in clang 7.0.1 will falsely warn that a null pointer is
+  // passed as an argument to a 'nonnull' parameter of memcmp(). The initial
+  // value will not be used.
+  const uint8_t *frame_header = data;
   uint32_t frame_header_size = 0;
   ObuHeader obu_header;
   memset(&obu_header, 0, sizeof(obu_header));
   pbi->seen_frame_header = 0;
   pbi->next_start_tile = 0;
+  pbi->num_tile_groups = 0;
 
   if (data_end < data) {
     cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
@@ -915,10 +914,9 @@
     cm->spatial_layer_id = obu_header.spatial_layer_id;
 
     if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
-        obu_header.type != OBU_SEQUENCE_HEADER &&
-        obu_header.type != OBU_PADDING) {
+        obu_header.type != OBU_SEQUENCE_HEADER) {
       // don't decode obu if it's not in current operating mode
-      if (!is_obu_in_current_operating_point(pbi, obu_header)) {
+      if (!is_obu_in_current_operating_point(pbi, &obu_header)) {
         data += payload_size;
         continue;
       }
@@ -929,8 +927,12 @@
     switch (obu_header.type) {
       case OBU_TEMPORAL_DELIMITER:
         decoded_payload_size = read_temporal_delimiter_obu();
-        pbi->seen_frame_header = 0;
-        pbi->next_start_tile = 0;
+        if (pbi->seen_frame_header) {
+          // A new temporal unit has started, but the frame in the previous
+          // temporal unit is incomplete.
+          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
         break;
       case OBU_SEQUENCE_HEADER:
         decoded_payload_size = read_sequence_header_obu(pbi, &rb);
@@ -961,14 +963,15 @@
             (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) {
           frame_header_size = read_frame_header_obu(
               pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
+          frame_header = data;
           pbi->seen_frame_header = 1;
           if (!pbi->ext_tile_debug && cm->tiles.large_scale)
             pbi->camera_frame_header_ready = 1;
         } else {
-          // TODO(wtc): Verify that the frame_header_obu is identical to the
-          // original frame_header_obu. For now just skip frame_header_size
-          // bytes in the bit buffer.
-          if (frame_header_size > payload_size) {
+          // Verify that the frame_header_obu is identical to the original
+          // frame_header_obu.
+          if (frame_header_size > payload_size ||
+              memcmp(data, frame_header, frame_header_size) != 0) {
             cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
             return -1;
           }
@@ -1021,7 +1024,11 @@
             obu_header.type == OBU_FRAME);
         if (cm->error.error_code != AOM_CODEC_OK) return -1;
         is_first_tg_obu_received = 0;
-        if (frame_decoding_finished) pbi->seen_frame_header = 0;
+        if (frame_decoding_finished) {
+          pbi->seen_frame_header = 0;
+          pbi->next_start_tile = 0;
+        }
+        pbi->num_tile_groups++;
         break;
       case OBU_METADATA:
         decoded_payload_size = read_metadata(pbi, data, payload_size);
@@ -1048,7 +1055,7 @@
         if (cm->error.error_code != AOM_CODEC_OK) return -1;
         break;
       case OBU_PADDING:
-        decoded_payload_size = read_padding(&pbi->common, data, payload_size);
+        decoded_payload_size = read_padding(cm, data, payload_size);
         if (cm->error.error_code != AOM_CODEC_OK) return -1;
         break;
       default:
diff --git a/av1/encoder/aq_complexity.c b/av1/encoder/aq_complexity.c
index 3658006..3ea5f63 100644
--- a/av1/encoder/aq_complexity.c
+++ b/av1/encoder/aq_complexity.c
@@ -47,10 +47,11 @@
 
 static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
   const AV1_COMMON *const cm = &cpi->common;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
 
   return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-         cpi->refresh_alt_ref_frame ||
-         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+         refresh_frame_flags->alt_ref_frame ||
+         (refresh_frame_flags->golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
 // Segmentation only makes sense if the target bits per SB is above a threshold.
@@ -106,7 +107,8 @@
 
       qindex_delta = av1_compute_qdelta_by_rate(
           &cpi->rc, cm->current_frame.frame_type, base_qindex,
-          aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth);
+          aq_c_q_adj_factor[aq_strength][segment], cpi->is_screen_content_type,
+          cm->seq_params.bit_depth);
 
       // For AQ complexity mode, we dont allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index b888494..30aeaa1 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -37,7 +37,6 @@
   }
   assert(MAXQ <= 255);
   memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
-  cr->avg_frame_low_motion = 0.0;
   return cr;
 }
 
@@ -80,19 +79,15 @@
 static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
   const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const RATE_CONTROL *const rc = &cpi->rc;
-  int deltaq =
-      av1_compute_qdelta_by_rate(rc, cpi->common.current_frame.frame_type, q,
-                                 rate_factor, cpi->common.seq_params.bit_depth);
+  int deltaq = av1_compute_qdelta_by_rate(
+      rc, cpi->common.current_frame.frame_type, q, rate_factor,
+      cpi->is_screen_content_type, cpi->common.seq_params.bit_depth);
   if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
     deltaq = -cr->max_qdelta_perc * q / 100;
   }
   return deltaq;
 }
 
-// For the just encoded frame, estimate the bits, incorporating the delta-q
-// from non-base segment. For now ignore effect of multiple segments
-// (with different delta-q). Note this function is called in the postencode
-// (called from rc_update_rate_correction_factors()).
 int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
                                           double correction_factor) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -110,21 +105,19 @@
   const int estimated_bits =
       (int)((1.0 - weight_segment1 - weight_segment2) *
                 av1_estimate_bits_at_q(frame_type, base_qindex, mbs,
-                                       correction_factor, bit_depth) +
+                                       correction_factor, bit_depth,
+                                       cpi->is_screen_content_type) +
             weight_segment1 * av1_estimate_bits_at_q(
                                   frame_type, base_qindex + cr->qindex_delta[1],
-                                  mbs, correction_factor, bit_depth) +
+                                  mbs, correction_factor, bit_depth,
+                                  cpi->is_screen_content_type) +
             weight_segment2 * av1_estimate_bits_at_q(
                                   frame_type, base_qindex + cr->qindex_delta[2],
-                                  mbs, correction_factor, bit_depth));
+                                  mbs, correction_factor, bit_depth,
+                                  cpi->is_screen_content_type));
   return estimated_bits;
 }
 
-// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
-// a corresponding delta-q (for segment 1). This function is called in the
-// rc_regulate_q() to set the base qp index.
-// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
-// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
 int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
                                       double correction_factor) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -144,17 +137,15 @@
   bits_per_mb =
       (int)((1.0 - weight_segment) *
                 av1_rc_bits_per_mb(cm->current_frame.frame_type, i,
-                                   correction_factor,
-                                   cm->seq_params.bit_depth) +
+                                   correction_factor, cm->seq_params.bit_depth,
+                                   cpi->is_screen_content_type) +
             weight_segment * av1_rc_bits_per_mb(cm->current_frame.frame_type,
                                                 i + deltaq, correction_factor,
-                                                cm->seq_params.bit_depth));
+                                                cm->seq_params.bit_depth,
+                                                cpi->is_screen_content_type));
   return bits_per_mb;
 }
 
-// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
-// check if we should reset the segment_id, and update the cyclic_refresh map
-// and segmentation map.
 void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
                                        MB_MODE_INFO *const mbmi, int mi_row,
                                        int mi_col, BLOCK_SIZE bsize,
@@ -197,49 +188,71 @@
 
   // Update entries in the cyclic refresh map with new_map_value, and
   // copy mbmi->segment_id into global segmentation map.
-  for (int y = 0; y < ymis; y++)
-    for (int x = 0; x < xmis; x++) {
+  // 8x8 is smallest coding block size for non-key frames.
+  for (int y = 0; y < ymis; y += 2)
+    for (int x = 0; x < xmis; x += 2) {
       int map_offset = block_index + y * cm->mi_params.mi_cols + x;
       cr->map[map_offset] = new_map_value;
       cpi->enc_seg.map[map_offset] = mbmi->segment_id;
     }
 }
 
-// Update the some stats after encode frame is done.
 void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
   unsigned char *const seg_map = cpi->enc_seg.map;
-  cr->cnt_zeromv = 0;
+  int cnt_zeromv = 0;
   cr->actual_num_seg1_blocks = 0;
   cr->actual_num_seg2_blocks = 0;
-  for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) {
-    for (int mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) {
-      MB_MODE_INFO **mi =
-          mi_params->mi_grid_base + mi_row * mi_params->mi_stride + mi_col;
+  // 8X8 blocks are smallest partition used on delta frames.
+  for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) {
+    MB_MODE_INFO **mi = mi_params->mi_grid_base + mi_row * mi_params->mi_stride;
+    int sh = 2;
+    for (int mi_col = 0; mi_col < mi_params->mi_cols; mi_col += sh) {
+      sh = mi_size_wide[mi[0]->bsize];
       MV mv = mi[0]->mv[0].as_mv;
       if (cm->seg.enabled) {
         int map_index = mi_row * mi_params->mi_cols + mi_col;
         if (cyclic_refresh_segment_id(seg_map[map_index]) ==
             CR_SEGMENT_ID_BOOST1)
-          cr->actual_num_seg1_blocks++;
+          cr->actual_num_seg1_blocks += sh << 1;
         else if (cyclic_refresh_segment_id(seg_map[map_index]) ==
                  CR_SEGMENT_ID_BOOST2)
-          cr->actual_num_seg2_blocks++;
+          cr->actual_num_seg2_blocks += sh << 1;
       }
       // Accumulate low_content_frame.
-      if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16)
-        cr->cnt_zeromv++;
+      if (is_inter_block(mi[0]) && mi[0]->ref_frame[0] == LAST_FRAME &&
+          abs(mv.row) < 16 && abs(mv.col) < 16)
+        cnt_zeromv += sh << 1;
+      if (mi_col + sh < mi_params->mi_cols) {
+        mi += sh;
+      }
     }
   }
-  cr->cnt_zeromv =
-      100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols);
-  cr->avg_frame_low_motion =
-      (3 * cr->avg_frame_low_motion + (double)cr->cnt_zeromv) / 4;
+  cnt_zeromv = 100 * cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols);
+  if (!cpi->use_svc ||
+      (cpi->use_svc &&
+       !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+       cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+    rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + cnt_zeromv) / 4;
+    // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
+    // to all lower spatial layers.
+    if (cpi->use_svc &&
+        svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+      for (int i = 0; i < svc->number_spatial_layers - 1; ++i) {
+        const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+                                           svc->number_temporal_layers);
+        LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+        RATE_CONTROL *const lrc = &lc->rc;
+        lrc->avg_frame_low_motion = rc->avg_frame_low_motion;
+      }
+    }
+  }
 }
 
-// Set golden frame update interval, for 1 pass CBR mode.
 void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
@@ -250,7 +263,7 @@
     rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40);
   else
     rc->baseline_gf_interval = 20;
-  if (cr->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8;
+  if (rc->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8;
 }
 
 // Update the segmentation map, and related quantities: cyclic refresh map,
@@ -291,8 +304,8 @@
     int mi_col = sb_col_index * cm->seq_params.mib_size;
     // TODO(any): Ensure the population of
     // cpi->common.features.allow_screen_content_tools and use the same instead
-    // of cpi->oxcf.content == AOM_CONTENT_SCREEN
-    int qindex_thresh = cpi->oxcf.content == AOM_CONTENT_SCREEN
+    // of cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN
+    int qindex_thresh = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN
                             ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2,
                                              cm->quant_params.base_qindex)
                             : 0;
@@ -302,14 +315,15 @@
     // Loop through all MI blocks in superblock and update map.
     xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params.mib_size);
     ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params.mib_size);
-    for (y = 0; y < ymis; y++) {
-      for (x = 0; x < xmis; x++) {
+    // cr_map only needed at 8x8 blocks.
+    for (y = 0; y < ymis; y += 2) {
+      for (x = 0; x < xmis; x += 2) {
         const int bl_index2 = bl_index + y * mi_params->mi_cols + x;
         // If the block is as a candidate for clean up then mark it
         // for possible boost/refresh (segment 1). The segment id may get
         // reset to 0 later if block gets coded anything other than GLOBALMV.
         if (cr->map[bl_index2] == 0) {
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++;
+          if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map += 4;
         } else if (cr->map[bl_index2] < 0) {
           cr->map[bl_index2]++;
         }
@@ -317,7 +331,7 @@
     }
     // Enforce constant segment over superblock.
     // If segment is at least half of superblock, set to 1.
-    if (sum_map >= xmis * ymis / 2) {
+    if (sum_map >= (xmis * ymis) >> 1) {
       for (y = 0; y < ymis; y++)
         for (x = 0; x < xmis; x++) {
           seg_map[bl_index + y * mi_params->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
@@ -345,12 +359,12 @@
   int qp_thresh = AOMMIN(20, rc->best_quality << 1);
   int qp_max_thresh = 118 * MAXQ >> 7;
   cr->apply_cyclic_refresh = 1;
-  if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf) ||
+  if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
       cpi->svc.temporal_layer_id > 0 ||
       rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
       (rc->frames_since_key > 20 &&
        rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
-      (cr->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) {
+      (rc->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) {
     cr->apply_cyclic_refresh = 0;
     return;
   }
@@ -378,14 +392,14 @@
       cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.5);
     }
   }
-  if (cpi->oxcf.rc_mode == AOM_VBR) {
+  if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
     // To be adjusted for VBR mode, e.g., based on gf period and boost.
     // For now use smaller qp-delta (than CBR), no second boosted seg, and
     // turn-off (no refresh) on golden refresh (since it's already boosted).
     cr->percent_refresh = 10;
     cr->rate_ratio_qdelta = 1.5;
     cr->rate_boost_fac = 10;
-    if (cpi->refresh_golden_frame == 1) {
+    if (cpi->refresh_frame.golden_frame) {
       cr->percent_refresh = 0;
       cr->rate_ratio_qdelta = 1.0;
     }
@@ -416,7 +430,6 @@
       cm->prev_frame && (cm->width != cm->prev_frame->width ||
                          cm->height != cm->prev_frame->height);
   if (resolution_change) av1_cyclic_refresh_reset_resize(cpi);
-  if (cm->current_frame.frame_number == 0) cr->low_content_avg = 0.0;
   if (!cr->apply_cyclic_refresh) {
     // Set segmentation map to 0 and disable.
     unsigned char *const seg_map = cpi->enc_seg.map;
@@ -497,5 +510,6 @@
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
   cr->sb_index = 0;
-  cpi->refresh_golden_frame = 1;
+  cpi->refresh_frame.golden_frame = true;
+  cr->apply_cyclic_refresh = 0;
 }
diff --git a/av1/encoder/aq_cyclicrefresh.h b/av1/encoder/aq_cyclicrefresh.h
index ee62f6a..23d1f16 100644
--- a/av1/encoder/aq_cyclicrefresh.h
+++ b/av1/encoder/aq_cyclicrefresh.h
@@ -27,46 +27,85 @@
 // Maximum rate target ratio for setting segment delta-qp.
 #define CR_MAX_RATE_TARGET_RATIO 4.0
 
+/*!
+ * \brief The stucture of CYCLIC_REFRESH.
+ * \ingroup cyclic_refresh
+ */
 struct CYCLIC_REFRESH {
-  // Percentage of blocks per frame that are targeted as candidates
-  // for cyclic refresh.
+  /*!
+   * Percentage of blocks per frame that are targeted as candidates
+   * for cyclic refresh.
+   */
   int percent_refresh;
-  // Maximum q-delta as percentage of base q.
+  /*!
+   * Maximum q-delta as percentage of base q.
+   */
   int max_qdelta_perc;
-  // Superblock starting index for cycling through the frame.
+  /*!
+   *Superblock starting index for cycling through the frame.
+   */
   int sb_index;
-  // Controls how long block will need to wait to be refreshed again, in
-  // excess of the cycle time, i.e., in the case of all zero motion, block
-  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+  /*!
+   * Controls how long block will need to wait to be refreshed again, in
+   * excess of the cycle time, i.e., in the case of all zero motion, block
+   * will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+   */
   int time_for_refresh;
-  // Target number of (4x4) blocks that are set for delta-q.
+  /*!
+   * Target number of (4x4) blocks that are set for delta-q.
+   */
   int target_num_seg_blocks;
-  // Actual number of (4x4) blocks that were applied delta-q.
+  /*!
+   * Actual number of (4x4) blocks that were applied delta-q,
+   * for segment 1.
+   */
   int actual_num_seg1_blocks;
+  /*!
+   * Actual number of (4x4) blocks that were applied delta-q,
+   * for segment 2.
+   */
   int actual_num_seg2_blocks;
-  // RD mult. parameters for segment 1.
+  /*!
+   * RD mult. parameters for segment 1.
+   */
   int rdmult;
-  // Cyclic refresh map.
+  /*!
+   * Cyclic refresh map.
+   */
   int8_t *map;
-  // Map of the last q a block was coded at.
+  /*!
+   * Map of the last q a block was coded at.
+   */
   uint8_t *last_coded_q_map;
-  // Thresholds applied to the projected rate/distortion of the coding block,
-  // when deciding whether block should be refreshed.
+  /*!
+   * Threshold applied to the projected rate of the coding block,
+   * when deciding whether block should be refreshed.
+   */
   int64_t thresh_rate_sb;
+  /*!
+   * Threshold applied to the projected distortion of the coding block,
+   * when deciding whether block should be refreshed.
+   */
   int64_t thresh_dist_sb;
-  // Threshold applied to the motion vector (in units of 1/8 pel) of the
-  // coding block, when deciding whether block should be refreshed.
+  /*!
+   * Threshold applied to the motion vector (in units of 1/8 pel) of the
+   * coding block, when deciding whether block should be refreshed.
+   */
   int16_t motion_thresh;
-  // Rate target ratio to set q delta.
+  /*!
+   * Rate target ratio to set q delta.
+   */
   double rate_ratio_qdelta;
-  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+  /*!
+   * Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+   */
   int rate_boost_fac;
-  double low_content_avg;
+
+  /*!\cond */
   int qindex_delta[3];
   double weight_segment;
   int apply_cyclic_refresh;
-  int cnt_zeromv;
-  double avg_frame_low_motion;
+  /*!\endcond */
 };
 
 struct AV1_COMP;
@@ -77,34 +116,129 @@
 
 void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
 
-// Estimate the bits, incorporating the delta-q from segment 1, after encoding
-// the frame.
+/*!\brief Estimate the bits, incorporating the delta-q from the segments.
+ *
+ * For the just encoded frame, estimate the bits, incorporating the delta-q
+ * from non-base segment(s). Note this function is called in the postencode
+ * (called from rc_update_rate_correction_factors()).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi               Top level encoder structure
+ * \param[in]       correction_factor rate correction factor
+ *
+ * \return Return the estimated bits at given q.
+ */
 int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
                                           double correction_factor);
 
-// Estimate the bits per mb, for a given q = i and a corresponding delta-q
-// (for segment 1), prior to encoding the frame.
+/*!\brief Estimate the bits per mb, for given q = i and delta-q.
+ *
+ * Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+ * a corresponding delta-q (for segment 1). This function is called in the
+ * rc_regulate_q() to set the base qp index. Note: the segment map is set to
+ * either 0/CR_SEGMENT_ID_BASE (no refresh) or to 1/CR_SEGMENT_ID_BOOST1
+ * (refresh) for each superblock, prior to encoding.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi               Top level encoder structure
+ * \param[in]       i                 q index
+ * \param[in]       correction_factor rate correction factor
+ *
+ * \return Return the estimated bits for q = i and delta-q (segment 1).
+ */
 int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
                                       double correction_factor);
 
-// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
-// check if we should reset the segment_id, and update the cyclic_refresh map
-// and segmentation map.
+/*!\brief Update segment_id for block based on mode selected.
+ *
+ * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id (based on mode/motion/skip selected
+ * for that block) and update the cyclic_refresh map and segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ * \param[in]   mbmi      MB_MODE_INFO pointer for mi block
+ * \param[in]   mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]   mi_col    Col coordinate of the block in a step size of MI_SIZE
+ * \param[in]   bsize     Block size
+ * \param[in]   rate      Projected block rate from pickmode
+ * \param[in]   dist      Projected block dist from pickmode
+ * \param[in]  skip       Skip flag set from picmode
+ *
+ * \return Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cm->cpi->enc_seg.map.
+ */
 void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
                                        MB_MODE_INFO *const mbmi, int mi_row,
                                        int mi_col, BLOCK_SIZE bsize,
                                        int64_t rate, int64_t dist, int skip);
 
-// Update the some stats after encode frame is done.
+/*!\brief Update stats after encoding frame.
+ *
+ * Update the number of block encoded with segment 1 and 2,
+ * and update the number of blocks encoded with small/zero motion.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ *
+ * \return Updates the \c cpi->cyclic_refresh with the new stats.
+ */
 void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
 
-// Set golden frame update interval, for 1 pass CBR mode.
+/*!\brief Set golden frame update interval nased on cyclic refresh.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ *
+ * \return Returns the interval in \c cpi->rc.baseline_gf_interval.
+ */
 void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
 
-// Set/update global/frame level refresh parameters.
+/*!\brief Set the global/frame level parameters for cyclic refresh.
+ *
+ * First call to the cyclic refresh, before encoding the frame.
+ * Sets the flag on whether cyclic refresh should be applied, sets
+ * the amount/percent of refresh, and the amount of boost applied to
+ * the two segments (set by rate_ratio_qdelta and rate_boost_fac).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \return Updates the \c cpi->cyclic_refresh with the settings.
+ */
 void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
 
-// Setup cyclic background refresh: set delta q and segmentation map.
+/*!\brief Setup the cyclic background refresh.
+ *
+ * Set the delta q for the segment(s), and set the segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \return Updates the \c cpi->cyclic_refresh with the cyclic refresh
+ * parameters and the \c cm->seg with the segmentation data.
+ */
 void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
 
 int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
diff --git a/av1/encoder/aq_variance.c b/av1/encoder/aq_variance.c
index 4176da2..92d7ad1 100644
--- a/av1/encoder/aq_variance.c
+++ b/av1/encoder/aq_variance.c
@@ -44,6 +44,7 @@
 
 void av1_vaq_frame_setup(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
   const int base_qindex = cm->quant_params.base_qindex;
   struct segmentation *seg = &cm->seg;
   int i;
@@ -65,8 +66,8 @@
     return;
   }
   if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-      cpi->refresh_alt_ref_frame ||
-      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+      refresh_frame_flags->alt_ref_frame ||
+      (refresh_frame_flags->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     cpi->vaq_refresh = 1;
 
     av1_enable_segmentation(seg);
@@ -79,7 +80,8 @@
       // it.
       int qindex_delta = av1_compute_qdelta_by_rate(
           &cpi->rc, cm->current_frame.frame_type, base_qindex,
-          rate_ratio[i] / avg_ratio, cm->seq_params.bit_depth);
+          rate_ratio[i] / avg_ratio, cpi->is_screen_content_type,
+          cm->seq_params.bit_depth);
 
       // We don't allow qindex 0 in a segment if the base value is not 0.
       // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -196,7 +198,8 @@
   const int base_qindex = cm->quant_params.base_qindex;
   int qindex_delta = av1_compute_qdelta_by_rate(
       &cpi->rc, cm->current_frame.frame_type, base_qindex,
-      deltaq_rate_ratio[rate_level], cm->seq_params.bit_depth);
+      deltaq_rate_ratio[rate_level], cpi->is_screen_content_type,
+      cm->seq_params.bit_depth);
 
   if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
     qindex_delta = -base_qindex + 1;
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
new file mode 100644
index 0000000..b9a314d
--- /dev/null
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -0,0 +1,4403 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/common/arm/mem_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#define custom_packs_s32(w0, w1) vcombine_s16(vqmovn_s32(w0), vqmovn_s32(w1));
+
+static INLINE void transpose_16bit_4x4(const int16x8_t *const in,
+                                       int16x8_t *const out) {
+#if defined(__aarch64__)
+  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
+  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
+#else
+  int16x4x2_t temp;
+  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
+  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
+  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+  int32x4x2_t a01 =
+      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+  out[0] = vreinterpretq_s16_s32(a01.val[0]);
+  out[1] = vextq_s16(vreinterpretq_s16_s32(a01.val[0]), out[1], 4);
+  out[2] = vreinterpretq_s16_s32(a01.val[1]);
+  out[3] = vextq_s16(vreinterpretq_s16_s32(a01.val[1]), out[3], 4);
+}
+
+static INLINE void transpose_16bit_4x8(const int16x8_t *const in,
+                                       int16x8_t *const out) {
+#if defined(__aarch64__)
+  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
+  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
+  const int16x8_t a2 = vzip1q_s16(in[4], in[5]);
+  const int16x8_t a3 = vzip1q_s16(in[6], in[7]);
+#else
+  int16x4x2_t temp;
+  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
+  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
+  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(vget_low_s16(in[4]), vget_low_s16(in[5]));
+  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(vget_low_s16(in[6]), vget_low_s16(in[7]));
+  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+  const int32x4x2_t b02 =
+      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+  const int32x4x2_t b13 =
+      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
+
+#if defined(__aarch64__)
+  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
+                                            vreinterpretq_s64_s32(b13.val[0])));
+  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
+                                            vreinterpretq_s64_s32(b13.val[0])));
+  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
+                                            vreinterpretq_s64_s32(b13.val[1])));
+  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
+                                            vreinterpretq_s64_s32(b13.val[1])));
+#else
+  out[0] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
+  out[2] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
+  out[1] = vreinterpretq_s16_s32(
+      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
+  out[3] = vreinterpretq_s16_s32(
+      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
+#endif
+}
+
+static INLINE void transpose_16bit_8x4(const int16x8_t *const in,
+                                       int16x8_t *const out) {
+  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
+  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
+
+  const int32x4x2_t b01 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
+                                    vreinterpretq_s32_s16(a15.val[0]));
+  const int32x4x2_t b45 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
+                                    vreinterpretq_s32_s16(a15.val[1]));
+
+  const int32x4_t zeros = vdupq_n_s32(0);
+
+#if defined(__aarch64__)
+  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[0]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[0]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[1]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[1]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[0]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[0]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[1]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[1]),
+                                            vreinterpretq_s64_s32(zeros)));
+#else
+  out[0] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b01.val[0], b01.val[0], 2), zeros, 2));
+  out[1] = vreinterpretq_s16_s32(vextq_s32(b01.val[0], zeros, 2));
+  out[2] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b01.val[1], b01.val[1], 2), zeros, 2));
+  out[3] = vreinterpretq_s16_s32(vextq_s32(b01.val[1], zeros, 2));
+  out[4] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b45.val[0], b45.val[0], 2), zeros, 2));
+  out[5] = vreinterpretq_s16_s32(vextq_s32(b45.val[0], zeros, 2));
+  out[6] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b45.val[1], b45.val[1], 2), zeros, 2));
+  out[7] = vreinterpretq_s16_s32(vextq_s32(b45.val[1], zeros, 2));
+#endif
+}
+
+static INLINE void transpose_16bit_8x8(const int16x8_t *const in,
+                                       int16x8_t *const out) {
+  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
+  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
+  const int16x8x2_t a26 = vzipq_s16(in[4], in[5]);
+  const int16x8x2_t a37 = vzipq_s16(in[6], in[7]);
+
+  const int32x4x2_t b04 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
+                                    vreinterpretq_s32_s16(a15.val[0]));
+  const int32x4x2_t b15 = vzipq_s32(vreinterpretq_s32_s16(a26.val[0]),
+                                    vreinterpretq_s32_s16(a37.val[0]));
+  const int32x4x2_t b26 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
+                                    vreinterpretq_s32_s16(a15.val[1]));
+  const int32x4x2_t b37 = vzipq_s32(vreinterpretq_s32_s16(a26.val[1]),
+                                    vreinterpretq_s32_s16(a37.val[1]));
+
+#if defined(__aarch64__)
+  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[0]),
+                                            vreinterpretq_s64_s32(b15.val[0])));
+  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[0]),
+                                            vreinterpretq_s64_s32(b15.val[0])));
+  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[1]),
+                                            vreinterpretq_s64_s32(b15.val[1])));
+  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[1]),
+                                            vreinterpretq_s64_s32(b15.val[1])));
+  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[0]),
+                                            vreinterpretq_s64_s32(b37.val[0])));
+  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[0]),
+                                            vreinterpretq_s64_s32(b37.val[0])));
+  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[1]),
+                                            vreinterpretq_s64_s32(b37.val[1])));
+  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[1]),
+                                            vreinterpretq_s64_s32(b37.val[1])));
+#else
+  out[0] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b04.val[0], b04.val[0], 2), b15.val[0], 2));
+  out[1] = vreinterpretq_s16_s32(
+      vextq_s32(b04.val[0], vextq_s32(b15.val[0], b15.val[0], 2), 2));
+  out[2] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b04.val[1], b04.val[1], 2), b15.val[1], 2));
+  out[3] = vreinterpretq_s16_s32(
+      vextq_s32(b04.val[1], vextq_s32(b15.val[1], b15.val[1], 2), 2));
+  out[4] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b26.val[0], b26.val[0], 2), b37.val[0], 2));
+  out[5] = vreinterpretq_s16_s32(
+      vextq_s32(b26.val[0], vextq_s32(b37.val[0], b37.val[0], 2), 2));
+  out[6] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b26.val[1], b26.val[1], 2), b37.val[1], 2));
+  out[7] = vreinterpretq_s16_s32(
+      vextq_s32(b26.val[1], vextq_s32(b37.val[1], b37.val[1], 2), 2));
+#endif
+}
+
+static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
+                                                      int32x4_t *output,
+                                                      const int size) {
+  int i;
+  for (i = 0; i < size; i++) {
+    output[i] = vrshrq_n_s32(vmulq_n_s32(vrshrq_n_s32(input[i], 2), NewSqrt2),
+                             NewSqrt2Bits);
+  }
+}
+
+static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
+                                                 int32x4_t *output,
+                                                 const int size) {
+  int i;
+  for (i = 0; i < size; i++) output[i] = vrshrq_n_s32(input[i], 2);
+}
+
+#define btf_32_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                       \
+    out0 = vmulq_n_s32(in0, w0);                             \
+    out0 = vmlaq_n_s32(out0, in1, w1);                       \
+    out0 = vrshlq_s32(out0, v_cos_bit);                      \
+    out1 = vmulq_n_s32(in0, w1);                             \
+    out1 = vmlsq_n_s32(out1, in1, w0);                       \
+    out1 = vrshlq_s32(out1, v_cos_bit);                      \
+  } while (0)
+
+#define btf_32_type1_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                             \
+    btf_32_neon(w1, w0, in1, in0, out0, out1, v_cos_bit);          \
+  } while (0)
+
+#define btf_32_neon_mode0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                             \
+    out0 = vmulq_n_s32(in1, w1);                                   \
+    out0 = vmlsq_n_s32(out0, in0, w0);                             \
+    out0 = vrshlq_s32(out0, v_cos_bit);                            \
+    out1 = vmulq_n_s32(in0, w1);                                   \
+    out1 = vmlaq_n_s32(out1, in1, w0);                             \
+    out1 = vrshlq_s32(out1, v_cos_bit);                            \
+  } while (0)
+
+#define btf_32_neon_mode01(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                              \
+    out0 = vmulq_n_s32(in1, w1);                                    \
+    out0 = vmlaq_n_s32(out0, in0, w0);                              \
+    out0 = vrshlq_s32(vnegq_s32(out0), v_cos_bit);                  \
+    out1 = vmulq_n_s32(in1, w0);                                    \
+    out1 = vmlsq_n_s32(out1, in0, w1);                              \
+    out1 = vrshlq_s32(out1, v_cos_bit);                             \
+  } while (0)
+
+static INLINE void flip_buf_neon(int16x8_t *in, int16x8_t *out, int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
+static INLINE void store_16bit_to_32bit_w4(const int16x8_t a,
+                                           int32_t *const b) {
+  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
+}
+
+static INLINE void store_16bit_to_32bit(int16x8_t a, int32_t *b) {
+  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
+  vst1q_s32((b + 4), vmovl_s16(vget_high_s16(a)));
+}
+
+static INLINE void store_rect_16bit_to_32bit_w4(
+    const int16x8_t a, int32_t *const b, const int16x4_t *v_newsqrt2,
+    const int32x4_t *v_newsqrt2bits) {
+  const int32x4_t b_lo =
+      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
+  vst1q_s32(b, b_lo);
+}
+
+static INLINE void store_rect_16bit_to_32bit(const int16x8_t a,
+                                             int32_t *const b,
+                                             const int16x4_t *v_newsqrt2,
+                                             const int32x4_t *v_newsqrt2bits) {
+  const int32x4_t b_lo =
+      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
+  const int32x4_t b_hi =
+      vrshlq_s32(vmull_s16(vget_high_s16(a), *v_newsqrt2), *v_newsqrt2bits);
+  vst1q_s32(b, b_lo);
+  vst1q_s32((b + 4), b_hi);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+                                                 const int stride,
+                                                 int16x8_t *const out,
+                                                 const int out_size) {
+  for (int i = 0; i < out_size; ++i)
+    out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
+        (uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+                                                      const int stride,
+                                                      int16x8_t *const out,
+                                                      const int out_size) {
+  for (int i = 0; i < out_size; ++i)
+    out[out_size - i - 1] = vreinterpretq_s16_u64(
+        vld1q_lane_u64((uint64_t *)(in + i * stride),
+                       vreinterpretq_u64_s16(out[out_size - i - 1]), 0));
+}
+
+static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
+                                              int16x8_t *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = vld1q_s16(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+                                                   int stride, int16x8_t *out,
+                                                   int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[out_size - i - 1] = vld1q_s16(in + i * stride);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w4(const int16x8_t *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_16bit_to_32bit_w4(in[i], out + i * stride);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8(const int16x8_t *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_16bit_to_32bit(in[i], out + i * stride);
+  }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w4(
+    const int16x8_t *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
+  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
+  for (int i = 0; i < out_size; ++i) {
+    store_rect_16bit_to_32bit_w4(in[i], out + i * stride, &v_newsqrt2,
+                                 &v_newsqrt2bits);
+  }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8(
+    const int16x8_t *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
+  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
+  for (int i = 0; i < out_size; ++i) {
+    store_rect_16bit_to_32bit(in[i], out + i * stride, &v_newsqrt2,
+                              &v_newsqrt2bits);
+  }
+}
+
+static INLINE void round_shift_16bit(int16x8_t *in, int size, int bit) {
+  const int16x8_t vbit = vdupq_n_s16(bit);
+  for (int i = 0; i < size; ++i) {
+    in[i] = vrshlq_s16(in[i], vbit);
+  }
+}
+
+static INLINE void round_shift_16bit_vector(int16x8_t *in, int size,
+                                            const int16x8_t *v_bit) {
+  for (int i = 0; i < size; ++i) {
+    in[i] = vrshlq_s16(in[i], *v_bit);
+  }
+}
+
+void av1_fadst4x4_neon(const int16x8_t *input, int16x8_t *output,
+                       int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+
+  int32x4_t u[6], v[6];
+
+  u[0] = vmovl_s16(vget_low_s16(input[0]));
+  u[1] = vmovl_s16(vget_low_s16(input[1]));
+  u[2] = vmovl_s16(vget_low_s16(input[2]));
+  u[3] = vmovl_s16(vget_low_s16(input[3]));
+  u[4] = vaddq_s32(u[0], u[1]);
+  v[5] = vmulq_n_s32(u[2], sinpi[3]);
+  v[0] = vmulq_n_s32(u[1], sinpi[2]);
+  v[0] = vmlaq_n_s32(v[0], u[0], sinpi[1]);
+  v[1] = vmlaq_n_s32(v[5], u[3], sinpi[4]);
+  v[2] = vmulq_n_s32(u[4], sinpi[3]);
+  v[3] = vmulq_n_s32(u[0], sinpi[4]);
+  v[3] = vmlsq_n_s32(v[3], u[1], sinpi[1]);
+  v[4] = vmlsq_n_s32(v[5], u[3], sinpi[2]);
+
+  u[0] = vaddq_s32(v[0], v[1]);
+  u[1] = vmlsq_n_s32(v[2], u[3], sinpi[3]);
+  u[2] = vsubq_s32(v[3], v[4]);
+  u[3] = vsubq_s32(u[2], u[0]);
+  u[5] = vmlaq_n_s32(u[3], v[5], 3);
+
+  int32x4_t vshift = vdupq_n_s32(-cos_bit);
+  u[0] = vrshlq_s32(u[0], vshift);
+  u[1] = vrshlq_s32(u[1], vshift);
+  u[2] = vrshlq_s32(u[2], vshift);
+  u[3] = vrshlq_s32(u[5], vshift);
+
+  output[0] = custom_packs_s32(u[0], u[2]);
+
+  output[1] = custom_packs_s32(u[1], u[3]);
+  output[2] = vextq_s16(output[0], output[0], 4);
+  output[3] = vextq_s16(output[1], output[1], 4);
+}
+
+#define btf_16_w4_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1, \
+                       v_cos_bit)                                    \
+  {                                                                  \
+    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                         \
+    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                               \
+    int32x4_t v0 = vmulq_n_s32(in0_l, w1_l);                         \
+    v0 = vmlaq_n_s32(v0, in1_l, w1_h);                               \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                        \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                        \
+    const int16x4_t c1 = vqmovn_s32(c0);                             \
+    const int16x4_t d1 = vqmovn_s32(d0);                             \
+    out0 = vcombine_s16(c1, c1);                                     \
+    out1 = vcombine_s16(d1, c1);                                     \
+  }
+
+#define btf_16_w4_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  {                                                                       \
+    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
+    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
+    int32x4_t u0 = vmulq_n_s32(in1_l, w0_h);                              \
+    u0 = vmlsq_n_s32(u0, in0_l, w0_l);                                    \
+    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h);                              \
+    v0 = vmlaq_n_s32(v0, in1_l, w0_l);                                    \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
+    const int16x4_t c1 = vqmovn_s32(c0);                                  \
+    const int16x4_t d1 = vqmovn_s32(d0);                                  \
+    out0 = vcombine_s16(c1, c1);                                          \
+    out1 = vcombine_s16(d1, c1);                                          \
+  }
+
+#define btf_16_w4_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  {                                                                       \
+    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
+    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
+    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                              \
+    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                                    \
+    int32x4_t v0 = vmulq_n_s32(in1_l, w0_l);                              \
+    v0 = vmlsq_n_s32(v0, in0_l, w0_h);                                    \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
+    const int16x4_t c1 = vqmovn_s32(c0);                                  \
+    const int16x4_t d1 = vqmovn_s32(d0);                                  \
+    out0 = vcombine_s16(c1, c1);                                          \
+    out1 = vcombine_s16(d1, c1);                                          \
+  }
+
+#define btf_16_w4_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  {                                                                       \
+    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
+    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
+    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                              \
+    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                                    \
+    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h);                              \
+    v0 = vmlsq_n_s32(v0, in1_l, w0_l);                                    \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
+    const int16x4_t c1 = vqmovn_s32(c0);                                  \
+    const int16x4_t d1 = vqmovn_s32(d0);                                  \
+    out0 = vcombine_s16(c1, c1);                                          \
+    out1 = vcombine_s16(d1, c1);                                          \
+  }
+
+static void fadst4x8_neon(const int16x8_t *input, int16x8_t *output,
+                          int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1-2
+  int16x8_t x2[8];
+  btf_16_w4_neon_mode3(cospi[32], cospi[32], vqnegq_s16(input[3]), input[4],
+                       x2[2], x2[3], v_cos_bit);
+  btf_16_w4_neon_mode3(cospi[32], cospi[32], input[2], vqnegq_s16(input[5]),
+                       x2[6], x2[7], v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[8];
+  x3[0] = vqaddq_s16(input[0], x2[2]);
+  x3[2] = vqsubq_s16(input[0], x2[2]);
+  x3[1] = vqsubq_s16(x2[3], input[7]);
+  x3[3] = vqsubq_s16(vqnegq_s16(input[7]), x2[3]);
+  x3[4] = vqaddq_s16(vqnegq_s16(input[1]), x2[6]);
+  x3[6] = vqsubq_s16(vqnegq_s16(input[1]), x2[6]);
+  x3[5] = vqaddq_s16(input[6], x2[7]);
+  x3[7] = vqsubq_s16(input[6], x2[7]);
+
+  // stage 4
+  int16x8_t x4[8];
+
+  btf_16_w4_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x4[4], x4[5],
+                       v_cos_bit);
+  btf_16_w4_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x4[6], x4[7],
+                       v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[8];
+  x5[0] = vqaddq_s16(x3[0], x4[4]);
+  x5[4] = vqsubq_s16(x3[0], x4[4]);
+  x5[1] = vqaddq_s16(x3[1], x4[5]);
+  x5[5] = vqsubq_s16(x3[1], x4[5]);
+  x5[2] = vqaddq_s16(x3[2], x4[6]);
+  x5[6] = vqsubq_s16(x3[2], x4[6]);
+  x5[3] = vqaddq_s16(x3[3], x4[7]);
+  x5[7] = vqsubq_s16(x3[3], x4[7]);
+
+  // stage 6-7
+  btf_16_w4_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
+                       v_cos_bit);
+  btf_16_w4_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
+                       v_cos_bit);
+  btf_16_w4_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
+                       v_cos_bit);
+  btf_16_w4_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
+                       v_cos_bit);
+}
+
+static void fadst8x4_neon(const int16x8_t *input, int16x8_t *output,
+                          int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+
+  const int16x8_t in7 = vaddq_s16(input[0], input[1]);
+  int32x4_t u_lo[8], u_hi[8], v_hi[8];
+
+  int32x4_t in0_l = vmovl_s16(vget_low_s16(input[0]));
+  int32x4_t in0_h = vmovl_s16(vget_high_s16(input[0]));
+  int32x4_t in1_l = vmovl_s16(vget_low_s16(input[1]));
+  int32x4_t in1_h = vmovl_s16(vget_high_s16(input[1]));
+  int32x4_t in2_l = vmovl_s16(vget_low_s16(input[2]));
+  int32x4_t in2_h = vmovl_s16(vget_high_s16(input[2]));
+  int32x4_t in3_l = vmovl_s16(vget_low_s16(input[3]));
+  int32x4_t in3_h = vmovl_s16(vget_high_s16(input[3]));
+  int32x4_t in7_l = vmovl_s16(vget_low_s16(in7));
+  int32x4_t in7_h = vmovl_s16(vget_high_s16(in7));
+
+  u_lo[0] = vmulq_n_s32(in1_l, sinpi[2]);
+  u_lo[0] = vmlaq_n_s32(u_lo[0], in0_l, sinpi[1]);
+
+  u_hi[0] = vmulq_n_s32(in1_h, sinpi[2]);
+  u_hi[0] = vmlaq_n_s32(u_hi[0], in0_h, sinpi[1]);
+
+  u_lo[0] = vmlaq_n_s32(u_lo[0], in3_l, sinpi[4]);
+  u_lo[0] = vmlaq_n_s32(u_lo[0], in2_l, sinpi[3]);
+
+  u_hi[0] = vmlaq_n_s32(u_hi[0], in3_h, sinpi[4]);
+  u_hi[0] = vmlaq_n_s32(u_hi[0], in2_h, sinpi[3]);
+
+  u_lo[1] = vmulq_n_s32(in7_l, sinpi[3]);
+
+  v_hi[2] = vmulq_n_s32(in7_h, sinpi[3]);
+  u_lo[2] = vmulq_n_s32(in0_l, sinpi[4]);
+  u_lo[2] = vmlsq_n_s32(u_lo[2], in1_l, sinpi[1]);
+
+  u_hi[2] = vmulq_n_s32(in0_h, sinpi[4]);
+  u_hi[2] = vmlsq_n_s32(u_hi[2], in1_h, sinpi[1]);
+
+  u_lo[2] = vmlaq_n_s32(u_lo[2], in3_l, sinpi[2]);
+  u_lo[2] = vmlsq_n_s32(u_lo[2], in2_l, sinpi[3]);
+
+  u_hi[2] = vmlaq_n_s32(u_hi[2], in3_h, sinpi[2]);
+  u_hi[2] = vmlsq_n_s32(u_hi[2], in2_h, sinpi[3]);
+
+  u_lo[1] = vmlsq_n_s32(u_lo[1], in3_l, sinpi[3]);
+
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  u_hi[1] = vmlsq_n_s32(v_hi[2], in3_h, sinpi[3]);
+
+  u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
+  u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);
+
+  u_lo[6] = vmlaq_n_s32(u_lo[3], in2_l, sinpi[3] * 3);
+  u_hi[6] = vmlaq_n_s32(u_hi[3], in2_h, sinpi[3] * 3);
+
+  u_lo[0] = vrshlq_s32(u_lo[0], v_cos_bit);
+  u_hi[0] = vrshlq_s32(u_hi[0], v_cos_bit);
+  u_lo[1] = vrshlq_s32(u_lo[1], v_cos_bit);
+  u_hi[1] = vrshlq_s32(u_hi[1], v_cos_bit);
+  u_lo[2] = vrshlq_s32(u_lo[2], v_cos_bit);
+  u_hi[2] = vrshlq_s32(u_hi[2], v_cos_bit);
+  u_lo[3] = vrshlq_s32(u_lo[6], v_cos_bit);
+  u_hi[3] = vrshlq_s32(u_hi[6], v_cos_bit);
+
+  output[0] = custom_packs_s32(u_lo[0], u_hi[0]);
+  output[1] = custom_packs_s32(u_lo[1], u_hi[1]);
+  output[2] = custom_packs_s32(u_lo[2], u_hi[2]);
+  output[3] = custom_packs_s32(u_lo[3], u_hi[3]);
+}
+
+void av1_fdct4x4_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int32x4_t u[4];
+
+  int32x4_t in12a = vaddl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
+  int32x4_t in12s = vsubl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
+  int32x4_t in03a = vaddl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
+  int32x4_t in03s = vsubl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
+
+  int32x4_t u0ad1 = vmulq_n_s32(in12a, cospi[32]);
+  int32x4_t u0ad2 = vmulq_n_s32(in03a, cospi[32]);
+  u[0] = vaddq_s32(u0ad1, u0ad2);
+  u[1] = vsubq_s32(u0ad2, u0ad1);
+  u[2] = vmulq_n_s32(in12s, cospi[48]);
+  u[2] = vmlaq_n_s32(u[2], in03s, cospi[16]);
+
+  u[3] = vmulq_n_s32(in03s, cospi[48]);
+  u[3] = vmlsq_n_s32(u[3], in12s, cospi[16]);
+
+  u[0] = vrshlq_s32(u[0], v_cos_bit);
+  u[1] = vrshlq_s32(u[1], v_cos_bit);
+  u[2] = vrshlq_s32(u[2], v_cos_bit);
+  u[3] = vrshlq_s32(u[3], v_cos_bit);
+
+  output[0] = custom_packs_s32(u[0], u[1]);
+  output[1] = custom_packs_s32(u[2], u[3]);
+  output[2] = vextq_s16(output[0], output[0], 4);
+  output[3] = vextq_s16(output[1], output[1], 4);
+}
+
+#define btf_16_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1) \
+  {                                                               \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));             \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));           \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));             \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));           \
+    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                    \
+    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                          \
+    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                   \
+    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                         \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w1_h);                    \
+    v0 = vmlaq_n_s32(v0, in_low0, w1_l);                          \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w1_h);                   \
+    v1 = vmlaq_n_s32(v1, in_high0, w1_l);                         \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                     \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                     \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                     \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                     \
+    out0 = custom_packs_s32(c0, c1);                              \
+    out1 = custom_packs_s32(d0, d1);                              \
+  }
+
+#define btf_16_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  {                                                                    \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
+    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
+    u0 = vmlsq_n_s32(u0, in_low0, w0_l);                               \
+    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
+    u1 = vmlsq_n_s32(u1, in_high0, w0_l);                              \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
+    v0 = vmlaq_n_s32(v0, in_low0, w0_h);                               \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
+    v1 = vmlaq_n_s32(v1, in_high0, w0_h);                              \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
+    out0 = custom_packs_s32(c0, c1);                                   \
+    out1 = custom_packs_s32(d0, d1);                                   \
+  }
+
+#define btf_16_neon_mode1(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  {                                                                    \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
+    int32x4_t u0 = vmulq_n_s32(in_low0, w0_l);                         \
+    u0 = vmlsq_n_s32(u0, in_low1, w0_h);                               \
+    int32x4_t u1 = vmulq_n_s32(in_high0, w0_l);                        \
+    u1 = vmlsq_n_s32(u1, in_high1, w0_h);                              \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
+    v0 = vmlaq_n_s32(v0, in_low0, w0_h);                               \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
+    v1 = vmlaq_n_s32(v1, in_high0, w0_h);                              \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
+    out0 = custom_packs_s32(c0, c1);                                   \
+    out1 = custom_packs_s32(d0, d1);                                   \
+  }
+
+#define btf_16_neon_mode02(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  {                                                                     \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                   \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                 \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                   \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                 \
+    int32x4_t u0 = vmulq_n_s32(in_low1, -w0_h);                         \
+    u0 = vmlsq_n_s32(u0, in_low0, w0_l);                                \
+    int32x4_t u1 = vmulq_n_s32(in_high1, -w0_h);                        \
+    u1 = vmlsq_n_s32(u1, in_high0, w0_l);                               \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                          \
+    v0 = vmlsq_n_s32(v0, in_low0, w0_h);                                \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                         \
+    v1 = vmlsq_n_s32(v1, in_high0, w0_h);                               \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                           \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                           \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                           \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                           \
+    out0 = custom_packs_s32(c0, c1);                                    \
+    out1 = custom_packs_s32(d0, d1);                                    \
+  }
+
+#define btf_16_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  {                                                                    \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
+    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
+    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                               \
+    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
+    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                              \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
+    v0 = vmlsq_n_s32(v0, in_low0, w0_h);                               \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
+    v1 = vmlsq_n_s32(v1, in_high0, w0_h);                              \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
+    out0 = custom_packs_s32(c0, c1);                                   \
+    out1 = custom_packs_s32(d0, d1);                                   \
+  }
+
+#define btf_16_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  {                                                                    \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
+    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
+    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                               \
+    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
+    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                              \
+    int32x4_t v0 = vmulq_n_s32(in_low0, w0_h);                         \
+    v0 = vmlsq_n_s32(v0, in_low1, w0_l);                               \
+    int32x4_t v1 = vmulq_n_s32(in_high0, w0_h);                        \
+    v1 = vmlsq_n_s32(v1, in_high1, w0_l);                              \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
+    out0 = custom_packs_s32(c0, c1);                                   \
+    out1 = custom_packs_s32(d0, d1);                                   \
+  }
+
+static void fdct8x4_neon(const int16x8_t *input, int16x8_t *output,
+                         int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[4];
+  x1[0] = vqaddq_s16(input[0], input[3]);
+  x1[3] = vqsubq_s16(input[0], input[3]);
+  x1[1] = vqaddq_s16(input[1], input[2]);
+  x1[2] = vqsubq_s16(input[1], input[2]);
+
+  // stage 2
+  int16x8_t x2[4];
+  btf_16_neon_mode3(cospi[32], cospi[32], x1[0], x1[1], x2[0], x2[1],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[48], cospi[16], x1[2], x1[3], x2[2], x2[3],
+                    v_cos_bit);
+
+  // stage 3
+  output[0] = x2[0];
+  output[1] = x2[2];
+  output[2] = x2[1];
+  output[3] = x2[3];
+}
+
+static void fdct4x8_neon(const int16x8_t *input, int16x8_t *output,
+                         int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[8];
+  x1[0] = vqaddq_s16(input[0], input[7]);
+  x1[7] = vqsubq_s16(input[0], input[7]);
+  x1[1] = vqaddq_s16(input[1], input[6]);
+  x1[6] = vqsubq_s16(input[1], input[6]);
+  x1[2] = vqaddq_s16(input[2], input[5]);
+  x1[5] = vqsubq_s16(input[2], input[5]);
+  x1[3] = vqaddq_s16(input[3], input[4]);
+  x1[4] = vqsubq_s16(input[3], input[4]);
+
+  // stage 2
+  int16x8_t x2[8];
+  x2[0] = vqaddq_s16(x1[0], x1[3]);
+  x2[3] = vqsubq_s16(x1[0], x1[3]);
+  x2[1] = vqaddq_s16(x1[1], x1[2]);
+  x2[2] = vqsubq_s16(x1[1], x1[2]);
+
+  btf_16_w4_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
+                       v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[8];
+  btf_16_w4_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
+                       v_cos_bit);
+
+  btf_16_w4_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
+                       v_cos_bit);
+  x3[4] = vqaddq_s16(x1[4], x2[5]);
+  x3[5] = vqsubq_s16(x1[4], x2[5]);
+  x3[6] = vqsubq_s16(x1[7], x2[6]);
+  x3[7] = vqaddq_s16(x1[7], x2[6]);
+
+  // stage 4-5
+  btf_16_w4_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
+                       v_cos_bit);
+  btf_16_w4_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
+                       v_cos_bit);
+}
+
+void fdct8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
+                  const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[8];
+  x1[0] = vqaddq_s16(input[0], input[7]);
+  x1[7] = vqsubq_s16(input[0], input[7]);
+  x1[1] = vqaddq_s16(input[1], input[6]);
+  x1[6] = vqsubq_s16(input[1], input[6]);
+  x1[2] = vqaddq_s16(input[2], input[5]);
+  x1[5] = vqsubq_s16(input[2], input[5]);
+  x1[3] = vqaddq_s16(input[3], input[4]);
+  x1[4] = vqsubq_s16(input[3], input[4]);
+
+  // stage 2
+  int16x8_t x2[8];
+  x2[0] = vqaddq_s16(x1[0], x1[3]);
+  x2[3] = vqsubq_s16(x1[0], x1[3]);
+  x2[1] = vqaddq_s16(x1[1], x1[2]);
+  x2[2] = vqsubq_s16(x1[1], x1[2]);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
+                    v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[8];
+  btf_16_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
+                    v_cos_bit);
+  x3[4] = vqaddq_s16(x1[4], x2[5]);
+  x3[5] = vqsubq_s16(x1[4], x2[5]);
+  x3[6] = vqsubq_s16(x1[7], x2[6]);
+  x3[7] = vqaddq_s16(x1[7], x2[6]);
+
+  // stage 4-5
+  btf_16_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
+                    v_cos_bit);
+}
+
+static void fdct8x16_neon(const int16x8_t *input, int16x8_t *output,
+                          int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[16];
+  x1[0] = vqaddq_s16(input[0], input[15]);
+  x1[15] = vqsubq_s16(input[0], input[15]);
+  x1[1] = vqaddq_s16(input[1], input[14]);
+  x1[14] = vqsubq_s16(input[1], input[14]);
+  x1[2] = vqaddq_s16(input[2], input[13]);
+  x1[13] = vqsubq_s16(input[2], input[13]);
+  x1[3] = vqaddq_s16(input[3], input[12]);
+  x1[12] = vqsubq_s16(input[3], input[12]);
+  x1[4] = vqaddq_s16(input[4], input[11]);
+  x1[11] = vqsubq_s16(input[4], input[11]);
+  x1[5] = vqaddq_s16(input[5], input[10]);
+  x1[10] = vqsubq_s16(input[5], input[10]);
+  x1[6] = vqaddq_s16(input[6], input[9]);
+  x1[9] = vqsubq_s16(input[6], input[9]);
+  x1[7] = vqaddq_s16(input[7], input[8]);
+  x1[8] = vqsubq_s16(input[7], input[8]);
+
+  // stage 2
+  int16x8_t x2[16];
+  x2[0] = vqaddq_s16(x1[0], x1[7]);
+  x2[7] = vqsubq_s16(x1[0], x1[7]);
+  x2[1] = vqaddq_s16(x1[1], x1[6]);
+  x2[6] = vqsubq_s16(x1[1], x1[6]);
+  x2[2] = vqaddq_s16(x1[2], x1[5]);
+  x2[5] = vqsubq_s16(x1[2], x1[5]);
+  x2[3] = vqaddq_s16(x1[3], x1[4]);
+  x2[4] = vqsubq_s16(x1[3], x1[4]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[10], x1[13], x2[10], x2[13],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[11], x1[12], x2[11], x2[12],
+                    v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[16];
+  x3[0] = vqaddq_s16(x2[0], x2[3]);
+  x3[3] = vqsubq_s16(x2[0], x2[3]);
+  x3[1] = vqaddq_s16(x2[1], x2[2]);
+  x3[2] = vqsubq_s16(x2[1], x2[2]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x2[5], x2[6], x3[5], x3[6],
+                    v_cos_bit);
+
+  x3[8] = vqaddq_s16(x1[8], x2[11]);
+  x3[11] = vqsubq_s16(x1[8], x2[11]);
+  x3[9] = vqaddq_s16(x1[9], x2[10]);
+  x3[10] = vqsubq_s16(x1[9], x2[10]);
+  x3[12] = vqsubq_s16(x1[15], x2[12]);
+  x3[15] = vqaddq_s16(x1[15], x2[12]);
+  x3[13] = vqsubq_s16(x1[14], x2[13]);
+  x3[14] = vqaddq_s16(x1[14], x2[13]);
+
+  // stage 4
+  int16x8_t x4[16];
+  btf_16_neon(cospi[32], cospi[32], cospi[32], -cospi[32], x3[0], x3[1],
+              output[0], output[8]);
+  btf_16_neon(cospi[48], cospi[16], -cospi[16], cospi[48], x3[2], x3[3],
+              output[4], output[12]);
+  x4[4] = vqaddq_s16(x2[4], x3[5]);
+  x4[5] = vqsubq_s16(x2[4], x3[5]);
+  x4[6] = vqsubq_s16(x2[7], x3[6]);
+  x4[7] = vqaddq_s16(x2[7], x3[6]);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[9], x3[14], x4[9], x4[14],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[10], x3[13], x4[10], x4[13],
+                     v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[16];
+
+  btf_16_neon_mode2(cospi[56], cospi[8], x4[4], x4[7], output[2], output[14],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[24], cospi[40], x4[5], x4[6], output[10], output[6],
+                    v_cos_bit);
+  x5[8] = vqaddq_s16(x3[8], x4[9]);
+  x5[9] = vqsubq_s16(x3[8], x4[9]);
+  x5[10] = vqsubq_s16(x3[11], x4[10]);
+  x5[11] = vqaddq_s16(x3[11], x4[10]);
+  x5[12] = vqaddq_s16(x3[12], x4[13]);
+  x5[13] = vqsubq_s16(x3[12], x4[13]);
+  x5[14] = vqsubq_s16(x3[15], x4[14]);
+  x5[15] = vqaddq_s16(x3[15], x4[14]);
+
+  // stage 6-7
+  btf_16_neon_mode2(cospi[60], cospi[4], x5[8], x5[15], output[1], output[15],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[28], cospi[36], x5[9], x5[14], output[9], output[7],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[44], cospi[20], x5[10], x5[13], output[5], output[11],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[12], cospi[52], x5[11], x5[12], output[13], output[3],
+                    v_cos_bit);
+}
+
+void av1_fdct8x32_neon(const int16x8_t *input, int16x8_t *output,
+                       int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[32];
+  x1[0] = vqaddq_s16(input[0], input[31]);
+  x1[31] = vqsubq_s16(input[0], input[31]);
+  x1[1] = vqaddq_s16(input[1], input[30]);
+  x1[30] = vqsubq_s16(input[1], input[30]);
+  x1[2] = vqaddq_s16(input[2], input[29]);
+  x1[29] = vqsubq_s16(input[2], input[29]);
+  x1[3] = vqaddq_s16(input[3], input[28]);
+  x1[28] = vqsubq_s16(input[3], input[28]);
+  x1[4] = vqaddq_s16(input[4], input[27]);
+  x1[27] = vqsubq_s16(input[4], input[27]);
+  x1[5] = vqaddq_s16(input[5], input[26]);
+  x1[26] = vqsubq_s16(input[5], input[26]);
+  x1[6] = vqaddq_s16(input[6], input[25]);
+  x1[25] = vqsubq_s16(input[6], input[25]);
+  x1[7] = vqaddq_s16(input[7], input[24]);
+  x1[24] = vqsubq_s16(input[7], input[24]);
+  x1[8] = vqaddq_s16(input[8], input[23]);
+  x1[23] = vqsubq_s16(input[8], input[23]);
+  x1[9] = vqaddq_s16(input[9], input[22]);
+  x1[22] = vqsubq_s16(input[9], input[22]);
+  x1[10] = vqaddq_s16(input[10], input[21]);
+  x1[21] = vqsubq_s16(input[10], input[21]);
+  x1[11] = vqaddq_s16(input[11], input[20]);
+  x1[20] = vqsubq_s16(input[11], input[20]);
+  x1[12] = vqaddq_s16(input[12], input[19]);
+  x1[19] = vqsubq_s16(input[12], input[19]);
+  x1[13] = vqaddq_s16(input[13], input[18]);
+  x1[18] = vqsubq_s16(input[13], input[18]);
+  x1[14] = vqaddq_s16(input[14], input[17]);
+  x1[17] = vqsubq_s16(input[14], input[17]);
+  x1[15] = vqaddq_s16(input[15], input[16]);
+  x1[16] = vqsubq_s16(input[15], input[16]);
+
+  // stage 2
+  int16x8_t x2[32];
+  x2[0] = vqaddq_s16(x1[0], x1[15]);
+  x2[15] = vqsubq_s16(x1[0], x1[15]);
+  x2[1] = vqaddq_s16(x1[1], x1[14]);
+  x2[14] = vqsubq_s16(x1[1], x1[14]);
+  x2[2] = vqaddq_s16(x1[2], x1[13]);
+  x2[13] = vqsubq_s16(x1[2], x1[13]);
+  x2[3] = vqaddq_s16(x1[3], x1[12]);
+  x2[12] = vqsubq_s16(x1[3], x1[12]);
+  x2[4] = vqaddq_s16(x1[4], x1[11]);
+  x2[11] = vqsubq_s16(x1[4], x1[11]);
+  x2[5] = vqaddq_s16(x1[5], x1[10]);
+  x2[10] = vqsubq_s16(x1[5], x1[10]);
+  x2[6] = vqaddq_s16(x1[6], x1[9]);
+  x2[9] = vqsubq_s16(x1[6], x1[9]);
+  x2[7] = vqaddq_s16(x1[7], x1[8]);
+  x2[8] = vqsubq_s16(x1[7], x1[8]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[20], x1[27], x2[20], x2[27],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[21], x1[26], x2[21], x2[26],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[22], x1[25], x2[22], x2[25],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[23], x1[24], x2[23], x2[24],
+                    v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[32];
+  x3[0] = vqaddq_s16(x2[0], x2[7]);
+  x3[7] = vqsubq_s16(x2[0], x2[7]);
+  x3[1] = vqaddq_s16(x2[1], x2[6]);
+  x3[6] = vqsubq_s16(x2[1], x2[6]);
+  x3[2] = vqaddq_s16(x2[2], x2[5]);
+  x3[5] = vqsubq_s16(x2[2], x2[5]);
+  x3[3] = vqaddq_s16(x2[3], x2[4]);
+  x3[4] = vqsubq_s16(x2[3], x2[4]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x2[10], x2[13], x3[10], x3[13],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x2[11], x2[12], x3[11], x3[12],
+                    v_cos_bit);
+
+  x3[16] = vqaddq_s16(x1[16], x2[23]);
+  x3[23] = vqsubq_s16(x1[16], x2[23]);
+  x3[17] = vqaddq_s16(x1[17], x2[22]);
+  x3[22] = vqsubq_s16(x1[17], x2[22]);
+  x3[18] = vqaddq_s16(x1[18], x2[21]);
+  x3[21] = vqsubq_s16(x1[18], x2[21]);
+  x3[19] = vqaddq_s16(x1[19], x2[20]);
+  x3[20] = vqsubq_s16(x1[19], x2[20]);
+  x3[24] = vqsubq_s16(x1[31], x2[24]);
+  x3[31] = vqaddq_s16(x1[31], x2[24]);
+  x3[25] = vqsubq_s16(x1[30], x2[25]);
+  x3[30] = vqaddq_s16(x1[30], x2[25]);
+  x3[26] = vqsubq_s16(x1[29], x2[26]);
+  x3[29] = vqaddq_s16(x1[29], x2[26]);
+  x3[27] = vqsubq_s16(x1[28], x2[27]);
+  x3[28] = vqaddq_s16(x1[28], x2[27]);
+
+  // stage 4
+  int16x8_t x4[32];
+  x4[0] = vqaddq_s16(x3[0], x3[3]);
+  x4[3] = vqsubq_s16(x3[0], x3[3]);
+  x4[1] = vqaddq_s16(x3[1], x3[2]);
+  x4[2] = vqsubq_s16(x3[1], x3[2]);
+  btf_16_neon_mode0(cospi[32], cospi[32], x3[5], x3[6], x4[5], x4[6],
+                    v_cos_bit);
+  x4[8] = vqaddq_s16(x2[8], x3[11]);
+  x4[11] = vqsubq_s16(x2[8], x3[11]);
+  x4[9] = vqaddq_s16(x2[9], x3[10]);
+  x4[10] = vqsubq_s16(x2[9], x3[10]);
+  x4[12] = vqsubq_s16(x2[15], x3[12]);
+  x4[15] = vqaddq_s16(x2[15], x3[12]);
+  x4[13] = vqsubq_s16(x2[14], x3[13]);
+  x4[14] = vqaddq_s16(x2[14], x3[13]);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[18], x3[29], x4[18], x4[29],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[19], x3[28], x4[19], x4[28],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[20], x3[27], x4[20], x4[27],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[21], x3[26], x4[21], x4[26],
+                     v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[32];
+  btf_16_neon_mode3(cospi[32], cospi[32], x4[0], x4[1], output[0], output[16],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[48], cospi[16], x4[2], x4[3], output[8], output[24],
+                    v_cos_bit);
+  x5[4] = vqaddq_s16(x3[4], x4[5]);
+  x5[5] = vqsubq_s16(x3[4], x4[5]);
+  x5[6] = vqsubq_s16(x3[7], x4[6]);
+  x5[7] = vqaddq_s16(x3[7], x4[6]);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x4[9], x4[14], x5[9], x5[14],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x4[10], x4[13], x5[10], x5[13],
+                     v_cos_bit);
+
+  x5[16] = vqaddq_s16(x3[16], x4[19]);
+  x5[19] = vqsubq_s16(x3[16], x4[19]);
+  x5[17] = vqaddq_s16(x3[17], x4[18]);
+  x5[18] = vqsubq_s16(x3[17], x4[18]);
+  x5[20] = vqsubq_s16(x3[23], x4[20]);
+  x5[23] = vqaddq_s16(x3[23], x4[20]);
+  x5[21] = vqsubq_s16(x3[22], x4[21]);
+  x5[22] = vqaddq_s16(x3[22], x4[21]);
+  x5[24] = vqaddq_s16(x3[24], x4[27]);
+  x5[27] = vqsubq_s16(x3[24], x4[27]);
+  x5[25] = vqaddq_s16(x3[25], x4[26]);
+  x5[26] = vqsubq_s16(x3[25], x4[26]);
+  x5[28] = vqsubq_s16(x3[31], x4[28]);
+  x5[31] = vqaddq_s16(x3[31], x4[28]);
+  x5[29] = vqsubq_s16(x3[30], x4[29]);
+  x5[30] = vqaddq_s16(x3[30], x4[29]);
+
+  // stage 6
+  int16x8_t x6[32];
+  btf_16_neon_mode2(cospi[56], cospi[8], x5[4], x5[7], output[4], output[28],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[24], cospi[40], x5[5], x5[6], output[20], output[12],
+                    v_cos_bit);
+  x6[8] = vqaddq_s16(x4[8], x5[9]);
+  x6[9] = vqsubq_s16(x4[8], x5[9]);
+  x6[10] = vqsubq_s16(x4[11], x5[10]);
+  x6[11] = vqaddq_s16(x4[11], x5[10]);
+  x6[12] = vqaddq_s16(x4[12], x5[13]);
+  x6[13] = vqsubq_s16(x4[12], x5[13]);
+  x6[14] = vqsubq_s16(x4[15], x5[14]);
+  x6[15] = vqaddq_s16(x4[15], x5[14]);
+  btf_16_neon_mode0(cospi[8], cospi[56], x5[17], x5[30], x6[17], x6[30],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[56], cospi[8], x5[18], x5[29], x6[18], x6[29],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[40], cospi[24], x5[21], x5[26], x6[21], x6[26],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[24], cospi[40], x5[22], x5[25], x6[22], x6[25],
+                     v_cos_bit);
+
+  // stage 7
+  int16x8_t x7[32];
+  btf_16_neon_mode2(cospi[60], cospi[4], x6[8], x6[15], output[2], output[30],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[28], cospi[36], x6[9], x6[14], output[18], output[14],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[44], cospi[20], x6[10], x6[13], output[10],
+                    output[22], v_cos_bit);
+  btf_16_neon_mode2(cospi[12], cospi[52], x6[11], x6[12], output[26], output[6],
+                    v_cos_bit);
+  x7[16] = vqaddq_s16(x5[16], x6[17]);
+  x7[17] = vqsubq_s16(x5[16], x6[17]);
+  x7[18] = vqsubq_s16(x5[19], x6[18]);
+  x7[19] = vqaddq_s16(x5[19], x6[18]);
+  x7[20] = vqaddq_s16(x5[20], x6[21]);
+  x7[21] = vqsubq_s16(x5[20], x6[21]);
+  x7[22] = vqsubq_s16(x5[23], x6[22]);
+  x7[23] = vqaddq_s16(x5[23], x6[22]);
+  x7[24] = vqaddq_s16(x5[24], x6[25]);
+  x7[25] = vqsubq_s16(x5[24], x6[25]);
+  x7[26] = vqsubq_s16(x5[27], x6[26]);
+  x7[27] = vqaddq_s16(x5[27], x6[26]);
+  x7[28] = vqaddq_s16(x5[28], x6[29]);
+  x7[29] = vqsubq_s16(x5[28], x6[29]);
+  x7[30] = vqsubq_s16(x5[31], x6[30]);
+  x7[31] = vqaddq_s16(x5[31], x6[30]);
+
+  btf_16_neon_mode2(cospi[62], cospi[2], x7[16], x7[31], output[1], output[31],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[30], cospi[34], x7[17], x7[30], output[17],
+                    output[15], v_cos_bit);
+  btf_16_neon_mode2(cospi[46], cospi[18], x7[18], x7[29], output[9], output[23],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[14], cospi[50], x7[19], x7[28], output[25], output[7],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[54], cospi[10], x7[20], x7[27], output[5], output[27],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[22], cospi[42], x7[21], x7[26], output[21],
+                    output[11], v_cos_bit);
+  btf_16_neon_mode2(cospi[38], cospi[26], x7[22], x7[25], output[13],
+                    output[19], v_cos_bit);
+  btf_16_neon_mode2(cospi[6], cospi[58], x7[23], x7[24], output[29], output[3],
+                    v_cos_bit);
+}
+
+void av1_fdct8x64_stage_1234_neon(const int16x8_t *input, int16x8_t *x3,
+                                  int16x8_t *x4, const int32_t *cospi32,
+                                  const int32x4_t *v_cos_bit) {
+  int16x8_t x1[64];
+  int16x8_t x2[64];
+  x1[0] = vqaddq_s16(input[0], input[63]);
+  x1[63] = vqsubq_s16(input[0], input[63]);
+  x1[1] = vqaddq_s16(input[1], input[62]);
+  x1[62] = vqsubq_s16(input[1], input[62]);
+  x1[2] = vqaddq_s16(input[2], input[61]);
+  x1[61] = vqsubq_s16(input[2], input[61]);
+  x1[3] = vqaddq_s16(input[3], input[60]);
+  x1[60] = vqsubq_s16(input[3], input[60]);
+  x1[4] = vqaddq_s16(input[4], input[59]);
+  x1[59] = vqsubq_s16(input[4], input[59]);
+  x1[5] = vqaddq_s16(input[5], input[58]);
+  x1[58] = vqsubq_s16(input[5], input[58]);
+  x1[6] = vqaddq_s16(input[6], input[57]);
+  x1[57] = vqsubq_s16(input[6], input[57]);
+  x1[7] = vqaddq_s16(input[7], input[56]);
+  x1[56] = vqsubq_s16(input[7], input[56]);
+  x1[8] = vqaddq_s16(input[8], input[55]);
+  x1[55] = vqsubq_s16(input[8], input[55]);
+  x1[9] = vqaddq_s16(input[9], input[54]);
+  x1[54] = vqsubq_s16(input[9], input[54]);
+  x1[10] = vqaddq_s16(input[10], input[53]);
+  x1[53] = vqsubq_s16(input[10], input[53]);
+  x1[11] = vqaddq_s16(input[11], input[52]);
+  x1[52] = vqsubq_s16(input[11], input[52]);
+  x1[12] = vqaddq_s16(input[12], input[51]);
+  x1[51] = vqsubq_s16(input[12], input[51]);
+  x1[13] = vqaddq_s16(input[13], input[50]);
+  x1[50] = vqsubq_s16(input[13], input[50]);
+  x1[14] = vqaddq_s16(input[14], input[49]);
+  x1[49] = vqsubq_s16(input[14], input[49]);
+  x1[15] = vqaddq_s16(input[15], input[48]);
+  x1[48] = vqsubq_s16(input[15], input[48]);
+  x1[16] = vqaddq_s16(input[16], input[47]);
+  x1[47] = vqsubq_s16(input[16], input[47]);
+  x1[17] = vqaddq_s16(input[17], input[46]);
+  x1[46] = vqsubq_s16(input[17], input[46]);
+  x1[18] = vqaddq_s16(input[18], input[45]);
+  x1[45] = vqsubq_s16(input[18], input[45]);
+  x1[19] = vqaddq_s16(input[19], input[44]);
+  x1[44] = vqsubq_s16(input[19], input[44]);
+  x1[20] = vqaddq_s16(input[20], input[43]);
+  x1[43] = vqsubq_s16(input[20], input[43]);
+  x1[21] = vqaddq_s16(input[21], input[42]);
+  x1[42] = vqsubq_s16(input[21], input[42]);
+  x1[22] = vqaddq_s16(input[22], input[41]);
+  x1[41] = vqsubq_s16(input[22], input[41]);
+  x1[23] = vqaddq_s16(input[23], input[40]);
+  x1[40] = vqsubq_s16(input[23], input[40]);
+  x1[24] = vqaddq_s16(input[24], input[39]);
+  x1[39] = vqsubq_s16(input[24], input[39]);
+  x1[25] = vqaddq_s16(input[25], input[38]);
+  x1[38] = vqsubq_s16(input[25], input[38]);
+  x1[26] = vqaddq_s16(input[26], input[37]);
+  x1[37] = vqsubq_s16(input[26], input[37]);
+  x1[27] = vqaddq_s16(input[27], input[36]);
+  x1[36] = vqsubq_s16(input[27], input[36]);
+  x1[28] = vqaddq_s16(input[28], input[35]);
+  x1[35] = vqsubq_s16(input[28], input[35]);
+  x1[29] = vqaddq_s16(input[29], input[34]);
+  x1[34] = vqsubq_s16(input[29], input[34]);
+  x1[30] = vqaddq_s16(input[30], input[33]);
+  x1[33] = vqsubq_s16(input[30], input[33]);
+  x1[31] = vqaddq_s16(input[31], input[32]);
+  x1[32] = vqsubq_s16(input[31], input[32]);
+
+  x2[0] = vqaddq_s16(x1[0], x1[31]);
+  x2[31] = vqsubq_s16(x1[0], x1[31]);
+  x2[1] = vqaddq_s16(x1[1], x1[30]);
+  x2[30] = vqsubq_s16(x1[1], x1[30]);
+  x2[2] = vqaddq_s16(x1[2], x1[29]);
+  x2[29] = vqsubq_s16(x1[2], x1[29]);
+  x2[3] = vqaddq_s16(x1[3], x1[28]);
+  x2[28] = vqsubq_s16(x1[3], x1[28]);
+  x2[4] = vqaddq_s16(x1[4], x1[27]);
+  x2[27] = vqsubq_s16(x1[4], x1[27]);
+  x2[5] = vqaddq_s16(x1[5], x1[26]);
+  x2[26] = vqsubq_s16(x1[5], x1[26]);
+  x2[6] = vqaddq_s16(x1[6], x1[25]);
+  x2[25] = vqsubq_s16(x1[6], x1[25]);
+  x2[7] = vqaddq_s16(x1[7], x1[24]);
+  x2[24] = vqsubq_s16(x1[7], x1[24]);
+  x2[8] = vqaddq_s16(x1[8], x1[23]);
+  x2[23] = vqsubq_s16(x1[8], x1[23]);
+  x2[9] = vqaddq_s16(x1[9], x1[22]);
+  x2[22] = vqsubq_s16(x1[9], x1[22]);
+  x2[10] = vqaddq_s16(x1[10], x1[21]);
+  x2[21] = vqsubq_s16(x1[10], x1[21]);
+  x2[11] = vqaddq_s16(x1[11], x1[20]);
+  x2[20] = vqsubq_s16(x1[11], x1[20]);
+  x2[12] = vqaddq_s16(x1[12], x1[19]);
+  x2[19] = vqsubq_s16(x1[12], x1[19]);
+  x2[13] = vqaddq_s16(x1[13], x1[18]);
+  x2[18] = vqsubq_s16(x1[13], x1[18]);
+  x2[14] = vqaddq_s16(x1[14], x1[17]);
+  x2[17] = vqsubq_s16(x1[14], x1[17]);
+  x2[15] = vqaddq_s16(x1[15], x1[16]);
+  x2[16] = vqsubq_s16(x1[15], x1[16]);
+
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[40], x1[55], x2[40], x2[55],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[41], x1[54], x2[41], x2[54],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[42], x1[53], x2[42], x2[53],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[43], x1[52], x2[43], x2[52],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[44], x1[51], x2[44], x2[51],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[45], x1[50], x2[45], x2[50],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[46], x1[49], x2[46], x2[49],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[47], x1[48], x2[47], x2[48],
+                    *v_cos_bit);
+
+  // stage 3
+  x3[0] = vqaddq_s16(x2[0], x2[15]);
+  x3[15] = vqsubq_s16(x2[0], x2[15]);
+  x3[1] = vqaddq_s16(x2[1], x2[14]);
+  x3[14] = vqsubq_s16(x2[1], x2[14]);
+  x3[2] = vqaddq_s16(x2[2], x2[13]);
+  x3[13] = vqsubq_s16(x2[2], x2[13]);
+  x3[3] = vqaddq_s16(x2[3], x2[12]);
+  x3[12] = vqsubq_s16(x2[3], x2[12]);
+  x3[4] = vqaddq_s16(x2[4], x2[11]);
+  x3[11] = vqsubq_s16(x2[4], x2[11]);
+  x3[5] = vqaddq_s16(x2[5], x2[10]);
+  x3[10] = vqsubq_s16(x2[5], x2[10]);
+  x3[6] = vqaddq_s16(x2[6], x2[9]);
+  x3[9] = vqsubq_s16(x2[6], x2[9]);
+  x3[7] = vqaddq_s16(x2[7], x2[8]);
+  x3[8] = vqsubq_s16(x2[7], x2[8]);
+  x3[16] = x2[16];
+  x3[17] = x2[17];
+  x3[18] = x2[18];
+  x3[19] = x2[19];
+  btf_16_neon_mode0(*cospi32, *cospi32, x2[20], x2[27], x3[20], x3[27],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x2[21], x2[26], x3[21], x3[26],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x2[22], x2[25], x3[22], x3[25],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x2[23], x2[24], x3[23], x3[24],
+                    *v_cos_bit);
+  x3[28] = x2[28];
+  x3[29] = x2[29];
+  x3[30] = x2[30];
+  x3[31] = x2[31];
+  x3[32] = vqaddq_s16(x1[32], x2[47]);
+  x3[47] = vqsubq_s16(x1[32], x2[47]);
+  x3[33] = vqaddq_s16(x1[33], x2[46]);
+  x3[46] = vqsubq_s16(x1[33], x2[46]);
+  x3[34] = vqaddq_s16(x1[34], x2[45]);
+  x3[45] = vqsubq_s16(x1[34], x2[45]);
+  x3[35] = vqaddq_s16(x1[35], x2[44]);
+  x3[44] = vqsubq_s16(x1[35], x2[44]);
+  x3[36] = vqaddq_s16(x1[36], x2[43]);
+  x3[43] = vqsubq_s16(x1[36], x2[43]);
+  x3[37] = vqaddq_s16(x1[37], x2[42]);
+  x3[42] = vqsubq_s16(x1[37], x2[42]);
+  x3[38] = vqaddq_s16(x1[38], x2[41]);
+  x3[41] = vqsubq_s16(x1[38], x2[41]);
+  x3[39] = vqaddq_s16(x1[39], x2[40]);
+  x3[40] = vqsubq_s16(x1[39], x2[40]);
+  x3[48] = vqsubq_s16(x1[63], x2[48]);
+  x3[63] = vqaddq_s16(x1[63], x2[48]);
+  x3[49] = vqsubq_s16(x1[62], x2[49]);
+  x3[62] = vqaddq_s16(x1[62], x2[49]);
+  x3[50] = vqsubq_s16(x1[61], x2[50]);
+  x3[61] = vqaddq_s16(x1[61], x2[50]);
+  x3[51] = vqsubq_s16(x1[60], x2[51]);
+  x3[60] = vqaddq_s16(x1[60], x2[51]);
+  x3[52] = vqsubq_s16(x1[59], x2[52]);
+  x3[59] = vqaddq_s16(x1[59], x2[52]);
+  x3[53] = vqsubq_s16(x1[58], x2[53]);
+  x3[58] = vqaddq_s16(x1[58], x2[53]);
+  x3[54] = vqsubq_s16(x1[57], x2[54]);
+  x3[57] = vqaddq_s16(x1[57], x2[54]);
+  x3[55] = vqsubq_s16(x1[56], x2[55]);
+  x3[56] = vqaddq_s16(x1[56], x2[55]);
+
+  // stage 4
+  x4[0] = vqaddq_s16(x3[0], x3[7]);
+  x4[7] = vqsubq_s16(x3[0], x3[7]);
+  x4[1] = vqaddq_s16(x3[1], x3[6]);
+  x4[6] = vqsubq_s16(x3[1], x3[6]);
+  x4[2] = vqaddq_s16(x3[2], x3[5]);
+  x4[5] = vqsubq_s16(x3[2], x3[5]);
+  x4[3] = vqaddq_s16(x3[3], x3[4]);
+  x4[4] = vqsubq_s16(x3[3], x3[4]);
+
+  btf_16_neon_mode0(*cospi32, *cospi32, x3[10], x3[13], x4[10], x4[13],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x3[11], x3[12], x4[11], x4[12],
+                    *v_cos_bit);
+
+  x4[16] = vqaddq_s16(x3[16], x3[23]);
+  x4[23] = vqsubq_s16(x3[16], x3[23]);
+  x4[17] = vqaddq_s16(x3[17], x3[22]);
+  x4[22] = vqsubq_s16(x3[17], x3[22]);
+  x4[18] = vqaddq_s16(x3[18], x3[21]);
+  x4[21] = vqsubq_s16(x3[18], x3[21]);
+  x4[19] = vqaddq_s16(x3[19], x3[20]);
+  x4[20] = vqsubq_s16(x3[19], x3[20]);
+  x4[24] = vqsubq_s16(x3[31], x3[24]);
+  x4[31] = vqaddq_s16(x3[31], x3[24]);
+  x4[25] = vqsubq_s16(x3[30], x3[25]);
+  x4[30] = vqaddq_s16(x3[30], x3[25]);
+  x4[26] = vqsubq_s16(x3[29], x3[26]);
+  x4[29] = vqaddq_s16(x3[29], x3[26]);
+  x4[27] = vqsubq_s16(x3[28], x3[27]);
+  x4[28] = vqaddq_s16(x3[28], x3[27]);
+}
+
+void av1_fdct8x64_neon(const int16x8_t *input, int16x8_t *output,
+                       int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int16x8_t x3[64];
+  int16x8_t x4[64];
+
+  av1_fdct8x64_stage_1234_neon(input, x3, x4, &cospi[32], &v_cos_bit);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
+                     v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[64];
+  x5[0] = vqaddq_s16(x4[0], x4[3]);
+  x5[3] = vqsubq_s16(x4[0], x4[3]);
+  x5[1] = vqaddq_s16(x4[1], x4[2]);
+  x5[2] = vqsubq_s16(x4[1], x4[2]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
+                    v_cos_bit);
+
+  x5[8] = vqaddq_s16(x3[8], x4[11]);
+  x5[11] = vqsubq_s16(x3[8], x4[11]);
+  x5[9] = vqaddq_s16(x3[9], x4[10]);
+  x5[10] = vqsubq_s16(x3[9], x4[10]);
+  x5[12] = vqsubq_s16(x3[15], x4[12]);
+  x5[15] = vqaddq_s16(x3[15], x4[12]);
+  x5[13] = vqsubq_s16(x3[14], x4[13]);
+  x5[14] = vqaddq_s16(x3[14], x4[13]);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
+                     v_cos_bit);
+
+  x5[32] = vqaddq_s16(x3[32], x4[39]);
+  x5[39] = vqsubq_s16(x3[32], x4[39]);
+  x5[33] = vqaddq_s16(x3[33], x4[38]);
+  x5[38] = vqsubq_s16(x3[33], x4[38]);
+  x5[34] = vqaddq_s16(x3[34], x4[37]);
+  x5[37] = vqsubq_s16(x3[34], x4[37]);
+  x5[35] = vqaddq_s16(x3[35], x4[36]);
+  x5[36] = vqsubq_s16(x3[35], x4[36]);
+  x5[40] = vqsubq_s16(x3[47], x4[40]);
+  x5[47] = vqaddq_s16(x3[47], x4[40]);
+  x5[41] = vqsubq_s16(x3[46], x4[41]);
+  x5[46] = vqaddq_s16(x3[46], x4[41]);
+  x5[42] = vqsubq_s16(x3[45], x4[42]);
+  x5[45] = vqaddq_s16(x3[45], x4[42]);
+  x5[43] = vqsubq_s16(x3[44], x4[43]);
+  x5[44] = vqaddq_s16(x3[44], x4[43]);
+  x5[48] = vqaddq_s16(x3[48], x4[55]);
+  x5[55] = vqsubq_s16(x3[48], x4[55]);
+  x5[49] = vqaddq_s16(x3[49], x4[54]);
+  x5[54] = vqsubq_s16(x3[49], x4[54]);
+  x5[50] = vqaddq_s16(x3[50], x4[53]);
+  x5[53] = vqsubq_s16(x3[50], x4[53]);
+  x5[51] = vqaddq_s16(x3[51], x4[52]);
+  x5[52] = vqsubq_s16(x3[51], x4[52]);
+  x5[56] = vqsubq_s16(x3[63], x4[56]);
+  x5[63] = vqaddq_s16(x3[63], x4[56]);
+  x5[57] = vqsubq_s16(x3[62], x4[57]);
+  x5[62] = vqaddq_s16(x3[62], x4[57]);
+  x5[58] = vqsubq_s16(x3[61], x4[58]);
+  x5[61] = vqaddq_s16(x3[61], x4[58]);
+  x5[59] = vqsubq_s16(x3[60], x4[59]);
+  x5[60] = vqaddq_s16(x3[60], x4[59]);
+
+  // stage 6
+  int16x8_t x6[64];
+  btf_16_neon_mode2(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
+                    v_cos_bit);
+  x6[4] = vqaddq_s16(x4[4], x5[5]);
+  x6[5] = vqsubq_s16(x4[4], x5[5]);
+  x6[6] = vqsubq_s16(x4[7], x5[6]);
+  x6[7] = vqaddq_s16(x4[7], x5[6]);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
+                     v_cos_bit);
+
+  x6[16] = vqaddq_s16(x4[16], x5[19]);
+  x6[19] = vqsubq_s16(x4[16], x5[19]);
+  x6[17] = vqaddq_s16(x4[17], x5[18]);
+  x6[18] = vqsubq_s16(x4[17], x5[18]);
+  x6[20] = vqsubq_s16(x4[23], x5[20]);
+  x6[23] = vqaddq_s16(x4[23], x5[20]);
+  x6[21] = vqsubq_s16(x4[22], x5[21]);
+  x6[22] = vqaddq_s16(x4[22], x5[21]);
+  x6[24] = vqaddq_s16(x4[24], x5[27]);
+  x6[27] = vqsubq_s16(x4[24], x5[27]);
+  x6[25] = vqaddq_s16(x4[25], x5[26]);
+  x6[26] = vqsubq_s16(x4[25], x5[26]);
+  x6[28] = vqsubq_s16(x4[31], x5[28]);
+  x6[31] = vqaddq_s16(x4[31], x5[28]);
+  x6[29] = vqsubq_s16(x4[30], x5[29]);
+  x6[30] = vqaddq_s16(x4[30], x5[29]);
+
+  btf_16_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
+                     v_cos_bit);
+
+  // stage 7
+  int16x8_t x7[64];
+
+  btf_16_neon_mode2(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
+  btf_16_neon_mode2(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
+                    v_cos_bit);
+  x7[8] = vqaddq_s16(x5[8], x6[9]);
+  x7[9] = vqsubq_s16(x5[8], x6[9]);
+  x7[10] = vqsubq_s16(x5[11], x6[10]);
+  x7[11] = vqaddq_s16(x5[11], x6[10]);
+  x7[12] = vqaddq_s16(x5[12], x6[13]);
+  x7[13] = vqsubq_s16(x5[12], x6[13]);
+  x7[14] = vqsubq_s16(x5[15], x6[14]);
+  x7[15] = vqaddq_s16(x5[15], x6[14]);
+
+  btf_16_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
+                     v_cos_bit);
+
+  btf_16_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
+                     v_cos_bit);
+
+  x7[32] = vqaddq_s16(x5[32], x6[35]);
+  x7[35] = vqsubq_s16(x5[32], x6[35]);
+  x7[33] = vqaddq_s16(x5[33], x6[34]);
+  x7[34] = vqsubq_s16(x5[33], x6[34]);
+  x7[36] = vqsubq_s16(x5[39], x6[36]);
+  x7[39] = vqaddq_s16(x5[39], x6[36]);
+  x7[37] = vqsubq_s16(x5[38], x6[37]);
+  x7[38] = vqaddq_s16(x5[38], x6[37]);
+  x7[40] = vqaddq_s16(x5[40], x6[43]);
+  x7[43] = vqsubq_s16(x5[40], x6[43]);
+  x7[41] = vqaddq_s16(x5[41], x6[42]);
+  x7[42] = vqsubq_s16(x5[41], x6[42]);
+  x7[44] = vqsubq_s16(x5[47], x6[44]);
+  x7[47] = vqaddq_s16(x5[47], x6[44]);
+  x7[45] = vqsubq_s16(x5[46], x6[45]);
+  x7[46] = vqaddq_s16(x5[46], x6[45]);
+  x7[48] = vqaddq_s16(x5[48], x6[51]);
+  x7[51] = vqsubq_s16(x5[48], x6[51]);
+  x7[49] = vqaddq_s16(x5[49], x6[50]);
+  x7[50] = vqsubq_s16(x5[49], x6[50]);
+  x7[52] = vqsubq_s16(x5[55], x6[52]);
+  x7[55] = vqaddq_s16(x5[55], x6[52]);
+  x7[53] = vqsubq_s16(x5[54], x6[53]);
+  x7[54] = vqaddq_s16(x5[54], x6[53]);
+  x7[56] = vqaddq_s16(x5[56], x6[59]);
+  x7[59] = vqsubq_s16(x5[56], x6[59]);
+  x7[57] = vqaddq_s16(x5[57], x6[58]);
+  x7[58] = vqsubq_s16(x5[57], x6[58]);
+  x7[60] = vqsubq_s16(x5[63], x6[60]);
+  x7[63] = vqaddq_s16(x5[63], x6[60]);
+  x7[61] = vqsubq_s16(x5[62], x6[61]);
+  x7[62] = vqaddq_s16(x5[62], x6[61]);
+
+  // stage 8
+  int16x8_t x8[64];
+
+  btf_16_neon_mode2(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
+                    v_cos_bit);
+  x8[16] = vqaddq_s16(x6[16], x7[17]);
+  x8[17] = vqsubq_s16(x6[16], x7[17]);
+  x8[18] = vqsubq_s16(x6[19], x7[18]);
+  x8[19] = vqaddq_s16(x6[19], x7[18]);
+  x8[20] = vqaddq_s16(x6[20], x7[21]);
+  x8[21] = vqsubq_s16(x6[20], x7[21]);
+  x8[22] = vqsubq_s16(x6[23], x7[22]);
+  x8[23] = vqaddq_s16(x6[23], x7[22]);
+  x8[24] = vqaddq_s16(x6[24], x7[25]);
+  x8[25] = vqsubq_s16(x6[24], x7[25]);
+  x8[26] = vqsubq_s16(x6[27], x7[26]);
+  x8[27] = vqaddq_s16(x6[27], x7[26]);
+  x8[28] = vqaddq_s16(x6[28], x7[29]);
+  x8[29] = vqsubq_s16(x6[28], x7[29]);
+  x8[30] = vqsubq_s16(x6[31], x7[30]);
+  x8[31] = vqaddq_s16(x6[31], x7[30]);
+
+  btf_16_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
+                     v_cos_bit);
+
+  // stage 9
+  int16x8_t x9[64];
+
+  btf_16_neon_mode2(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
+                    v_cos_bit);
+  x9[32] = vqaddq_s16(x7[32], x8[33]);
+  x9[33] = vqsubq_s16(x7[32], x8[33]);
+  x9[34] = vqsubq_s16(x7[35], x8[34]);
+  x9[35] = vqaddq_s16(x7[35], x8[34]);
+  x9[36] = vqaddq_s16(x7[36], x8[37]);
+  x9[37] = vqsubq_s16(x7[36], x8[37]);
+  x9[38] = vqsubq_s16(x7[39], x8[38]);
+  x9[39] = vqaddq_s16(x7[39], x8[38]);
+  x9[40] = vqaddq_s16(x7[40], x8[41]);
+  x9[41] = vqsubq_s16(x7[40], x8[41]);
+  x9[42] = vqsubq_s16(x7[43], x8[42]);
+  x9[43] = vqaddq_s16(x7[43], x8[42]);
+  x9[44] = vqaddq_s16(x7[44], x8[45]);
+  x9[45] = vqsubq_s16(x7[44], x8[45]);
+  x9[46] = vqsubq_s16(x7[47], x8[46]);
+  x9[47] = vqaddq_s16(x7[47], x8[46]);
+  x9[48] = vqaddq_s16(x7[48], x8[49]);
+  x9[49] = vqsubq_s16(x7[48], x8[49]);
+  x9[50] = vqsubq_s16(x7[51], x8[50]);
+  x9[51] = vqaddq_s16(x7[51], x8[50]);
+  x9[52] = vqaddq_s16(x7[52], x8[53]);
+  x9[53] = vqsubq_s16(x7[52], x8[53]);
+  x9[54] = vqsubq_s16(x7[55], x8[54]);
+  x9[55] = vqaddq_s16(x7[55], x8[54]);
+  x9[56] = vqaddq_s16(x7[56], x8[57]);
+  x9[57] = vqsubq_s16(x7[56], x8[57]);
+  x9[58] = vqsubq_s16(x7[59], x8[58]);
+  x9[59] = vqaddq_s16(x7[59], x8[58]);
+  x9[60] = vqaddq_s16(x7[60], x8[61]);
+  x9[61] = vqsubq_s16(x7[60], x8[61]);
+  x9[62] = vqsubq_s16(x7[63], x8[62]);
+  x9[63] = vqaddq_s16(x7[63], x8[62]);
+
+  // stage 10
+  btf_16_neon_mode2(cospi[63], cospi[1], x9[32], x9[63], output[1], output[63],
+                    v_cos_bit);
+
+  btf_16_neon_mode2(cospi[31], cospi[33], x9[33], x9[62], output[33],
+                    output[31], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[47], cospi[17], x9[34], x9[61], output[17],
+                    output[47], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[15], cospi[49], x9[35], x9[60], output[49],
+                    output[15], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[55], cospi[9], x9[36], x9[59], output[9], output[55],
+                    v_cos_bit);
+
+  btf_16_neon_mode2(cospi[23], cospi[41], x9[37], x9[58], output[41],
+                    output[23], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[39], cospi[25], x9[38], x9[57], output[25],
+                    output[39], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[7], cospi[57], x9[39], x9[56], output[57], output[7],
+                    v_cos_bit);
+
+  btf_16_neon_mode2(cospi[59], cospi[5], x9[40], x9[55], output[5], output[59],
+                    v_cos_bit);
+
+  btf_16_neon_mode2(cospi[27], cospi[37], x9[41], x9[54], output[37],
+                    output[27], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[43], cospi[21], x9[42], x9[53], output[21],
+                    output[43], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[11], cospi[53], x9[43], x9[52], output[53],
+                    output[11], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[51], cospi[13], x9[44], x9[51], output[13],
+                    output[51], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[19], cospi[45], x9[45], x9[50], output[45],
+                    output[19], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[35], cospi[29], x9[46], x9[49], output[29],
+                    output[35], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[3], cospi[61], x9[47], x9[48], output[61], output[3],
+                    v_cos_bit);
+
+  // stage 11
+  output[0] = x6[0];
+  output[2] = x9[16];
+  output[4] = x8[8];
+  output[6] = x9[24];
+  output[8] = x7[4];
+  output[10] = x9[20];
+  output[12] = x8[12];
+  output[14] = x9[28];
+  output[16] = x6[2];
+  output[18] = x9[18];
+  output[20] = x8[10];
+  output[22] = x9[26];
+  output[24] = x7[6];
+  output[26] = x9[22];
+  output[28] = x8[14];
+  output[30] = x9[30];
+  output[32] = x6[1];
+  output[34] = x9[17];
+  output[36] = x8[9];
+  output[38] = x9[25];
+  output[40] = x7[5];
+  output[42] = x9[21];
+  output[44] = x8[13];
+  output[46] = x9[29];
+  output[48] = x6[3];
+  output[52] = x8[11];
+  output[54] = x9[27];
+  output[56] = x7[7];
+  output[58] = x9[23];
+  output[60] = x8[15];
+  output[62] = x9[31];
+}
+
+void fadst_8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[4];
+
+  x1[0] = vqnegq_s16(input[7]);
+  x1[1] = vqnegq_s16(input[3]);
+  x1[2] = vqnegq_s16(input[1]);
+  x1[3] = vqnegq_s16(input[5]);
+
+  // stage 2
+  int16x8_t x2[8];
+
+  btf_16_neon_mode3(cospi[32], cospi[32], x1[1], input[4], x2[2], x2[3],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[32], cospi[32], input[2], x1[3], x2[6], x2[7],
+                    v_cos_bit);
+  // stage 3
+  int16x8_t x3[8];
+  x3[0] = vqaddq_s16(input[0], x2[2]);
+  x3[2] = vqsubq_s16(input[0], x2[2]);
+  x3[1] = vqaddq_s16(x1[0], x2[3]);
+  x3[3] = vqsubq_s16(x1[0], x2[3]);
+  x3[4] = vqaddq_s16(x1[2], x2[6]);
+  x3[6] = vqsubq_s16(x1[2], x2[6]);
+  x3[5] = vqaddq_s16(input[6], x2[7]);
+  x3[7] = vqsubq_s16(input[6], x2[7]);
+
+  // stage 4
+  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
+                    v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[8];
+  x5[0] = vqaddq_s16(x3[0], x3[4]);
+  x5[4] = vqsubq_s16(x3[0], x3[4]);
+  x5[1] = vqaddq_s16(x3[1], x3[5]);
+  x5[5] = vqsubq_s16(x3[1], x3[5]);
+  x5[2] = vqaddq_s16(x3[2], x3[6]);
+  x5[6] = vqsubq_s16(x3[2], x3[6]);
+  x5[3] = vqaddq_s16(x3[3], x3[7]);
+  x5[7] = vqsubq_s16(x3[3], x3[7]);
+
+  // stage 6
+  btf_16_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
+                    v_cos_bit);
+}
+
+static void fadst8x16_neon(const int16x8_t *input, int16x8_t *output,
+                           int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[12];
+  x1[0] = vqnegq_s16(input[15]);
+  x1[1] = vqnegq_s16(input[3]);
+  x1[2] = vqnegq_s16(input[1]);
+  x1[3] = vqnegq_s16(input[13]);
+
+  // stage 2
+  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[7], input[8],
+              x1[4], x1[5]);
+  btf_16_neon_mode1(cospi[32], cospi[32], input[4], input[11], x1[6], x1[7],
+                    v_cos_bit);
+  btf_16_neon_mode1(cospi[32], cospi[32], input[6], input[9], x1[8], x1[9],
+                    v_cos_bit);
+  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[5],
+              input[10], x1[10], x1[11]);
+  // stage 3
+  int16x8_t x3[16];
+  x3[0] = vqaddq_s16(input[0], x1[4]);
+  x3[2] = vqsubq_s16(input[0], x1[4]);
+  x3[1] = vqaddq_s16(x1[0], x1[5]);
+  x3[3] = vqsubq_s16(x1[0], x1[5]);
+  x3[4] = vqaddq_s16(x1[1], x1[6]);
+  x3[6] = vqsubq_s16(x1[1], x1[6]);
+  x3[5] = vqaddq_s16(input[12], x1[7]);
+  x3[7] = vqsubq_s16(input[12], x1[7]);
+  x3[8] = vqaddq_s16(x1[2], x1[8]);
+  x3[10] = vqsubq_s16(x1[2], x1[8]);
+  x3[9] = vqaddq_s16(input[14], x1[9]);
+  x3[11] = vqsubq_s16(input[14], x1[9]);
+  x3[12] = vqaddq_s16(input[2], x1[10]);
+  x3[14] = vqsubq_s16(input[2], x1[10]);
+  x3[13] = vqaddq_s16(x1[3], x1[11]);
+  x3[15] = vqsubq_s16(x1[3], x1[11]);
+
+  // stage 4
+  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[16], cospi[48], x3[12], x3[13], x3[12], x3[13],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[48], cospi[16], x3[14], x3[15], x3[14], x3[15],
+                    v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[16];
+  x5[0] = vqaddq_s16(x3[0], x3[4]);
+  x5[4] = vqsubq_s16(x3[0], x3[4]);
+  x5[1] = vqaddq_s16(x3[1], x3[5]);
+  x5[5] = vqsubq_s16(x3[1], x3[5]);
+  x5[2] = vqaddq_s16(x3[2], x3[6]);
+  x5[6] = vqsubq_s16(x3[2], x3[6]);
+  x5[3] = vqaddq_s16(x3[3], x3[7]);
+  x5[7] = vqsubq_s16(x3[3], x3[7]);
+  x5[8] = vqaddq_s16(x3[8], x3[12]);
+  x5[12] = vqsubq_s16(x3[8], x3[12]);
+  x5[9] = vqaddq_s16(x3[9], x3[13]);
+  x5[13] = vqsubq_s16(x3[9], x3[13]);
+  x5[10] = vqaddq_s16(x3[10], x3[14]);
+  x5[14] = vqsubq_s16(x3[10], x3[14]);
+  x5[11] = vqaddq_s16(x3[11], x3[15]);
+  x5[15] = vqsubq_s16(x3[11], x3[15]);
+
+  // stage 6
+  btf_16_neon_mode3(cospi[8], cospi[56], x5[8], x5[9], x5[8], x5[9], v_cos_bit);
+  btf_16_neon_mode3(cospi[40], cospi[24], x5[10], x5[11], x5[10], x5[11],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[56], cospi[8], x5[12], x5[13], x5[12], x5[13],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[24], cospi[40], x5[14], x5[15], x5[14], x5[15],
+                    v_cos_bit);
+
+  // stage 7
+  int16x8_t x7[16];
+  x7[0] = vqaddq_s16(x5[0], x5[8]);
+  x7[8] = vqsubq_s16(x5[0], x5[8]);
+  x7[1] = vqaddq_s16(x5[1], x5[9]);
+  x7[9] = vqsubq_s16(x5[1], x5[9]);
+  x7[2] = vqaddq_s16(x5[2], x5[10]);
+  x7[10] = vqsubq_s16(x5[2], x5[10]);
+  x7[3] = vqaddq_s16(x5[3], x5[11]);
+  x7[11] = vqsubq_s16(x5[3], x5[11]);
+  x7[4] = vqaddq_s16(x5[4], x5[12]);
+  x7[12] = vqsubq_s16(x5[4], x5[12]);
+  x7[5] = vqaddq_s16(x5[5], x5[13]);
+  x7[13] = vqsubq_s16(x5[5], x5[13]);
+  x7[6] = vqaddq_s16(x5[6], x5[14]);
+  x7[14] = vqsubq_s16(x5[6], x5[14]);
+  x7[7] = vqaddq_s16(x5[7], x5[15]);
+  x7[15] = vqsubq_s16(x5[7], x5[15]);
+
+  // stage 8
+  btf_16_neon_mode3(cospi[2], cospi[62], x7[0], x7[1], output[15], output[0],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[10], cospi[54], x7[2], x7[3], output[13], output[2],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[18], cospi[46], x7[4], x7[5], output[11], output[4],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[26], cospi[38], x7[6], x7[7], output[9], output[6],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[34], cospi[30], x7[8], x7[9], output[7], output[8],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[42], cospi[22], x7[10], x7[11], output[5], output[10],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[50], cospi[14], x7[12], x7[13], output[3], output[12],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[58], cospi[6], x7[14], x7[15], output[1], output[14],
+                    v_cos_bit);
+}
+
+void av1_fidentity4x4_neon(const int16x8_t *const input,
+                           int16x8_t *const output, const int8_t cos_bit,
+                           const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
+  for (int i = 0; i < 4; ++i) {
+    const int16x4_t b = vqrshrn_n_s32(
+        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    output[i] = vcombine_s16(b, b);
+  }
+}
+
+static INLINE void fidentity8x4_neon(const int16x8_t *const input,
+                                     int16x8_t *const output,
+                                     const int8_t cos_bit,
+                                     const int8_t *stage_range) {
+  (void)stage_range;
+  (void)cos_bit;
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
+  for (int i = 0; i < 4; ++i) {
+    const int16x4_t b_lo = vqrshrn_n_s32(
+        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    const int16x4_t b_hi = vqrshrn_n_s32(
+        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    output[i] = vcombine_s16(b_lo, b_hi);
+  }
+}
+
+void fidentity8x8_neon(const int16x8_t *input, int16x8_t *output,
+                       int8_t cos_bit, const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  int16x8_t one = vdupq_n_s16(1);
+  output[0] = vqrshlq_s16(input[0], one);
+  output[1] = vqrshlq_s16(input[1], one);
+  output[2] = vqrshlq_s16(input[2], one);
+  output[3] = vqrshlq_s16(input[3], one);
+  output[4] = vqrshlq_s16(input[4], one);
+  output[5] = vqrshlq_s16(input[5], one);
+  output[6] = vqrshlq_s16(input[6], one);
+  output[7] = vqrshlq_s16(input[7], one);
+}
+
+static INLINE void fidentity8x16_neon(const int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit,
+                                      const int8_t *stage_range) {
+  (void)stage_range;
+  (void)cos_bit;
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2 * 2);
+  for (int i = 0; i < 16; ++i) {
+    const int16x4_t b_lo = vqrshrn_n_s32(
+        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    const int16x4_t b_hi = vqrshrn_n_s32(
+        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    output[i] = vcombine_s16(b_lo, b_hi);
+  }
+}
+
+static INLINE void fidentity8x32_neon(const int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit,
+                                      const int8_t *stage_range) {
+  (void)stage_range;
+  (void)cos_bit;
+  for (int i = 0; i < 32; ++i) {
+    output[i] = vshlq_n_s16(input[i], 2);
+  }
+}
+
+typedef void (*transform_1d_lbd_neon)(const int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit,
+                                      const int8_t *stage_range);
+
+static const transform_1d_lbd_neon col_txfm4x4_arr[TX_TYPES] = {
+  av1_fdct4x4_neon,       // DCT_DCT
+  av1_fadst4x4_neon,      // ADST_DCT
+  av1_fdct4x4_neon,       // DCT_ADST
+  av1_fadst4x4_neon,      // ADST_ADST
+  av1_fadst4x4_neon,      // FLIPADST_DCT
+  av1_fdct4x4_neon,       // DCT_FLIPADST
+  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
+  av1_fadst4x4_neon,      // ADST_FLIPADST
+  av1_fadst4x4_neon,      // FLIPADST_ADST
+  av1_fidentity4x4_neon,  // IDTX
+  av1_fdct4x4_neon,       // V_DCT
+  av1_fidentity4x4_neon,  // H_DCT
+  av1_fadst4x4_neon,      // V_ADST
+  av1_fidentity4x4_neon,  // H_ADST
+  av1_fadst4x4_neon,      // V_FLIPADST
+  av1_fidentity4x4_neon   // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm4x4_arr[TX_TYPES] = {
+  av1_fdct4x4_neon,       // DCT_DCT
+  av1_fdct4x4_neon,       // ADST_DCT
+  av1_fadst4x4_neon,      // DCT_ADST
+  av1_fadst4x4_neon,      // ADST_ADST
+  av1_fdct4x4_neon,       // FLIPADST_DCT
+  av1_fadst4x4_neon,      // DCT_FLIPADST
+  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
+  av1_fadst4x4_neon,      // ADST_FLIPADST
+  av1_fadst4x4_neon,      // FLIPADST_ADST
+  av1_fidentity4x4_neon,  // IDTX
+  av1_fidentity4x4_neon,  // V_DCT
+  av1_fdct4x4_neon,       // H_DCT
+  av1_fidentity4x4_neon,  // V_ADST
+  av1_fadst4x4_neon,      // H_ADST
+  av1_fidentity4x4_neon,  // V_FLIPADST
+  av1_fadst4x4_neon       // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_neon,       // DCT_DCT
+  fadst4x8_neon,      // ADST_DCT
+  fdct4x8_neon,       // DCT_ADST
+  fadst4x8_neon,      // ADST_ADST
+  fadst4x8_neon,      // FLIPADST_DCT
+  fdct4x8_neon,       // DCT_FLIPADST
+  fadst4x8_neon,      // FLIPADST_FLIPADST
+  fadst4x8_neon,      // ADST_FLIPADST
+  fadst4x8_neon,      // FLIPADST_ADST
+  fidentity8x8_neon,  // IDTX
+  fdct4x8_neon,       // V_DCT
+  fidentity8x8_neon,  // H_DCT
+  fadst4x8_neon,      // V_ADST
+  fidentity8x8_neon,  // H_ADST
+  fadst4x8_neon,      // V_FLIPADST
+  fidentity8x8_neon   // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_neon,       // DCT_DCT
+  fdct8x4_neon,       // ADST_DCT
+  fadst8x4_neon,      // DCT_ADST
+  fadst8x4_neon,      // ADST_ADST
+  fdct8x4_neon,       // FLIPADST_DCT
+  fadst8x4_neon,      // DCT_FLIPADST
+  fadst8x4_neon,      // FLIPADST_FLIPADST
+  fadst8x4_neon,      // ADST_FLIPADST
+  fadst8x4_neon,      // FLIPADST_ADST
+  fidentity8x4_neon,  // IDTX
+  fidentity8x4_neon,  // V_DCT
+  fdct8x4_neon,       // H_DCT
+  fidentity8x4_neon,  // V_ADST
+  fadst8x4_neon,      // H_ADST
+  fidentity8x4_neon,  // V_FLIPADST
+  fadst8x4_neon       // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_neon,       // DCT_DCT
+  fadst8x4_neon,      // ADST_DCT
+  fdct8x4_neon,       // DCT_ADST
+  fadst8x4_neon,      // ADST_ADST
+  fadst8x4_neon,      // FLIPADST_DCT
+  fdct8x4_neon,       // DCT_FLIPADST
+  fadst8x4_neon,      // FLIPADST_FLIPADST
+  fadst8x4_neon,      // ADST_FLIPADST
+  fadst8x4_neon,      // FLIPADST_ADST
+  fidentity8x4_neon,  // IDTX
+  fdct8x4_neon,       // V_DCT
+  fidentity8x4_neon,  // H_DCT
+  fadst8x4_neon,      // V_ADST
+  fidentity8x4_neon,  // H_ADST
+  fadst8x4_neon,      // V_FLIPADST
+  fidentity8x4_neon   // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_neon,       // DCT_DCT
+  fdct4x8_neon,       // ADST_DCT
+  fadst4x8_neon,      // DCT_ADST
+  fadst4x8_neon,      // ADST_ADST
+  fdct4x8_neon,       // FLIPADST_DCT
+  fadst4x8_neon,      // DCT_FLIPADST
+  fadst4x8_neon,      // FLIPADST_FLIPADST
+  fadst4x8_neon,      // ADST_FLIPADST
+  fadst4x8_neon,      // FLIPADST_ADST
+  fidentity8x8_neon,  // IDTX
+  fidentity8x8_neon,  // V_DCT
+  fdct4x8_neon,       // H_DCT
+  fidentity8x8_neon,  // V_ADST
+  fadst4x8_neon,      // H_ADST
+  fidentity8x8_neon,  // V_FLIPADST
+  fadst4x8_neon       // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_neon,       // DCT_DCT
+  fadst_8x8_neon,     // ADST_DCT
+  fdct8x8_neon,       // DCT_ADST
+  fadst_8x8_neon,     // ADST_ADST
+  fadst_8x8_neon,     // FLIPADST_DCT
+  fdct8x8_neon,       // DCT_FLIPADST
+  fadst_8x8_neon,     // FLIPADST_FLIPADST
+  fadst_8x8_neon,     // ADST_FLIPADST
+  fadst_8x8_neon,     // FLIPADST_ADST
+  fidentity8x8_neon,  // IDTX
+  fdct8x8_neon,       // V_DCT
+  fidentity8x8_neon,  // H_DCT
+  fadst_8x8_neon,     // V_ADST
+  fidentity8x8_neon,  // H_ADST
+  fadst_8x8_neon,     // V_FLIPADST
+  fidentity8x8_neon,  // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_neon,       // DCT_DCT
+  fdct8x8_neon,       // ADST_DCT
+  fadst_8x8_neon,     // DCT_ADST
+  fadst_8x8_neon,     // ADST_ADST
+  fdct8x8_neon,       // FLIPADST_DCT
+  fadst_8x8_neon,     // DCT_FLIPADST
+  fadst_8x8_neon,     // FLIPADST_FLIPADST
+  fadst_8x8_neon,     // ADST_FLIPADST
+  fadst_8x8_neon,     // FLIPADST_ADST
+  fidentity8x8_neon,  // IDTX
+  fidentity8x8_neon,  // V_DCT
+  fdct8x8_neon,       // H_DCT
+  fidentity8x8_neon,  // V_ADST
+  fadst_8x8_neon,     // H_ADST
+  fidentity8x8_neon,  // V_FLIPADST
+  fadst_8x8_neon      // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_neon,       // DCT_DCT
+  fadst8x16_neon,      // ADST_DCT
+  fdct8x16_neon,       // DCT_ADST
+  fadst8x16_neon,      // ADST_ADST
+  fadst8x16_neon,      // FLIPADST_DCT
+  fdct8x16_neon,       // DCT_FLIPADST
+  fadst8x16_neon,      // FLIPADST_FLIPADST
+  fadst8x16_neon,      // ADST_FLIPADST
+  fadst8x16_neon,      // FLIPADST_ADST
+  fidentity8x16_neon,  // IDTX
+  fdct8x16_neon,       // V_DCT
+  fidentity8x16_neon,  // H_DCT
+  fadst8x16_neon,      // V_ADST
+  fidentity8x16_neon,  // H_ADST
+  fadst8x16_neon,      // V_FLIPADST
+  fidentity8x16_neon   // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_neon,       // DCT_DCT
+  fdct8x16_neon,       // ADST_DCT
+  fadst8x16_neon,      // DCT_ADST
+  fadst8x16_neon,      // ADST_ADST
+  fdct8x16_neon,       // FLIPADST_DCT
+  fadst8x16_neon,      // DCT_FLIPADST
+  fadst8x16_neon,      // FLIPADST_FLIPADST
+  fadst8x16_neon,      // ADST_FLIPADST
+  fadst8x16_neon,      // FLIPADST_ADST
+  fidentity8x16_neon,  // IDTX
+  fidentity8x16_neon,  // V_DCT
+  fdct8x16_neon,       // H_DCT
+  fidentity8x16_neon,  // V_ADST
+  fadst8x16_neon,      // H_ADST
+  fidentity8x16_neon,  // V_FLIPADST
+  fadst8x16_neon       // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct8x32_neon,   // DCT_DCT
+  NULL,                // ADST_DCT
+  NULL,                // DCT_ADST
+  NULL,                // ADST_ADST
+  NULL,                // FLIPADST_DCT
+  NULL,                // DCT_FLIPADST
+  NULL,                // FLIPADST_FLIPADST
+  NULL,                // ADST_FLIPADST
+  NULL,                // FLIPADST_ADST
+  fidentity8x32_neon,  // IDTX
+  fidentity8x32_neon,  // V_DCT
+  av1_fdct8x32_neon,   // H_DCT
+  NULL,                // V_ADST
+  NULL,                // H_ADST
+  NULL,                // V_FLIPADST
+  NULL                 // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct8x32_neon,   // DCT_DCT
+  NULL,                // ADST_DCT
+  NULL,                // DCT_ADST
+  NULL,                // ADST_ADST
+  NULL,                // FLIPADST_DCT
+  NULL,                // DCT_FLIPADST
+  NULL,                // FLIPADST_FLIPADST
+  NULL,                // ADST_FLIPADST
+  NULL,                // FLIPADST_ADST
+  fidentity8x32_neon,  // IDTX
+  av1_fdct8x32_neon,   // V_DCT
+  fidentity8x32_neon,  // H_DCT
+  NULL,                // V_ADST
+  NULL,                // H_ADST
+  NULL,                // V_FLIPADST
+  NULL                 // H_FLIPADST
+};
+
+void av1_lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[4], buf1[4], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 4;
+  const transform_1d_lbd_neon col_txfm = col_txfm4x4_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm4x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_4x4(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+
+  transpose_16bit_4x4(buf, buf);
+  store_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)stride;
+  (void)bd;
+  int16x8_t buf0[8], buf1[8], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+  const int txw_idx = get_txw_idx(TX_4X8);
+  const int txh_idx = get_txh_idx(TX_4X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 8;
+  const transform_1d_lbd_neon col_txfm = col_txfm4x8_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_4x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_8x4(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
+  const int txw_idx = get_txw_idx(TX_4X16);
+  const int txh_idx = get_txh_idx(TX_4X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 16;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_4x8(buf0, buf1);
+  transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+
+  for (int i = 0; i < 2; i++) {
+    int16x8_t *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_neon(buf1 + 8 * i, buf, width);
+    } else {
+      buf = buf1 + 8 * i;
+    }
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift2);
+    transpose_16bit_8x4(buf, buf);
+    store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[8], buf1[8], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+  const int txw_idx = get_txw_idx(TX_8X4);
+  const int txh_idx = get_txh_idx(TX_8X4);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 4;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm4x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip)
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  else
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_8x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[8], buf1[8], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 8;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip)
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  else
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_8x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_8x8(buf, buf);
+  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 16;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+  for (int i = 0; i < 2; i++) {
+    int16x8_t *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_neon(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift2);
+    transpose_16bit_8x8(buf, buf);
+    store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+  const int txw_idx = get_txw_idx(TX_8X32);
+  const int txh_idx = get_txh_idx(TX_8X32);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 32;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+  transpose_16bit_8x8(buf0 + 16, buf1 + 16);
+  transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+
+  for (int i = 0; i < 4; i++) {
+    int16x8_t *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_neon(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift2);
+    transpose_16bit_8x8(buf, buf);
+    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
+  const int txw_idx = get_txw_idx(TX_16X4);
+  const int txh_idx = get_txh_idx(TX_16X4);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 4;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+  int16x8_t *buf;
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit_vector(buf0, height, &v_shift0);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift1);
+    transpose_16bit_8x4(buf0, buf1 + 8 * i);
+  }
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_4x8(buf, buf);
+  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+  transpose_16bit_4x8(buf + 8, buf + 8);
+  store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 8;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+  int16x8_t *buf;
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit_vector(buf0, height, &v_shift0);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift1);
+    transpose_16bit_8x8(buf0, buf1 + 8 * i);
+  }
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+  transpose_16bit_8x8(buf + 8, buf + 8);
+  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 16;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit_vector(buf0, height, &v_shift0);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift1);
+    transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+  }
+
+  for (int i = 0; i < 2; i++) {
+    int16x8_t *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_neon(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift2);
+    transpose_16bit_8x8(buf, buf);
+    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+    transpose_16bit_8x8(buf + 8, buf + 8);
+    store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                   8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
+  const int txw_idx = get_txw_idx(TX_16X32);
+  const int txh_idx = get_txh_idx(TX_16X32);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 32;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+
+    for (int i = 0; i < 2; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit_vector(buf0, height, &v_shift0);
+      col_txfm(buf0, buf0, cos_bit_col, NULL);
+      round_shift_16bit_vector(buf0, height, &v_shift1);
+      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      int16x8_t *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_neon(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row, NULL);
+      round_shift_16bit_vector(buf0, height, &v_shift2);
+      transpose_16bit_8x8(buf, buf);
+      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                          8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+                                          width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+  const int txw_idx = get_txw_idx(TX_32X8);
+  const int txh_idx = get_txh_idx(TX_32X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 8;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit_vector(buf0, height, &v_shift0);
+      col_txfm(buf0, buf0, cos_bit_col, NULL);
+      round_shift_16bit_vector(buf0, height, &v_shift1);
+      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 1; i++) {
+      int16x8_t *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_neon(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row, NULL);
+      round_shift_16bit_vector(buf, width, &v_shift2);
+      transpose_16bit_8x8(buf, buf);
+      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                     height);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                     height);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                     width, height);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                     width, height);
+    }
+  } else {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 16;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit_vector(buf0, height, &v_shift0);
+      col_txfm(buf0, buf0, cos_bit_col, NULL);
+      round_shift_16bit_vector(buf0, height, &v_shift1);
+      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 2; i++) {
+      int16x8_t *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_neon(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row, NULL);
+      round_shift_16bit_vector(buf, width, &v_shift2);
+      transpose_16bit_8x8(buf, buf);
+      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                          8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+                                          width, 8);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                          width, 8);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                          width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[128];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
+  const int txw_idx = get_txw_idx(TX_32X32);
+  const int txh_idx = get_txh_idx(TX_32X32);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 32;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit(buf0, height, shift[0]);
+      col_txfm(buf0, buf0, cos_bit_col, NULL);
+      round_shift_16bit(buf0, height, shift[1]);
+      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      int16x8_t *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_neon(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row, NULL);
+      round_shift_16bit(buf, width, shift[2]);
+      transpose_16bit_8x8(buf, buf);
+      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                     8);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                     width, 8);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                     width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X16;
+  int16x8_t buf0[64], buf1[128];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = fdct8x16_neon;
+  const transform_1d_lbd_neon row_txfm = av1_fdct8x64_neon;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < height_div8; ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < height_div8; i++) {
+    int16x8_t *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit(buf, width, shift[2]);
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < 4; ++j) {
+      int16x8_t *buf8 = buf + 8 * j;
+      transpose_16bit_8x8(buf8, buf8);
+      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8);
+    }
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_16X64;
+  int16x8_t buf0[64], buf1[128];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
+  const transform_1d_lbd_neon row_txfm = fdct8x16_neon;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < height_div8; ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    int16x8_t *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit(buf, width, shift[2]);
+    int32_t *output8 = output + 8 * width * i;
+    for (int j = 0; j < width_div8; ++j) {
+      int16x8_t *buf8 = buf + 8 * j;
+      transpose_16bit_8x8(buf8, buf8);
+      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8);
+    }
+  }
+  // Zero out the bottom 16x32 area.
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+#define TRANSPOSE_4X4_L32(x0, x1, x2, x3, y0, y1, y2, y3)      \
+  do {                                                         \
+    int32x4x2_t temp01 = vzipq_s32(x0, x1);                    \
+    int32x4x2_t temp23 = vzipq_s32(x2, x3);                    \
+    int32x4x2_t y01 = vzipq_s32(temp01.val[0], temp23.val[0]); \
+    int32x4x2_t y23 = vzipq_s32(temp01.val[1], temp23.val[1]); \
+    y0 = y01.val[0];                                           \
+    y1 = y01.val[1];                                           \
+    y2 = y23.val[0];                                           \
+    y3 = y23.val[1];                                           \
+  } while (0)
+
+static INLINE void transpose_32_4x4x2(int stride, const int32x4_t *inputA,
+                                      const int32x4_t *inputB,
+                                      int32x4_t *output) {
+  TRANSPOSE_4X4_L32(inputA[0], inputA[2], inputA[1], inputA[3],
+                    output[0 * stride], output[1 * stride], output[2 * stride],
+                    output[3 * stride]);
+  TRANSPOSE_4X4_L32(inputB[0], inputB[2], inputB[1], inputB[3],
+                    output[4 * stride], output[5 * stride], output[6 * stride],
+                    output[7 * stride]);
+}
+
+static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
+                                int cos_bit, const int stride,
+                                const int8_t *stage_range) {
+  (void)stage_range;
+  int32x4_t buf0[32];
+  int32x4_t buf1[32];
+  const int32_t *cospi;
+  cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int startidx = 0 * stride;
+  int endidx = 31 * stride;
+  // stage 0
+  // stage 1
+  buf1[0] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[31] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[1] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[30] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[2] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[29] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[3] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[28] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[4] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[27] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[5] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[26] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[6] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[25] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[7] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[24] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[8] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[23] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[9] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[22] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[10] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[21] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[11] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[20] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[12] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[19] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[13] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[18] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[14] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[17] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[15] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[16] = vsubq_s32(input[startidx], input[endidx]);
+
+  // stage 2
+  buf0[0] = vaddq_s32(buf1[0], buf1[15]);
+  buf0[15] = vsubq_s32(buf1[0], buf1[15]);
+  buf0[1] = vaddq_s32(buf1[1], buf1[14]);
+  buf0[14] = vsubq_s32(buf1[1], buf1[14]);
+  buf0[2] = vaddq_s32(buf1[2], buf1[13]);
+  buf0[13] = vsubq_s32(buf1[2], buf1[13]);
+  buf0[3] = vaddq_s32(buf1[3], buf1[12]);
+  buf0[12] = vsubq_s32(buf1[3], buf1[12]);
+  buf0[4] = vaddq_s32(buf1[4], buf1[11]);
+  buf0[11] = vsubq_s32(buf1[4], buf1[11]);
+  buf0[5] = vaddq_s32(buf1[5], buf1[10]);
+  buf0[10] = vsubq_s32(buf1[5], buf1[10]);
+  buf0[6] = vaddq_s32(buf1[6], buf1[9]);
+  buf0[9] = vsubq_s32(buf1[6], buf1[9]);
+  buf0[7] = vaddq_s32(buf1[7], buf1[8]);
+  buf0[8] = vsubq_s32(buf1[7], buf1[8]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  buf0[18] = buf1[18];
+  buf0[19] = buf1[19];
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                    buf0[24], v_cos_bit);
+  buf0[28] = buf1[28];
+  buf0[29] = buf1[29];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 3
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = vaddq_s32(buf0[0], buf0[7]);
+  buf1[7] = vsubq_s32(buf0[0], buf0[7]);
+  buf1[1] = vaddq_s32(buf0[1], buf0[6]);
+  buf1[6] = vsubq_s32(buf0[1], buf0[6]);
+  buf1[2] = vaddq_s32(buf0[2], buf0[5]);
+  buf1[5] = vsubq_s32(buf0[2], buf0[5]);
+  buf1[3] = vaddq_s32(buf0[3], buf0[4]);
+  buf1[4] = vsubq_s32(buf0[3], buf0[4]);
+  buf1[8] = buf0[8];
+  buf1[9] = buf0[9];
+  btf_32_neon_mode0(cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                    buf1[12], v_cos_bit);
+  buf1[14] = buf0[14];
+  buf1[15] = buf0[15];
+  buf1[16] = vaddq_s32(buf0[16], buf0[23]);
+  buf1[23] = vsubq_s32(buf0[16], buf0[23]);
+  buf1[17] = vaddq_s32(buf0[17], buf0[22]);
+  buf1[22] = vsubq_s32(buf0[17], buf0[22]);
+  buf1[18] = vaddq_s32(buf0[18], buf0[21]);
+  buf1[21] = vsubq_s32(buf0[18], buf0[21]);
+  buf1[19] = vaddq_s32(buf0[19], buf0[20]);
+  buf1[20] = vsubq_s32(buf0[19], buf0[20]);
+  buf1[24] = vsubq_s32(buf0[31], buf0[24]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[24]);
+  buf1[25] = vsubq_s32(buf0[30], buf0[25]);
+  buf1[30] = vaddq_s32(buf0[30], buf0[25]);
+  buf1[26] = vsubq_s32(buf0[29], buf0[26]);
+  buf1[29] = vaddq_s32(buf0[29], buf0[26]);
+  buf1[27] = vsubq_s32(buf0[28], buf0[27]);
+  buf1[28] = vaddq_s32(buf0[28], buf0[27]);
+
+  // stage 4
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = vaddq_s32(buf1[0], buf1[3]);
+  buf0[3] = vsubq_s32(buf1[0], buf1[3]);
+  buf0[1] = vaddq_s32(buf1[1], buf1[2]);
+  buf0[2] = vsubq_s32(buf1[1], buf1[2]);
+  buf0[4] = buf1[4];
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+                    v_cos_bit);
+  buf0[7] = buf1[7];
+  buf0[8] = vaddq_s32(buf1[8], buf1[11]);
+  buf0[11] = vsubq_s32(buf1[8], buf1[11]);
+  buf0[9] = vaddq_s32(buf1[9], buf1[10]);
+  buf0[10] = vsubq_s32(buf1[9], buf1[10]);
+  buf0[12] = vsubq_s32(buf1[15], buf1[12]);
+  buf0[15] = vaddq_s32(buf1[15], buf1[12]);
+  buf0[13] = vsubq_s32(buf1[14], buf1[13]);
+  buf0[14] = vaddq_s32(buf1[14], buf1[13]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  btf_32_neon_mode0(cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                    buf0[28], v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], buf1[20], buf1[27], buf0[20],
+                     buf0[27], v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], buf1[21], buf1[26], buf0[21],
+                     buf0[26], v_cos_bit);
+  buf0[22] = buf1[22];
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[25] = buf1[25];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 5
+  cospi = cospi_arr(cos_bit);
+  btf_32_neon(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+              v_cos_bit);
+  btf_32_type1_neon(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+                    v_cos_bit);
+  buf1[4] = vaddq_s32(buf0[4], buf0[5]);
+  buf1[5] = vsubq_s32(buf0[4], buf0[5]);
+  buf1[6] = vsubq_s32(buf0[7], buf0[6]);
+  buf1[7] = vaddq_s32(buf0[7], buf0[6]);
+  buf1[8] = buf0[8];
+  btf_32_neon_mode0(cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], buf0[10], buf0[13], buf1[10],
+                     buf1[13], v_cos_bit);
+  buf1[11] = buf0[11];
+  buf1[12] = buf0[12];
+  buf1[15] = buf0[15];
+  buf1[16] = vaddq_s32(buf0[16], buf0[19]);
+  buf1[19] = vsubq_s32(buf0[16], buf0[19]);
+  buf1[17] = vaddq_s32(buf0[17], buf0[18]);
+  buf1[18] = vsubq_s32(buf0[17], buf0[18]);
+  buf1[20] = vsubq_s32(buf0[23], buf0[20]);
+  buf1[23] = vaddq_s32(buf0[23], buf0[20]);
+  buf1[21] = vsubq_s32(buf0[22], buf0[21]);
+  buf1[22] = vaddq_s32(buf0[22], buf0[21]);
+  buf1[24] = vaddq_s32(buf0[24], buf0[27]);
+  buf1[27] = vsubq_s32(buf0[24], buf0[27]);
+  buf1[25] = vaddq_s32(buf0[25], buf0[26]);
+  buf1[26] = vsubq_s32(buf0[25], buf0[26]);
+  buf1[28] = vsubq_s32(buf0[31], buf0[28]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[28]);
+  buf1[29] = vsubq_s32(buf0[30], buf0[29]);
+  buf1[30] = vaddq_s32(buf0[30], buf0[29]);
+
+  // stage 6
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  btf_32_type1_neon(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+                    v_cos_bit);
+  buf0[8] = vaddq_s32(buf1[8], buf1[9]);
+  buf0[9] = vsubq_s32(buf1[8], buf1[9]);
+  buf0[10] = vsubq_s32(buf1[11], buf1[10]);
+  buf0[11] = vaddq_s32(buf1[11], buf1[10]);
+  buf0[12] = vaddq_s32(buf1[12], buf1[13]);
+  buf0[13] = vsubq_s32(buf1[12], buf1[13]);
+  buf0[14] = vsubq_s32(buf1[15], buf1[14]);
+  buf0[15] = vaddq_s32(buf1[15], buf1[14]);
+  buf0[16] = buf1[16];
+  btf_32_neon_mode0(cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], buf0[30],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[56], cospi[8], buf1[18], buf1[29], buf0[18],
+                     buf0[29], v_cos_bit);
+  buf0[19] = buf1[19];
+  buf0[20] = buf1[20];
+  btf_32_neon_mode0(cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_mode01(cospi[24], cospi[40], buf1[22], buf1[25], buf0[22],
+                     buf0[25], v_cos_bit);
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[27] = buf1[27];
+  buf0[28] = buf1[28];
+  buf0[31] = buf1[31];
+
+  // stage 7
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = buf0[0];
+  buf1[1] = buf0[1];
+  buf1[2] = buf0[2];
+  buf1[3] = buf0[3];
+  buf1[4] = buf0[4];
+  buf1[5] = buf0[5];
+  buf1[6] = buf0[6];
+  buf1[7] = buf0[7];
+
+  btf_32_type1_neon(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  btf_32_type1_neon(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+                    buf1[12], v_cos_bit);
+  buf1[16] = vaddq_s32(buf0[16], buf0[17]);
+  buf1[17] = vsubq_s32(buf0[16], buf0[17]);
+  buf1[18] = vsubq_s32(buf0[19], buf0[18]);
+  buf1[19] = vaddq_s32(buf0[19], buf0[18]);
+  buf1[20] = vaddq_s32(buf0[20], buf0[21]);
+  buf1[21] = vsubq_s32(buf0[20], buf0[21]);
+  buf1[22] = vsubq_s32(buf0[23], buf0[22]);
+  buf1[23] = vaddq_s32(buf0[23], buf0[22]);
+  buf1[24] = vaddq_s32(buf0[24], buf0[25]);
+  buf1[25] = vsubq_s32(buf0[24], buf0[25]);
+  buf1[26] = vsubq_s32(buf0[27], buf0[26]);
+  buf1[27] = vaddq_s32(buf0[27], buf0[26]);
+  buf1[28] = vaddq_s32(buf0[28], buf0[29]);
+  buf1[29] = vsubq_s32(buf0[28], buf0[29]);
+  buf1[30] = vsubq_s32(buf0[31], buf0[30]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[30]);
+
+  // stage 8
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  buf0[4] = buf1[4];
+  buf0[5] = buf1[5];
+  buf0[6] = buf1[6];
+  buf0[7] = buf1[7];
+  buf0[8] = buf1[8];
+  buf0[9] = buf1[9];
+  buf0[10] = buf1[10];
+  buf0[11] = buf1[11];
+  buf0[12] = buf1[12];
+  buf0[13] = buf1[13];
+  buf0[14] = buf1[14];
+  buf0[15] = buf1[15];
+
+  btf_32_type1_neon(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+                    buf0[30], v_cos_bit);
+  btf_32_type1_neon(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+  btf_32_type1_neon(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+                    buf0[28], v_cos_bit);
+  btf_32_type1_neon(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_type1_neon(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_type1_neon(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+  btf_32_type1_neon(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
+                    v_cos_bit);
+
+  startidx = 0 * stride;
+  endidx = 31 * stride;
+  // stage 9
+  output[startidx] = buf0[0];
+  output[endidx] = buf0[31];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[16];
+  output[endidx] = buf0[15];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[8];
+  output[endidx] = buf0[23];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[24];
+  output[endidx] = buf0[7];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[4];
+  output[endidx] = buf0[27];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[20];
+  output[endidx] = buf0[11];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[12];
+  output[endidx] = buf0[19];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[28];
+  output[endidx] = buf0[3];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[2];
+  output[endidx] = buf0[29];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[18];
+  output[endidx] = buf0[13];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[10];
+  output[endidx] = buf0[21];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[26];
+  output[endidx] = buf0[5];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[6];
+  output[endidx] = buf0[25];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[22];
+  output[endidx] = buf0[9];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[14];
+  output[endidx] = buf0[17];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[30];
+  output[endidx] = buf0[1];
+}
+
+static void av1_fdct64_new_stage1234_neon(int32x4_t *input, const int instride,
+                                          int32x4_t *x3, int32x4_t *x4,
+                                          const int32_t *cospi,
+                                          const int32x4_t *v_cos_bit,
+                                          int *startidx, int *endidx) {
+  // stage 1
+  int32x4_t x1[64];
+  x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
+
+  // stage 2
+  int32x4_t x2[64];
+  x2[0] = vaddq_s32(x1[0], x1[31]);
+  x2[31] = vsubq_s32(x1[0], x1[31]);
+  x2[1] = vaddq_s32(x1[1], x1[30]);
+  x2[30] = vsubq_s32(x1[1], x1[30]);
+  x2[2] = vaddq_s32(x1[2], x1[29]);
+  x2[29] = vsubq_s32(x1[2], x1[29]);
+  x2[3] = vaddq_s32(x1[3], x1[28]);
+  x2[28] = vsubq_s32(x1[3], x1[28]);
+  x2[4] = vaddq_s32(x1[4], x1[27]);
+  x2[27] = vsubq_s32(x1[4], x1[27]);
+  x2[5] = vaddq_s32(x1[5], x1[26]);
+  x2[26] = vsubq_s32(x1[5], x1[26]);
+  x2[6] = vaddq_s32(x1[6], x1[25]);
+  x2[25] = vsubq_s32(x1[6], x1[25]);
+  x2[7] = vaddq_s32(x1[7], x1[24]);
+  x2[24] = vsubq_s32(x1[7], x1[24]);
+  x2[8] = vaddq_s32(x1[8], x1[23]);
+  x2[23] = vsubq_s32(x1[8], x1[23]);
+  x2[9] = vaddq_s32(x1[9], x1[22]);
+  x2[22] = vsubq_s32(x1[9], x1[22]);
+  x2[10] = vaddq_s32(x1[10], x1[21]);
+  x2[21] = vsubq_s32(x1[10], x1[21]);
+  x2[11] = vaddq_s32(x1[11], x1[20]);
+  x2[20] = vsubq_s32(x1[11], x1[20]);
+  x2[12] = vaddq_s32(x1[12], x1[19]);
+  x2[19] = vsubq_s32(x1[12], x1[19]);
+  x2[13] = vaddq_s32(x1[13], x1[18]);
+  x2[18] = vsubq_s32(x1[13], x1[18]);
+  x2[14] = vaddq_s32(x1[14], x1[17]);
+  x2[17] = vsubq_s32(x1[14], x1[17]);
+  x2[15] = vaddq_s32(x1[15], x1[16]);
+  x2[16] = vsubq_s32(x1[15], x1[16]);
+
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
+                    *v_cos_bit);
+
+  // stage 3
+  x3[0] = vaddq_s32(x2[0], x2[15]);
+  x3[15] = vsubq_s32(x2[0], x2[15]);
+  x3[1] = vaddq_s32(x2[1], x2[14]);
+  x3[14] = vsubq_s32(x2[1], x2[14]);
+  x3[2] = vaddq_s32(x2[2], x2[13]);
+  x3[13] = vsubq_s32(x2[2], x2[13]);
+  x3[3] = vaddq_s32(x2[3], x2[12]);
+  x3[12] = vsubq_s32(x2[3], x2[12]);
+  x3[4] = vaddq_s32(x2[4], x2[11]);
+  x3[11] = vsubq_s32(x2[4], x2[11]);
+  x3[5] = vaddq_s32(x2[5], x2[10]);
+  x3[10] = vsubq_s32(x2[5], x2[10]);
+  x3[6] = vaddq_s32(x2[6], x2[9]);
+  x3[9] = vsubq_s32(x2[6], x2[9]);
+  x3[7] = vaddq_s32(x2[7], x2[8]);
+  x3[8] = vsubq_s32(x2[7], x2[8]);
+
+  btf_32_neon_mode0(cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
+                    *v_cos_bit);
+
+  x3[32] = vaddq_s32(x1[32], x2[47]);
+  x3[47] = vsubq_s32(x1[32], x2[47]);
+  x3[33] = vaddq_s32(x1[33], x2[46]);
+  x3[46] = vsubq_s32(x1[33], x2[46]);
+  x3[34] = vaddq_s32(x1[34], x2[45]);
+  x3[45] = vsubq_s32(x1[34], x2[45]);
+  x3[35] = vaddq_s32(x1[35], x2[44]);
+  x3[44] = vsubq_s32(x1[35], x2[44]);
+  x3[36] = vaddq_s32(x1[36], x2[43]);
+  x3[43] = vsubq_s32(x1[36], x2[43]);
+  x3[37] = vaddq_s32(x1[37], x2[42]);
+  x3[42] = vsubq_s32(x1[37], x2[42]);
+  x3[38] = vaddq_s32(x1[38], x2[41]);
+  x3[41] = vsubq_s32(x1[38], x2[41]);
+  x3[39] = vaddq_s32(x1[39], x2[40]);
+  x3[40] = vsubq_s32(x1[39], x2[40]);
+  x3[48] = vsubq_s32(x1[63], x2[48]);
+  x3[63] = vaddq_s32(x1[63], x2[48]);
+  x3[49] = vsubq_s32(x1[62], x2[49]);
+  x3[62] = vaddq_s32(x1[62], x2[49]);
+  x3[50] = vsubq_s32(x1[61], x2[50]);
+  x3[61] = vaddq_s32(x1[61], x2[50]);
+  x3[51] = vsubq_s32(x1[60], x2[51]);
+  x3[60] = vaddq_s32(x1[60], x2[51]);
+  x3[52] = vsubq_s32(x1[59], x2[52]);
+  x3[59] = vaddq_s32(x1[59], x2[52]);
+  x3[53] = vsubq_s32(x1[58], x2[53]);
+  x3[58] = vaddq_s32(x1[58], x2[53]);
+  x3[54] = vsubq_s32(x1[57], x2[54]);
+  x3[57] = vaddq_s32(x1[57], x2[54]);
+  x3[55] = vsubq_s32(x1[56], x2[55]);
+  x3[56] = vaddq_s32(x1[56], x2[55]);
+
+  // stage 4
+  x4[0] = vaddq_s32(x3[0], x3[7]);
+  x4[7] = vsubq_s32(x3[0], x3[7]);
+  x4[1] = vaddq_s32(x3[1], x3[6]);
+  x4[6] = vsubq_s32(x3[1], x3[6]);
+  x4[2] = vaddq_s32(x3[2], x3[5]);
+  x4[5] = vsubq_s32(x3[2], x3[5]);
+  x4[3] = vaddq_s32(x3[3], x3[4]);
+  x4[4] = vsubq_s32(x3[3], x3[4]);
+
+  btf_32_neon_mode0(cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
+                    *v_cos_bit);
+
+  x4[16] = vaddq_s32(x2[16], x3[23]);
+  x4[23] = vsubq_s32(x2[16], x3[23]);
+  x4[17] = vaddq_s32(x2[17], x3[22]);
+  x4[22] = vsubq_s32(x2[17], x3[22]);
+  x4[18] = vaddq_s32(x2[18], x3[21]);
+  x4[21] = vsubq_s32(x2[18], x3[21]);
+  x4[19] = vaddq_s32(x2[19], x3[20]);
+  x4[20] = vsubq_s32(x2[19], x3[20]);
+  x4[24] = vsubq_s32(x2[31], x3[24]);
+  x4[31] = vaddq_s32(x2[31], x3[24]);
+  x4[25] = vsubq_s32(x2[30], x3[25]);
+  x4[30] = vaddq_s32(x2[30], x3[25]);
+  x4[26] = vsubq_s32(x2[29], x3[26]);
+  x4[29] = vaddq_s32(x2[29], x3[26]);
+  x4[27] = vsubq_s32(x2[28], x3[27]);
+  x4[28] = vaddq_s32(x2[28], x3[27]);
+
+  btf_32_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
+                    *v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
+                     *v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
+                     *v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
+                     *v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
+                     *v_cos_bit);
+}
+
+static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
+                                int8_t cos_bit, const int instride,
+                                const int outstride,
+                                const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int startidx = 0 * instride;
+  int endidx = 63 * instride;
+
+  // stage 1-2-3-4
+  int32x4_t x3[64], x4[64];
+  av1_fdct64_new_stage1234_neon(input, instride, x3, x4, cospi, &v_cos_bit,
+                                &startidx, &endidx);
+
+  // stage 5
+  int32x4_t x5[64];
+  x5[0] = vaddq_s32(x4[0], x4[3]);
+  x5[3] = vsubq_s32(x4[0], x4[3]);
+  x5[1] = vaddq_s32(x4[1], x4[2]);
+  x5[2] = vsubq_s32(x4[1], x4[2]);
+
+  btf_32_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
+                    v_cos_bit);
+
+  x5[8] = vaddq_s32(x3[8], x4[11]);
+  x5[11] = vsubq_s32(x3[8], x4[11]);
+  x5[9] = vaddq_s32(x3[9], x4[10]);
+  x5[10] = vsubq_s32(x3[9], x4[10]);
+  x5[12] = vsubq_s32(x3[15], x4[12]);
+  x5[15] = vaddq_s32(x3[15], x4[12]);
+  x5[13] = vsubq_s32(x3[14], x4[13]);
+  x5[14] = vaddq_s32(x3[14], x4[13]);
+
+  btf_32_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
+                    v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
+                     v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
+                     v_cos_bit);
+
+  x5[32] = vaddq_s32(x3[32], x4[39]);
+  x5[39] = vsubq_s32(x3[32], x4[39]);
+  x5[33] = vaddq_s32(x3[33], x4[38]);
+  x5[38] = vsubq_s32(x3[33], x4[38]);
+  x5[34] = vaddq_s32(x3[34], x4[37]);
+  x5[37] = vsubq_s32(x3[34], x4[37]);
+  x5[35] = vaddq_s32(x3[35], x4[36]);
+  x5[36] = vsubq_s32(x3[35], x4[36]);
+  x5[40] = vsubq_s32(x3[47], x4[40]);
+  x5[47] = vaddq_s32(x3[47], x4[40]);
+  x5[41] = vsubq_s32(x3[46], x4[41]);
+  x5[46] = vaddq_s32(x3[46], x4[41]);
+  x5[42] = vsubq_s32(x3[45], x4[42]);
+  x5[45] = vaddq_s32(x3[45], x4[42]);
+  x5[43] = vsubq_s32(x3[44], x4[43]);
+  x5[44] = vaddq_s32(x3[44], x4[43]);
+  x5[48] = vaddq_s32(x3[48], x4[55]);
+  x5[55] = vsubq_s32(x3[48], x4[55]);
+  x5[49] = vaddq_s32(x3[49], x4[54]);
+  x5[54] = vsubq_s32(x3[49], x4[54]);
+  x5[50] = vaddq_s32(x3[50], x4[53]);
+  x5[53] = vsubq_s32(x3[50], x4[53]);
+  x5[51] = vaddq_s32(x3[51], x4[52]);
+  x5[52] = vsubq_s32(x3[51], x4[52]);
+  x5[56] = vsubq_s32(x3[63], x4[56]);
+  x5[63] = vaddq_s32(x3[63], x4[56]);
+  x5[57] = vsubq_s32(x3[62], x4[57]);
+  x5[62] = vaddq_s32(x3[62], x4[57]);
+  x5[58] = vsubq_s32(x3[61], x4[58]);
+  x5[61] = vaddq_s32(x3[61], x4[58]);
+  x5[59] = vsubq_s32(x3[60], x4[59]);
+  x5[60] = vaddq_s32(x3[60], x4[59]);
+
+  // stage 6
+  int32x4_t x6[64];
+  btf_32_neon(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1], v_cos_bit);
+  btf_32_type1_neon(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
+                    v_cos_bit);
+  x6[4] = vaddq_s32(x4[4], x5[5]);
+  x6[5] = vsubq_s32(x4[4], x5[5]);
+  x6[6] = vsubq_s32(x4[7], x5[6]);
+  x6[7] = vaddq_s32(x4[7], x5[6]);
+  btf_32_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
+                     v_cos_bit);
+
+  x6[16] = vaddq_s32(x4[16], x5[19]);
+  x6[19] = vsubq_s32(x4[16], x5[19]);
+  x6[17] = vaddq_s32(x4[17], x5[18]);
+  x6[18] = vsubq_s32(x4[17], x5[18]);
+  x6[20] = vsubq_s32(x4[23], x5[20]);
+  x6[23] = vaddq_s32(x4[23], x5[20]);
+  x6[21] = vsubq_s32(x4[22], x5[21]);
+  x6[22] = vaddq_s32(x4[22], x5[21]);
+  x6[24] = vaddq_s32(x4[24], x5[27]);
+  x6[27] = vsubq_s32(x4[24], x5[27]);
+  x6[25] = vaddq_s32(x4[25], x5[26]);
+  x6[26] = vsubq_s32(x4[25], x5[26]);
+  x6[28] = vsubq_s32(x4[31], x5[28]);
+  x6[31] = vaddq_s32(x4[31], x5[28]);
+  x6[29] = vsubq_s32(x4[30], x5[29]);
+  x6[30] = vaddq_s32(x4[30], x5[29]);
+
+  btf_32_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
+                    v_cos_bit);
+  btf_32_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
+                     v_cos_bit);
+  btf_32_neon_mode01(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
+                     v_cos_bit);
+  btf_32_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
+                    v_cos_bit);
+  btf_32_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
+                     v_cos_bit);
+  btf_32_neon_mode01(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
+                     v_cos_bit);
+
+  // stage 7
+  int32x4_t x7[64];
+
+  btf_32_type1_neon(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
+  btf_32_type1_neon(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
+                    v_cos_bit);
+  x7[8] = vaddq_s32(x5[8], x6[9]);
+  x7[9] = vsubq_s32(x5[8], x6[9]);
+  x7[10] = vsubq_s32(x5[11], x6[10]);
+  x7[11] = vaddq_s32(x5[11], x6[10]);
+  x7[12] = vaddq_s32(x5[12], x6[13]);
+  x7[13] = vsubq_s32(x5[12], x6[13]);
+  x7[14] = vsubq_s32(x5[15], x6[14]);
+  x7[15] = vaddq_s32(x5[15], x6[14]);
+
+  btf_32_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
+                     v_cos_bit);
+
+  btf_32_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
+                     v_cos_bit);
+
+  x7[32] = vaddq_s32(x5[32], x6[35]);
+  x7[35] = vsubq_s32(x5[32], x6[35]);
+  x7[33] = vaddq_s32(x5[33], x6[34]);
+  x7[34] = vsubq_s32(x5[33], x6[34]);
+  x7[36] = vsubq_s32(x5[39], x6[36]);
+  x7[39] = vaddq_s32(x5[39], x6[36]);
+  x7[37] = vsubq_s32(x5[38], x6[37]);
+  x7[38] = vaddq_s32(x5[38], x6[37]);
+  x7[40] = vaddq_s32(x5[40], x6[43]);
+  x7[43] = vsubq_s32(x5[40], x6[43]);
+  x7[41] = vaddq_s32(x5[41], x6[42]);
+  x7[42] = vsubq_s32(x5[41], x6[42]);
+  x7[44] = vsubq_s32(x5[47], x6[44]);
+  x7[47] = vaddq_s32(x5[47], x6[44]);
+  x7[45] = vsubq_s32(x5[46], x6[45]);
+  x7[46] = vaddq_s32(x5[46], x6[45]);
+  x7[48] = vaddq_s32(x5[48], x6[51]);
+  x7[51] = vsubq_s32(x5[48], x6[51]);
+  x7[49] = vaddq_s32(x5[49], x6[50]);
+  x7[50] = vsubq_s32(x5[49], x6[50]);
+  x7[52] = vsubq_s32(x5[55], x6[52]);
+  x7[55] = vaddq_s32(x5[55], x6[52]);
+  x7[53] = vsubq_s32(x5[54], x6[53]);
+  x7[54] = vaddq_s32(x5[54], x6[53]);
+  x7[56] = vaddq_s32(x5[56], x6[59]);
+  x7[59] = vsubq_s32(x5[56], x6[59]);
+  x7[57] = vaddq_s32(x5[57], x6[58]);
+  x7[58] = vsubq_s32(x5[57], x6[58]);
+  x7[60] = vsubq_s32(x5[63], x6[60]);
+  x7[63] = vaddq_s32(x5[63], x6[60]);
+  x7[61] = vsubq_s32(x5[62], x6[61]);
+  x7[62] = vaddq_s32(x5[62], x6[61]);
+
+  // stage 8
+  int32x4_t x8[64];
+
+  btf_32_type1_neon(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
+                    v_cos_bit);
+  x8[16] = vaddq_s32(x6[16], x7[17]);
+  x8[17] = vsubq_s32(x6[16], x7[17]);
+  x8[18] = vsubq_s32(x6[19], x7[18]);
+  x8[19] = vaddq_s32(x6[19], x7[18]);
+  x8[20] = vaddq_s32(x6[20], x7[21]);
+  x8[21] = vsubq_s32(x6[20], x7[21]);
+  x8[22] = vsubq_s32(x6[23], x7[22]);
+  x8[23] = vaddq_s32(x6[23], x7[22]);
+  x8[24] = vaddq_s32(x6[24], x7[25]);
+  x8[25] = vsubq_s32(x6[24], x7[25]);
+  x8[26] = vsubq_s32(x6[27], x7[26]);
+  x8[27] = vaddq_s32(x6[27], x7[26]);
+  x8[28] = vaddq_s32(x6[28], x7[29]);
+  x8[29] = vsubq_s32(x6[28], x7[29]);
+  x8[30] = vsubq_s32(x6[31], x7[30]);
+  x8[31] = vaddq_s32(x6[31], x7[30]);
+
+  btf_32_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
+                     v_cos_bit);
+  btf_32_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
+                     v_cos_bit);
+  btf_32_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
+                     v_cos_bit);
+  btf_32_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
+                     v_cos_bit);
+
+  // stage 9
+  int32x4_t x9[64];
+
+  btf_32_type1_neon(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
+                    v_cos_bit);
+  x9[32] = vaddq_s32(x7[32], x8[33]);
+  x9[33] = vsubq_s32(x7[32], x8[33]);
+  x9[34] = vsubq_s32(x7[35], x8[34]);
+  x9[35] = vaddq_s32(x7[35], x8[34]);
+  x9[36] = vaddq_s32(x7[36], x8[37]);
+  x9[37] = vsubq_s32(x7[36], x8[37]);
+  x9[38] = vsubq_s32(x7[39], x8[38]);
+  x9[39] = vaddq_s32(x7[39], x8[38]);
+  x9[40] = vaddq_s32(x7[40], x8[41]);
+  x9[41] = vsubq_s32(x7[40], x8[41]);
+  x9[42] = vsubq_s32(x7[43], x8[42]);
+  x9[43] = vaddq_s32(x7[43], x8[42]);
+  x9[44] = vaddq_s32(x7[44], x8[45]);
+  x9[45] = vsubq_s32(x7[44], x8[45]);
+  x9[46] = vsubq_s32(x7[47], x8[46]);
+  x9[47] = vaddq_s32(x7[47], x8[46]);
+  x9[48] = vaddq_s32(x7[48], x8[49]);
+  x9[49] = vsubq_s32(x7[48], x8[49]);
+  x9[50] = vsubq_s32(x7[51], x8[50]);
+  x9[51] = vaddq_s32(x7[51], x8[50]);
+  x9[52] = vaddq_s32(x7[52], x8[53]);
+  x9[53] = vsubq_s32(x7[52], x8[53]);
+  x9[54] = vsubq_s32(x7[55], x8[54]);
+  x9[55] = vaddq_s32(x7[55], x8[54]);
+  x9[56] = vaddq_s32(x7[56], x8[57]);
+  x9[57] = vsubq_s32(x7[56], x8[57]);
+  x9[58] = vsubq_s32(x7[59], x8[58]);
+  x9[59] = vaddq_s32(x7[59], x8[58]);
+  x9[60] = vaddq_s32(x7[60], x8[61]);
+  x9[61] = vsubq_s32(x7[60], x8[61]);
+  x9[62] = vsubq_s32(x7[63], x8[62]);
+  x9[63] = vaddq_s32(x7[63], x8[62]);
+
+  // stage 10
+  int32x4_t x10[64];
+
+  btf_32_type1_neon(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
+                    v_cos_bit);
+
+  startidx = 0 * outstride;
+  endidx = 63 * outstride;
+  // stage 11
+  output[startidx] = x6[0];
+  output[endidx] = x10[63];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[32];
+  output[endidx] = x9[31];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[16];
+  output[endidx] = x10[47];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[48];
+  output[endidx] = x8[15];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x8[8];
+  output[endidx] = x10[55];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[40];
+  output[endidx] = x9[23];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[24];
+  output[endidx] = x10[39];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[56];
+  output[endidx] = x7[7];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x7[4];
+  output[endidx] = x10[59];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[36];
+  output[endidx] = x9[27];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[20];
+  output[endidx] = x10[43];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[52];
+  output[endidx] = x8[11];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x8[12];
+  output[endidx] = x10[51];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[44];
+  output[endidx] = x9[19];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[28];
+  output[endidx] = x10[35];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[60];
+  output[endidx] = x6[3];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x6[2];
+  output[endidx] = x10[61];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[34];
+  output[endidx] = x9[29];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[18];
+  output[endidx] = x10[45];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[50];
+  output[endidx] = x8[13];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x8[10];
+  output[endidx] = x10[53];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[42];
+  output[endidx] = x9[21];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[26];
+  output[endidx] = x10[37];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[58];
+  output[endidx] = x7[5];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x7[6];
+  output[endidx] = x10[57];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[38];
+  output[endidx] = x9[25];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[22];
+  output[endidx] = x10[41];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[54];
+  output[endidx] = x8[9];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x8[14];
+  output[endidx] = x10[49];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[46];
+  output[endidx] = x9[17];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[30];
+  output[endidx] = x10[33];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[62];
+  output[endidx] = x6[1];
+}
+
+static void av1_lowbd_fwd_txfm2d_64x64_neon(const int16_t *input,
+                                            int32_t *output, int stride,
+                                            TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X64;
+  int16x8_t buf0[64], buf1[512];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    int32x4_t bufA[64];
+    int32x4_t bufB[64];
+    int16x8_t *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
+    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
+    av1_round_shift_array_32_neon(bufA, bufA, 32);
+    av1_round_shift_array_32_neon(bufB, bufB, 32);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < width_div8; ++j) {
+      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+static void av1_lowbd_fwd_txfm2d_64x32_neon(const int16_t *input,
+                                            int32_t *output, int stride,
+                                            TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_64X32;
+  int16x8_t buf0[64], buf1[256];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+  assert(tx_type == DCT_DCT);
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    int32x4_t bufA[64];
+    int32x4_t bufB[64];
+    int16x8_t *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
+    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
+    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
+    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < width_div8; ++j) {
+      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+
+static void av1_lowbd_fwd_txfm2d_32x64_neon(const int16_t *input,
+                                            int32_t *output, int stride,
+                                            TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_32X64;
+  int16x8_t buf0[64], buf1[256];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    int32x4_t bufA[32];
+    int32x4_t bufB[32];
+    int16x8_t *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    av1_fdct32_new_neon(bufA, bufA, cos_bit_row, 1, NULL);
+    av1_fdct32_new_neon(bufB, bufB, cos_bit_row, 1, NULL);
+    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
+    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < (32 / 4); ++j) {
+      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+
+static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
+  av1_lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
+  av1_lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
+  av1_lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
+  av1_lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
+  av1_lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
+  av1_lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
+  av1_lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
+  av1_lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
+  av1_lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
+  av1_lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
+  av1_lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
+  av1_lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
+  av1_lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
+  av1_lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
+  av1_lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
+  av1_lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
+  av1_lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
+  if ((fwd_txfm2d_func == NULL) ||
+      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+  } else {
+    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+                    txfm_param->bd);
+  }
+}
diff --git a/av1/encoder/arm/neon/encodetxb_neon.c b/av1/encoder/arm/neon/encodetxb_neon.c
new file mode 100644
index 0000000..4ededd8
--- /dev/null
+++ b/av1/encoder/arm/neon/encodetxb_neon.c
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/common/arm/mem_neon.h"
+
+void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * height, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+  const int32x4_t zeros = vdupq_n_s32(0);
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const int32x4_t coeffA = vld1q_s32(cf);
+      const int32x4_t coeffB = vld1q_s32(cf + width);
+      const int16x8_t coeffAB =
+          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+      const int16x8_t absAB = vqabsq_s16(coeffAB);
+      const int8x8_t absABs = vqmovn_s16(absAB);
+#if defined(__aarch64__)
+      const int8x16_t absAB8 =
+          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
+      const uint8x16_t lsAB =
+          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
+#else
+      const int32x2x2_t absAB8 =
+          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
+      const uint8x16_t lsAB =
+          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
+#endif
+      vst1q_u8(ls, lsAB);
+      ls += (stride << 1);
+      cf += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const int32x4_t coeffA = vld1q_s32(cf);
+      const int32x4_t coeffB = vld1q_s32(cf + 4);
+      const int16x8_t coeffAB =
+          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+      const int16x8_t absAB = vqabsq_s16(coeffAB);
+      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
+          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
+      vst1q_u8(ls, absAB8);
+      ls += stride;
+      cf += width;
+      i += 1;
+    } while (i < height);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const int32x4_t coeffA = vld1q_s32(cf);
+        const int32x4_t coeffB = vld1q_s32(cf + 4);
+        const int32x4_t coeffC = vld1q_s32(cf + 8);
+        const int32x4_t coeffD = vld1q_s32(cf + 12);
+        const int16x8_t coeffAB =
+            vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+        const int16x8_t coeffCD =
+            vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD));
+        const int16x8_t absAB = vqabsq_s16(coeffAB);
+        const int16x8_t absCD = vqabsq_s16(coeffCD);
+        const uint8x16_t absABCD = vreinterpretq_u8_s8(
+            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
+        vst1q_u8((ls + j), absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < width);
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
+
+// get_4_nz_map_contexts_2d coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
+  { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
+  { 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_hor coefficients:
+/* clang-format off */
+#define SIG_COEF_CONTEXTS_2D_X4_051010                        \
+  (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
+  ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
+/* clang-format on */
+
+// get_4_nz_map_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_ver[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
+  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_8_coeff_contexts_2d coefficients:
+// if (height == 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
+  { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+// if (height < 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
+  { 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21 },
+  { 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21 }
+};
+
+// if (height > 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
+  { 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_hor[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_16n_coeff_contexts_2d coefficients:
+// real_width == real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
+  { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width > real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
+  { 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width < real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
+  { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_16n_coeff_contexts_hor coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_hor[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// end of coefficients declaration area
+
+static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+#ifdef __aarch64__
+  uint32x4_t v_data = vld1q_u32((uint32_t *)src);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);
+
+  return vreinterpretq_u8_u32(v_data);
+#else
+  return load_unaligned_u8q(src, byte_stride);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+#ifdef __aarch64__
+  uint64x2_t v_data = vld1q_u64((uint64_t *)src);
+  v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
+
+  return vreinterpretq_u8_u64(v_data);
+#else
+  uint8x8_t v_data_low = vld1_u8(src);
+  uint8x8_t v_data_high = vld1_u8(src + byte_stride);
+
+  return vcombine_u8(v_data_low, v_data_high);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
+                                                 const int byte_stride) {
+  (void)byte_stride;
+  return vld1q_u8(src);
+}
+
+static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
+                                     const ptrdiff_t *const offsets,
+                                     uint8x16_t *const level) {
+  level[0] = load_8bit_4x4_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
+                                     const ptrdiff_t *const offsets,
+                                     uint8x16_t *const level) {
+  level[0] = load_8bit_8x2_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_16x1x5(const uint8_t *const src,
+                                      const int stride,
+                                      const ptrdiff_t *const offsets,
+                                      uint8x16_t *const level) {
+  level[0] = load_8bit_16x1_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
+  const uint8x16_t const_3 = vdupq_n_u8(3);
+  const uint8x16_t const_4 = vdupq_n_u8(4);
+  uint8x16_t count;
+
+  count = vminq_u8(level[0], const_3);
+  level[1] = vminq_u8(level[1], const_3);
+  level[2] = vminq_u8(level[2], const_3);
+  level[3] = vminq_u8(level[3], const_3);
+  level[4] = vminq_u8(level[4], const_3);
+  count = vaddq_u8(count, level[1]);
+  count = vaddq_u8(count, level[2]);
+  count = vaddq_u8(count, level[3]);
+  count = vaddq_u8(count, level[4]);
+
+  count = vrshrq_n_u8(count, 1);
+  count = vminq_u8(count, const_4);
+  return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *const coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
+
+  uint8x16_t pos_to_offset =
+      vld1q_u8((height == 4) ? c_4_po_2d[0] : c_4_po_2d[1]);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+  uint8_t *cc = coeff_contexts;
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(cc, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 4 * stride;
+    cc += 16;
+    row -= 4;
+  } while (row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+                                             const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset =
+      vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    levels += 4 * stride;
+    coeff_contexts += 16;
+    row -= 4;
+  } while (row);
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+                                             const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t pos_to_offset = vld1q_u8(c_4_po_ver);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 4 * stride;
+    coeff_contexts += 16;
+    row -= 4;
+  } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+                                           const int height,
+                                           const ptrdiff_t *const offsets,
+                                           uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  uint8_t *cc = coeff_contexts;
+  uint8x16_t count;
+  uint8x16_t level[5];
+  uint8x16_t pos_to_offset[3];
+
+  assert(!(height % 2));
+
+  if (height == 8) {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
+  } else if (height < 8) {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
+  } else {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
+  }
+  pos_to_offset[2] = vdupq_n_u8(21);
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset[0]);
+    vst1q_u8(cc, count);
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += 2 * stride;
+    cc += 16;
+    row -= 2;
+  } while (row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_hor);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 2));
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
+                                         vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 2));
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+                                             const int real_width,
+                                             const int real_height,
+                                             const int width, const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  uint8_t *cc = coeff_contexts;
+  int row = height;
+  uint8x16_t pos_to_offset[5];
+  uint8x16_t pos_to_offset_large[3];
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  pos_to_offset_large[2] = vdupq_n_u8(21);
+  if (real_width == real_height) {
+    pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
+    pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
+    pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
+    pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
+    pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+        pos_to_offset_large[2];
+  } else if (real_width > real_height) {
+    pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
+    pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
+    pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
+        vld1q_u8(c_16_po_2d_g[2]);
+    pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+  } else {  // real_width < real_height
+    pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
+    pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
+    pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
+    pos_to_offset[4] = pos_to_offset_large[2];
+    pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(11);
+  }
+
+  do {
+    int w = width;
+
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset[0]);
+      vst1q_u8(cc, count);
+      levels += 16;
+      cc += 16;
+      w -= 16;
+      pos_to_offset[0] = pos_to_offset_large[0];
+    } while (w);
+
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    pos_to_offset[2] = pos_to_offset[3];
+    pos_to_offset[3] = pos_to_offset[4];
+    pos_to_offset_large[0] = pos_to_offset_large[1];
+    pos_to_offset_large[1] = pos_to_offset_large[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  int row = height;
+  do {
+    uint8x16_t pos_to_offset = vld1q_u8(c_16_po_hor);
+
+    int w = width;
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset);
+      vst1q_u8(coeff_contexts, count);
+      pos_to_offset = pos_to_offset_large;
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+
+  uint8x16_t pos_to_offset[3];
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
+  pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
+  pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  int row = height;
+  do {
+    int w = width;
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset[0]);
+      vst1q_u8(coeff_contexts, count);
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
+                                  const int16_t *const scan, const uint16_t eob,
+                                  const TX_SIZE tx_size,
+                                  const TX_CLASS tx_class,
+                                  int8_t *const coeff_contexts) {
+  const int last_idx = eob - 1;
+  if (!last_idx) {
+    coeff_contexts[0] = 0;
+    return;
+  }
+
+  uint8_t *const coefficients = (uint8_t *const)coeff_contexts;
+
+  const int real_width = tx_size_wide[tx_size];
+  const int real_height = tx_size_high[tx_size];
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const int stride = width + TX_PAD_HOR;
+  ptrdiff_t offsets[3];
+
+  /* coeff_contexts must be 16 byte aligned. */
+  assert(!((intptr_t)coeff_contexts & 0xf));
+
+  if (tx_class == TX_CLASS_2D) {
+    offsets[0] = 0 * stride + 2;
+    offsets[1] = 1 * stride + 1;
+    offsets[2] = 2 * stride + 0;
+
+    if (width == 4) {
+      get_4_nz_map_contexts_2d(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_2d(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coefficients);
+    }
+  } else if (tx_class == TX_CLASS_HORIZ) {
+    offsets[0] = 2;
+    offsets[1] = 3;
+    offsets[2] = 4;
+    if (width == 4) {
+      get_4_nz_map_contexts_hor(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_hor(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
+    }
+  } else {  // TX_CLASS_VERT
+    offsets[0] = 2 * stride;
+    offsets[1] = 3 * stride;
+    offsets[2] = 4 * stride;
+    if (width == 4) {
+      get_4_nz_map_contexts_ver(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_ver(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
+    }
+  }
+
+  const int bwl = get_txb_bwl(tx_size);
+  const int pos = scan[last_idx];
+  if (last_idx <= (height << bwl) / 8)
+    coeff_contexts[pos] = 1;
+  else if (last_idx <= (height << bwl) / 4)
+    coeff_contexts[pos] = 2;
+  else
+    coeff_contexts[pos] = 3;
+}
diff --git a/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
new file mode 100644
index 0000000..e17cd90
--- /dev/null
+++ b/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
@@ -0,0 +1,4035 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_config.h"
+
+static INLINE int32x4_t half_btf_neon(const int32_t *w0, const int32x4_t *n0,
+                                      const int32_t *w1, const int32x4_t *n1,
+                                      const int32x4_t v_bit) {
+  int32x4_t x;
+  x = vmulq_n_s32(*n0, *w0);
+  x = vmlaq_n_s32(x, *n1, *w1);
+  x = vrshlq_s32(x, v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_neon_m(const int32_t *w0, const int32x4_t *n0,
+                                        const int32_t *w1, const int32x4_t *n1,
+                                        const int32x4_t v_bit) {
+  int32x4_t x;
+  x = vmulq_n_s32(*n0, *w0);
+  x = vmlsq_n_s32(x, *n1, *w1);
+  x = vrshlq_s32(x, v_bit);
+  return x;
+}
+
+#if defined(__aarch64__)
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)         \
+  do {                                                        \
+    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                 \
+    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                \
+    y0 = vreinterpretq_s32_s64(                               \
+        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
+                   vreinterpretq_s64_s32(swap_high.val[0]))); \
+    y1 = vreinterpretq_s32_s64(                               \
+        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
+                   vreinterpretq_s64_s32(swap_high.val[1]))); \
+    y2 = vreinterpretq_s32_s64(                               \
+        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
+                   vreinterpretq_s64_s32(swap_high.val[0]))); \
+    y3 = vreinterpretq_s32_s64(                               \
+        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
+                   vreinterpretq_s64_s32(swap_high.val[1]))); \
+  } while (0)
+#else
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)                    \
+  do {                                                                   \
+    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                            \
+    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                           \
+    y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2),       \
+                   swap_high.val[0], 2);                                 \
+    y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2),       \
+                   swap_high.val[1], 2);                                 \
+    y2 = vextq_s32(swap_low.val[0],                                      \
+                   vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
+    y3 = vextq_s32(swap_low.val[1],                                      \
+                   vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
+  } while (0)
+#endif  // (__aarch64__)
+
+static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+                out[15]);
+}
+
+static INLINE void transpose_16x16(const int32x4_t *in, int32x4_t *out) {
+  // Upper left 8x8
+  TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+  TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+                out[28]);
+  TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+                out[13]);
+  TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+                out[29]);
+
+  // Upper right 8x8
+  TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+                out[44]);
+  TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+                out[60]);
+  TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+                out[45]);
+  TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+                out[61]);
+
+  // Lower left 8x8
+  TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+                out[14]);
+  TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+                out[30]);
+  TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+                out[15]);
+  TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+                out[31]);
+  // Lower right 8x8
+  TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+                out[46]);
+  TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+                out[62]);
+  TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+                out[47]);
+  TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+                out[63]);
+}
+
+static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
+                                                      int32x4_t *output,
+                                                      const int size,
+                                                      const int bit,
+                                                      const int val) {
+  const int32x4_t sqrt2 = vdupq_n_s32(val);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  int i;
+  for (i = 0; i < size; i++) {
+    const int32x4_t r0 = vrshlq_s32(input[i], v_bit);
+    const int32x4_t r1 = vmulq_s32(sqrt2, r0);
+    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+  }
+}
+
+#define btf_32_neon_type0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                             \
+    out0 = vmulq_n_s32(in0, w0);                                   \
+    out0 = vmlaq_n_s32(out0, in1, w1);                             \
+    out0 = vrshlq_s32(out0, v_cos_bit);                            \
+    out1 = vmulq_n_s32(in0, w1);                                   \
+    out1 = vmlsq_n_s32(out1, in1, w0);                             \
+    out1 = vrshlq_s32(out1, v_cos_bit);                            \
+  } while (0)
+
+#define btf_32_neon_type1(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                       \
+    btf_32_neon_type0(w1, w0, in1, in0, out0, out1, bit);    \
+  } while (0)
+
+static INLINE void load_buffer_4x4(const int16_t *input, int32x4_t *in,
+                                   int stride, int flipud, int fliplr,
+                                   const int32x4_t *v_shift) {
+  int16x4_t v0, v1, v2, v3;
+
+  if (!flipud) {
+    v0 = vld1_s16(input + 0 * stride);
+    v1 = vld1_s16(input + 1 * stride);
+    v2 = vld1_s16(input + 2 * stride);
+    v3 = vld1_s16(input + 3 * stride);
+  } else {
+    v0 = vld1_s16(input + 3 * stride);
+    v1 = vld1_s16(input + 2 * stride);
+    v2 = vld1_s16(input + 1 * stride);
+    v3 = vld1_s16(input + 0 * stride);
+  }
+
+  if (fliplr) {
+    v0 = vrev64_s16(v0);
+    v1 = vrev64_s16(v1);
+    v2 = vrev64_s16(v2);
+    v3 = vrev64_s16(v3);
+  }
+  in[0] = vshlq_s32(vmovl_s16(v0), *v_shift);
+  in[1] = vshlq_s32(vmovl_s16(v1), *v_shift);
+  in[2] = vshlq_s32(vmovl_s16(v2), *v_shift);
+  in[3] = vshlq_s32(vmovl_s16(v3), *v_shift);
+}
+
+static void fdct4x4_neon(int32x4_t *in, int32x4_t *out, int bit,
+                         const int num_col) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t cospi32 = vdupq_n_s32(cospi[32]);
+  const int32x4_t cospi48 = vdupq_n_s32(cospi[48]);
+  const int32x4_t cospi16 = vdupq_n_s32(cospi[16]);
+  int32x4_t s0, s1, s2, s3;
+  int32x4_t u0, u1, u2, u3;
+  int32x4_t v0, v2;
+
+  int endidx = 3 * num_col;
+  s0 = vaddq_s32(in[0], in[endidx]);
+  s3 = vsubq_s32(in[0], in[endidx]);
+  endidx -= num_col;
+  s1 = vaddq_s32(in[num_col], in[endidx]);
+  s2 = vsubq_s32(in[num_col], in[endidx]);
+
+  u0 = vmulq_s32(s0, cospi32);
+  u1 = vmulq_s32(s1, cospi32);
+  u2 = vaddq_s32(u0, u1);
+  v0 = vsubq_s32(u0, u1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  u0 = vrshlq_s32(u2, v_bit);
+  u2 = vrshlq_s32(v0, v_bit);
+
+  v0 = vmulq_s32(s2, cospi48);
+  v2 = vmlaq_s32(v0, s3, cospi16);
+
+  u1 = vrshlq_s32(v2, v_bit);
+
+  v0 = vmulq_s32(s3, cospi48);
+  v2 = vmlsq_s32(v0, s2, cospi16);
+
+  u3 = vrshlq_s32(v2, v_bit);
+
+  TRANSPOSE_4X4(u0, u1, u2, u3, out[0], out[1], out[2], out[3]);
+}
+
+static INLINE void write_buffer_4x4(int32x4_t *res, int32_t *output) {
+  vst1q_s32((output + 0 * 4), res[0]);
+  vst1q_s32((output + 1 * 4), res[1]);
+  vst1q_s32((output + 2 * 4), res[2]);
+  vst1q_s32((output + 3 * 4), res[3]);
+}
+
+static void fadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit,
+                          const int num_col) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  const int32x4_t sinpi4x = vld1q_s32(&sinpi[1]);
+
+  const int32x4_t sinpi1 = vdupq_lane_s32(vget_low_s32(sinpi4x), 0);
+  const int32x4_t sinpi2 = vdupq_lane_s32(vget_low_s32(sinpi4x), 1);
+  const int32x4_t sinpi3 = vdupq_lane_s32(vget_high_s32(sinpi4x), 0);
+  const int32x4_t sinpi4 = vdupq_lane_s32(vget_high_s32(sinpi4x), 1);
+  int32x4_t t;
+  int32x4_t s0, s1, s2, s3, s7;
+  int32x4_t x0, x1, x2, x3;
+  int32x4_t u0, u1, u2, u3;
+
+  int idx = 0 * num_col;
+  s0 = vmulq_s32(in[idx], sinpi1);
+  s1 = vmulq_s32(in[idx], sinpi4);
+  t = vaddq_s32(in[idx], in[idx + num_col]);
+  idx += 2 * num_col;
+  x3 = vmulq_s32(in[idx], sinpi3);
+  idx += num_col;
+  s7 = vsubq_s32(t, in[idx]);
+
+  t = vmlaq_s32(s0, in[idx - 2 * num_col], sinpi2);
+  x0 = vmlaq_s32(t, in[idx], sinpi4);
+  x1 = vmulq_s32(s7, sinpi3);
+  t = vmlsq_s32(s1, in[idx - 2 * num_col], sinpi1);
+  x2 = vmlaq_s32(t, in[idx], sinpi2);
+
+  s0 = vaddq_s32(x0, x3);
+  s1 = x1;
+  s2 = vsubq_s32(x2, x3);
+  t = vsubq_s32(x2, x0);
+  s3 = vaddq_s32(t, x3);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  u0 = vrshlq_s32(s0, v_bit);
+  u1 = vrshlq_s32(s1, v_bit);
+  u2 = vrshlq_s32(s2, v_bit);
+  u3 = vrshlq_s32(s3, v_bit);
+
+  TRANSPOSE_4X4(u0, u1, u2, u3, out[0], out[1], out[2], out[3]);
+}
+static void idtx4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+  (void)bit;
+  int32x4_t fact = vdupq_n_s32(NewSqrt2);
+  int32x4_t a_low;
+
+  int i;
+  for (i = 0; i < 4; i++) {
+    a_low = vmulq_s32(in[i * col_num], fact);
+    out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
+  }
+
+  TRANSPOSE_4X4(out[0], out[1], out[2], out[3], out[0], out[1], out[2], out[3]);
+}
+void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
+                             int input_stride, TX_TYPE tx_type, int bd) {
+  int32x4_t in[4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 1, 1, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case IDTX:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    default: assert(0);
+  }
+  (void)bd;
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, int32x4_t *in,
+                                   int stride, int flipud, int fliplr,
+                                   const int shift) {
+  if (!flipud) {
+    in[0] = vreinterpretq_s32_s16(vld1q_s16((input + 0 * stride)));
+    in[1] = vreinterpretq_s32_s16(vld1q_s16((input + 1 * stride)));
+    in[2] = vreinterpretq_s32_s16(vld1q_s16((input + 2 * stride)));
+    in[3] = vreinterpretq_s32_s16(vld1q_s16((input + 3 * stride)));
+    in[4] = vreinterpretq_s32_s16(vld1q_s16((input + 4 * stride)));
+    in[5] = vreinterpretq_s32_s16(vld1q_s16((input + 5 * stride)));
+    in[6] = vreinterpretq_s32_s16(vld1q_s16((input + 6 * stride)));
+    in[7] = vreinterpretq_s32_s16(vld1q_s16((input + 7 * stride)));
+  } else {
+    in[0] = vreinterpretq_s32_s16(vld1q_s16((input + 7 * stride)));
+    in[1] = vreinterpretq_s32_s16(vld1q_s16((input + 6 * stride)));
+    in[2] = vreinterpretq_s32_s16(vld1q_s16((input + 5 * stride)));
+    in[3] = vreinterpretq_s32_s16(vld1q_s16((input + 4 * stride)));
+    in[4] = vreinterpretq_s32_s16(vld1q_s16((input + 3 * stride)));
+    in[5] = vreinterpretq_s32_s16(vld1q_s16((input + 2 * stride)));
+    in[6] = vreinterpretq_s32_s16(vld1q_s16((input + 1 * stride)));
+    in[7] = vreinterpretq_s32_s16(vld1q_s16((input + 0 * stride)));
+  }
+
+  if (fliplr) {
+    in[0] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[0])));
+    in[0] = vextq_s32(in[0], in[0], 2);
+    in[1] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[1])));
+    in[1] = vextq_s32(in[1], in[1], 2);
+    in[2] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[2])));
+    in[2] = vextq_s32(in[2], in[2], 2);
+    in[3] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[3])));
+    in[3] = vextq_s32(in[3], in[3], 2);
+    in[4] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[4])));
+    in[4] = vextq_s32(in[4], in[4], 2);
+    in[5] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[5])));
+    in[5] = vextq_s32(in[5], in[5], 2);
+    in[6] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[6])));
+    in[6] = vextq_s32(in[6], in[6], 2);
+    in[7] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[7])));
+    in[7] = vextq_s32(in[7], in[7], 2);
+  }
+
+  int16x4_t u = vget_high_s16(vreinterpretq_s16_s32(in[4]));
+  in[8] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[4])));
+  in[9] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[5]));
+  in[10] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[5])));
+  in[11] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[6]));
+  in[12] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[6])));
+  in[13] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[7]));
+  in[14] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[7])));
+  in[15] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[3]));
+  in[6] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[3])));
+  in[7] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[2]));
+  in[4] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[2])));
+  in[5] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[1]));
+  in[2] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[1])));
+  in[3] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[0]));
+  in[0] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[0])));
+  in[1] = vmovl_s16(u);
+
+  const int32x4_t v_shift = vdupq_n_s32(shift);
+
+  in[0] = vshlq_s32(in[0], v_shift);
+  in[1] = vshlq_s32(in[1], v_shift);
+  in[2] = vshlq_s32(in[2], v_shift);
+  in[3] = vshlq_s32(in[3], v_shift);
+  in[4] = vshlq_s32(in[4], v_shift);
+  in[5] = vshlq_s32(in[5], v_shift);
+  in[6] = vshlq_s32(in[6], v_shift);
+  in[7] = vshlq_s32(in[7], v_shift);
+
+  in[8] = vshlq_s32(in[8], v_shift);
+  in[9] = vshlq_s32(in[9], v_shift);
+  in[10] = vshlq_s32(in[10], v_shift);
+  in[11] = vshlq_s32(in[11], v_shift);
+  in[12] = vshlq_s32(in[12], v_shift);
+  in[13] = vshlq_s32(in[13], v_shift);
+  in[14] = vshlq_s32(in[14], v_shift);
+  in[15] = vshlq_s32(in[15], v_shift);
+}
+
+static INLINE void col_txfm_8x8_rounding(int32x4_t *in,
+                                         const int32x4_t *v_shift) {
+  in[0] = vrshlq_s32(in[0], *v_shift);
+  in[1] = vrshlq_s32(in[1], *v_shift);
+  in[2] = vrshlq_s32(in[2], *v_shift);
+  in[3] = vrshlq_s32(in[3], *v_shift);
+  in[4] = vrshlq_s32(in[4], *v_shift);
+  in[5] = vrshlq_s32(in[5], *v_shift);
+  in[6] = vrshlq_s32(in[6], *v_shift);
+  in[7] = vrshlq_s32(in[7], *v_shift);
+  in[8] = vrshlq_s32(in[8], *v_shift);
+  in[9] = vrshlq_s32(in[9], *v_shift);
+  in[10] = vrshlq_s32(in[10], *v_shift);
+  in[11] = vrshlq_s32(in[11], *v_shift);
+  in[12] = vrshlq_s32(in[12], *v_shift);
+  in[13] = vrshlq_s32(in[13], *v_shift);
+  in[14] = vrshlq_s32(in[14], *v_shift);
+  in[15] = vrshlq_s32(in[15], *v_shift);
+}
+
+static INLINE void col_txfm_4x8_rounding(int32x4_t *in,
+                                         const int32x4_t *v_shift) {
+  in[0] = vrshlq_s32(in[0], *v_shift);
+  in[1] = vrshlq_s32(in[1], *v_shift);
+  in[2] = vrshlq_s32(in[2], *v_shift);
+  in[3] = vrshlq_s32(in[3], *v_shift);
+  in[4] = vrshlq_s32(in[4], *v_shift);
+  in[5] = vrshlq_s32(in[5], *v_shift);
+  in[6] = vrshlq_s32(in[6], *v_shift);
+  in[7] = vrshlq_s32(in[7], *v_shift);
+}
+
+static INLINE void write_buffer_8x8(const int32x4_t *res, int32_t *output) {
+  vst1q_s32(output + 0 * 4, res[0]);
+  vst1q_s32(output + 1 * 4, res[1]);
+  vst1q_s32(output + 2 * 4, res[2]);
+  vst1q_s32(output + 3 * 4, res[3]);
+
+  vst1q_s32(output + 4 * 4, res[4]);
+  vst1q_s32(output + 5 * 4, res[5]);
+  vst1q_s32(output + 6 * 4, res[6]);
+  vst1q_s32(output + 7 * 4, res[7]);
+
+  vst1q_s32(output + 8 * 4, res[8]);
+  vst1q_s32(output + 9 * 4, res[9]);
+  vst1q_s32(output + 10 * 4, res[10]);
+  vst1q_s32(output + 11 * 4, res[11]);
+
+  vst1q_s32(output + 12 * 4, res[12]);
+  vst1q_s32(output + 13 * 4, res[13]);
+  vst1q_s32(output + 14 * 4, res[14]);
+  vst1q_s32(output + 15 * 4, res[15]);
+}
+
+static INLINE void write_buffer_16x8(const int32x4_t *res, int32_t *output,
+                                     const int stride) {
+  vst1q_s32(output, res[0]);
+  vst1q_s32(output + 4, res[1]);
+  vst1q_s32(output + stride, res[2]);
+  vst1q_s32(output + stride + 4, res[3]);
+
+  vst1q_s32(output + (stride * 2), res[4]);
+  vst1q_s32(output + (stride * 2) + 4, res[5]);
+  vst1q_s32(output + (stride * 3), res[6]);
+  vst1q_s32(output + (stride * 3) + 4, res[7]);
+
+  vst1q_s32(output + (stride * 4), res[8]);
+  vst1q_s32(output + (stride * 4) + 4, res[9]);
+  vst1q_s32(output + (stride * 5), res[10]);
+  vst1q_s32(output + (stride * 5) + 4, res[11]);
+
+  vst1q_s32(output + (stride * 6), res[12]);
+  vst1q_s32(output + (stride * 6) + 4, res[13]);
+  vst1q_s32(output + (stride * 7), res[14]);
+  vst1q_s32(output + (stride * 7) + 4, res[15]);
+}
+
+static void fdct4x8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                         const int col_num) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  int32x4_t u[8], v[8];
+
+  int startidx = 0 * col_num;
+  int endidx = 7 * col_num;
+  // stage 0-1
+  u[0] = vaddq_s32(in[startidx], in[endidx]);
+  v[7] = vsubq_s32(in[startidx], in[endidx]);
+  startidx += col_num;
+  endidx -= col_num;
+  u[1] = vaddq_s32(in[startidx], in[endidx]);
+  u[6] = vsubq_s32(in[startidx], in[endidx]);
+  startidx += col_num;
+  endidx -= col_num;
+  u[2] = vaddq_s32(in[startidx], in[endidx]);
+  u[5] = vsubq_s32(in[startidx], in[endidx]);
+  startidx += col_num;
+  endidx -= col_num;
+  u[3] = vaddq_s32(in[startidx], in[endidx]);
+  v[4] = vsubq_s32(in[startidx], in[endidx]);
+
+  // stage 2
+  v[0] = vaddq_s32(u[0], u[3]);
+  v[3] = vsubq_s32(u[0], u[3]);
+  v[1] = vaddq_s32(u[1], u[2]);
+  v[2] = vsubq_s32(u[1], u[2]);
+
+  v[5] = vmulq_n_s32(u[6], cospi[32]);
+  v[5] = vmlsq_n_s32(v[5], u[5], cospi[32]);
+  v[5] = vrshlq_s32(v[5], v_bit);
+
+  u[0] = vmulq_n_s32(u[5], cospi[32]);
+  v[6] = vmlaq_n_s32(u[0], u[6], cospi[32]);
+  v[6] = vrshlq_s32(v[6], v_bit);
+
+  // stage 3
+  // type 0
+  v[0] = vmulq_n_s32(v[0], cospi[32]);
+  v[1] = vmulq_n_s32(v[1], cospi[32]);
+  u[0] = vaddq_s32(v[0], v[1]);
+  u[0] = vrshlq_s32(u[0], v_bit);
+
+  u[1] = vsubq_s32(v[0], v[1]);
+  u[1] = vrshlq_s32(u[1], v_bit);
+
+  // type 1
+  v[0] = vmulq_n_s32(v[2], cospi[48]);
+  u[2] = vmlaq_n_s32(v[0], v[3], cospi[16]);
+  u[2] = vrshlq_s32(u[2], v_bit);
+
+  v[1] = vmulq_n_s32(v[3], cospi[48]);
+  u[3] = vmlsq_n_s32(v[1], v[2], cospi[16]);
+  u[3] = vrshlq_s32(u[3], v_bit);
+
+  u[4] = vaddq_s32(v[4], v[5]);
+  u[5] = vsubq_s32(v[4], v[5]);
+  u[6] = vsubq_s32(v[7], v[6]);
+  u[7] = vaddq_s32(v[7], v[6]);
+
+  // stage 4-5
+  v[0] = vmulq_n_s32(u[4], cospi[56]);
+  v[0] = vmlaq_n_s32(v[0], u[7], cospi[8]);
+  out[1 * col_num] = vrshlq_s32(v[0], v_bit);
+
+  v[1] = vmulq_n_s32(u[7], cospi[56]);
+  v[0] = vmlsq_n_s32(v[1], u[4], cospi[8]);
+  out[7 * col_num] = vrshlq_s32(v[0], v_bit);
+
+  v[0] = vmulq_n_s32(u[5], cospi[24]);
+  v[0] = vmlaq_n_s32(v[0], u[6], cospi[40]);
+  out[5 * col_num] = vrshlq_s32(v[0], v_bit);
+
+  v[1] = vmulq_n_s32(u[6], cospi[24]);
+  v[0] = vmlsq_n_s32(v[1], u[5], cospi[40]);
+  out[3 * col_num] = vrshlq_s32(v[0], v_bit);
+
+  out[0 * col_num] = u[0];
+  out[4 * col_num] = u[1];
+  out[2 * col_num] = u[2];
+  out[6 * col_num] = u[3];
+}
+
+static void fdct8x8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                         const int col_num) {
+  fdct4x8_neon(in, out, bit, col_num);
+  fdct4x8_neon(in + 1, out + 1, bit, col_num);
+}
+
+static void fadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                          const int col_num) {
+  const int32_t *cospi = cospi_arr(bit);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+  int32x4_t x, y;
+  int col;
+
+  for (col = 0; col < col_num; ++col) {
+    // stage 0-1
+    u0 = in[col_num * 0 + col];
+    u1 = vnegq_s32(in[col_num * 7 + col]);
+    u2 = vnegq_s32(in[col_num * 3 + col]);
+    u3 = in[col_num * 4 + col];
+    u4 = vnegq_s32(in[col_num * 1 + col]);
+    u5 = in[col_num * 6 + col];
+    u6 = in[col_num * 2 + col];
+    u7 = vnegq_s32(in[col_num * 5 + col]);
+
+    // stage 2
+    v0 = u0;
+    v1 = u1;
+
+    x = vmulq_n_s32(u2, cospi[32]);
+    y = vmulq_n_s32(u3, cospi[32]);
+    v2 = vaddq_s32(x, y);
+    v2 = vrshlq_s32(v2, v_bit);
+
+    v3 = vsubq_s32(x, y);
+    v3 = vrshlq_s32(v3, v_bit);
+
+    v4 = u4;
+    v5 = u5;
+
+    x = vmulq_n_s32(u6, cospi[32]);
+    y = vmulq_n_s32(u7, cospi[32]);
+    v6 = vaddq_s32(x, y);
+    v6 = vrshlq_s32(v6, v_bit);
+
+    v7 = vsubq_s32(x, y);
+    v7 = vrshlq_s32(v7, v_bit);
+
+    // stage 3
+    u0 = vaddq_s32(v0, v2);
+    u1 = vaddq_s32(v1, v3);
+    u2 = vsubq_s32(v0, v2);
+    u3 = vsubq_s32(v1, v3);
+    u4 = vaddq_s32(v4, v6);
+    u5 = vaddq_s32(v5, v7);
+    u6 = vsubq_s32(v4, v6);
+    u7 = vsubq_s32(v5, v7);
+
+    // stage 4
+    v0 = u0;
+    v1 = u1;
+    v2 = u2;
+    v3 = u3;
+
+    v4 = vmulq_n_s32(u4, cospi[16]);
+    v4 = vmlaq_n_s32(v4, u5, cospi[48]);
+    v4 = vrshlq_s32(v4, v_bit);
+
+    v5 = vmulq_n_s32(u4, cospi[48]);
+    v5 = vmlsq_n_s32(v5, u5, cospi[16]);
+    v5 = vrshlq_s32(v5, v_bit);
+
+    v6 = vmulq_n_s32(u7, cospi[16]);
+    v6 = vmlsq_n_s32(v6, u6, cospi[48]);
+    v6 = vrshlq_s32(v6, v_bit);
+
+    v7 = vmulq_n_s32(u6, cospi[16]);
+    v7 = vmlaq_n_s32(v7, u7, cospi[48]);
+    v7 = vrshlq_s32(v7, v_bit);
+
+    // stage 5
+    u0 = vaddq_s32(v0, v4);
+    u1 = vaddq_s32(v1, v5);
+    u2 = vaddq_s32(v2, v6);
+    u3 = vaddq_s32(v3, v7);
+    u4 = vsubq_s32(v0, v4);
+    u5 = vsubq_s32(v1, v5);
+    u6 = vsubq_s32(v2, v6);
+    u7 = vsubq_s32(v3, v7);
+
+    // stage 6
+    v0 = vmulq_n_s32(u0, cospi[4]);
+    v0 = vmlaq_n_s32(v0, u1, cospi[60]);
+    v0 = vrshlq_s32(v0, v_bit);
+
+    v1 = vmulq_n_s32(u0, cospi[60]);
+    v1 = vmlsq_n_s32(v1, u1, cospi[4]);
+    v1 = vrshlq_s32(v1, v_bit);
+
+    v2 = vmulq_n_s32(u2, cospi[20]);
+    v2 = vmlaq_n_s32(v2, u3, cospi[44]);
+    v2 = vrshlq_s32(v2, v_bit);
+
+    v3 = vmulq_n_s32(u2, cospi[44]);
+    v3 = vmlsq_n_s32(v3, u3, cospi[20]);
+    v3 = vrshlq_s32(v3, v_bit);
+
+    v4 = vmulq_n_s32(u4, cospi[36]);
+    v4 = vmlaq_n_s32(v4, u5, cospi[28]);
+    v4 = vrshlq_s32(v4, v_bit);
+
+    v5 = vmulq_n_s32(u4, cospi[28]);
+    v5 = vmlsq_n_s32(v5, u5, cospi[36]);
+    v5 = vrshlq_s32(v5, v_bit);
+
+    x = vmulq_n_s32(u6, cospi[52]);
+    v6 = vmlaq_n_s32(x, u7, cospi[12]);
+    v6 = vrshlq_s32(v6, v_bit);
+
+    v7 = vmulq_n_s32(u6, cospi[12]);
+    v7 = vmlsq_n_s32(v7, u7, cospi[52]);
+    v7 = vrshlq_s32(v7, v_bit);
+
+    // stage 7
+    out[col_num * 0 + col] = v1;
+    out[col_num * 1 + col] = v6;
+    out[col_num * 2 + col] = v3;
+    out[col_num * 3 + col] = v4;
+    out[col_num * 4 + col] = v5;
+    out[col_num * 5 + col] = v2;
+    out[col_num * 6 + col] = v7;
+    out[col_num * 7 + col] = v0;
+  }
+}
+static void idtx8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+  (void)bit;
+
+  for (int i = 0; i < col_num; i += 1) {
+    out[0 + 8 * i] = vshlq_n_s32(in[0 + 8 * i], 1);
+    out[1 + 8 * i] = vshlq_n_s32(in[1 + 8 * i], 1);
+    out[2 + 8 * i] = vshlq_n_s32(in[2 + 8 * i], 1);
+    out[3 + 8 * i] = vshlq_n_s32(in[3 + 8 * i], 1);
+    out[4 + 8 * i] = vshlq_n_s32(in[4 + 8 * i], 1);
+    out[5 + 8 * i] = vshlq_n_s32(in[5 + 8 * i], 1);
+    out[6 + 8 * i] = vshlq_n_s32(in[6 + 8 * i], 1);
+    out[7 + 8 * i] = vshlq_n_s32(in[7 + 8 * i], 1);
+  }
+}
+static void idtx32x8_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+  (void)bit;
+  (void)col_num;
+  for (int j = 0; j < 2; j++) {
+    out[j + 8 * 0] = vshlq_n_s32(in[j + 8 * 0], 1);
+    out[j + 8 * 1] = vshlq_n_s32(in[j + 8 * 1], 1);
+    out[j + 8 * 2] = vshlq_n_s32(in[j + 8 * 2], 1);
+    out[j + 8 * 3] = vshlq_n_s32(in[j + 8 * 3], 1);
+    out[j + 8 * 4] = vshlq_n_s32(in[j + 8 * 4], 1);
+    out[j + 8 * 5] = vshlq_n_s32(in[j + 8 * 5], 1);
+    out[j + 8 * 6] = vshlq_n_s32(in[j + 8 * 6], 1);
+    out[j + 8 * 7] = vshlq_n_s32(in[j + 8 * 7], 1);
+  }
+}
+void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  int32x4_t in[16], out[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case IDTX:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    default: assert(0);
+  }
+  (void)bd;
+}
+
+// Hybrid Transform 16x16
+
+static INLINE void convert_8x8_to_16x16(const int32x4_t *in, int32x4_t *out) {
+  int row_index = 0;
+  int dst_index = 0;
+  int src_index = 0;
+
+  // row 0, 1, .., 7
+  do {
+    out[dst_index] = in[src_index];
+    out[dst_index + 1] = in[src_index + 1];
+    out[dst_index + 2] = in[src_index + 16];
+    out[dst_index + 3] = in[src_index + 17];
+    dst_index += 4;
+    src_index += 2;
+    row_index += 1;
+  } while (row_index < 8);
+
+  // row 8, 9, ..., 15
+  src_index += 16;
+  do {
+    out[dst_index] = in[src_index];
+    out[dst_index + 1] = in[src_index + 1];
+    out[dst_index + 2] = in[src_index + 16];
+    out[dst_index + 3] = in[src_index + 17];
+    dst_index += 4;
+    src_index += 2;
+    row_index += 1;
+  } while (row_index < 16);
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, int32x4_t *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift) {
+  int32x4_t in[64];
+  // Load 4 8x8 blocks
+  const int16_t *topL = input;
+  const int16_t *topR = input + 8;
+  const int16_t *botL = input + 8 * stride;
+  const int16_t *botR = input + 8 * stride + 8;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    // Swap left columns
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+    // Swap right columns
+    tmp = topR;
+    topR = botR;
+    botR = tmp;
+  }
+
+  if (fliplr) {
+    // Swap top rows
+    tmp = topL;
+    topL = topR;
+    topR = tmp;
+    // Swap bottom rows
+    tmp = botL;
+    botL = botR;
+    botR = tmp;
+  }
+
+  // load first 8 columns
+  load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
+  load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+
+  // load second 8 columns
+  load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
+  load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+
+  convert_8x8_to_16x16(in, out);
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, int32x4_t *out,
+                                    int stride, int flipud, int fliplr,
+                                    int shift) {
+  const int16_t *topL = input;
+  const int16_t *botL = input + 8 * stride;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+  }
+
+  load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
+  load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
+}
+
+static INLINE void load_buffer_8x4(const int16_t *input, int32x4_t *out,
+                                   int stride, int flipud, int fliplr,
+                                   const int32x4_t *v_shift) {
+  const int16_t *topL = input;
+  const int16_t *topR = input + 4;
+
+  const int16_t *tmp;
+
+  if (fliplr) {
+    tmp = topL;
+    topL = topR;
+    topR = tmp;
+  }
+  load_buffer_4x4(topL, out, stride, flipud, fliplr, v_shift);
+  load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, v_shift);
+}
+
+static INLINE void load_buffer_16x4(const int16_t *input, int32x4_t *out,
+                                    int stride, int flipud, int fliplr,
+                                    const int32x4_t *v_shift) {
+  const int16_t *topL = input;
+  const int16_t *topR = input + 8;
+
+  const int16_t *tmp;
+
+  if (fliplr) {
+    tmp = topL;
+    topL = topR;
+    topR = tmp;
+  }
+
+  load_buffer_8x4(topL, out, stride, flipud, fliplr, v_shift);
+  load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, v_shift);
+}
+
+static INLINE void load_buffer_4x8(const int16_t *input, int32x4_t *out,
+                                   int stride, int flipud, int fliplr,
+                                   const int32x4_t *v_shift) {
+  const int16_t *topL = input;
+  const int16_t *botL = input + 4 * stride;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+  }
+
+  load_buffer_4x4(topL, out, stride, flipud, fliplr, v_shift);
+  load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, v_shift);
+}
+
+static INLINE void load_buffer_4x16(const int16_t *input, int32x4_t *out,
+                                    const int stride, const int flipud,
+                                    const int fliplr,
+                                    const int32x4_t *v_shift) {
+  const int16_t *topL = input;
+  const int16_t *botL = input + 8 * stride;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+  }
+  load_buffer_4x8(topL, out, stride, flipud, fliplr, v_shift);
+  load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, v_shift);
+}
+
+static INLINE void load_buffer_32x8n(const int16_t *input, int32x4_t *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift, const int height) {
+  const int16_t *in = input;
+  int32x4_t *output = out;
+  for (int col = 0; col < height; col++) {
+    in = input + col * stride;
+    output = out + col * 8;
+    int32x4_t v_shift = vdupq_n_s32(shift);
+    load_buffer_4x4(in, output, 4, flipud, fliplr, &v_shift);
+    load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, &v_shift);
+  }
+}
+
+static void fdct16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                           const int col_num) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  int32x4_t u[16], v[16];
+  int col;
+
+  // Calculate the column 0, 1, 2, 3
+  for (col = 0; col < col_num; ++col) {
+    // stage 0
+    // stage 1
+    u[0] = vaddq_s32(in[0 * col_num + col], in[15 * col_num + col]);
+    u[15] = vsubq_s32(in[0 * col_num + col], in[15 * col_num + col]);
+    u[1] = vaddq_s32(in[1 * col_num + col], in[14 * col_num + col]);
+    u[14] = vsubq_s32(in[1 * col_num + col], in[14 * col_num + col]);
+    u[2] = vaddq_s32(in[2 * col_num + col], in[13 * col_num + col]);
+    u[13] = vsubq_s32(in[2 * col_num + col], in[13 * col_num + col]);
+    u[3] = vaddq_s32(in[3 * col_num + col], in[12 * col_num + col]);
+    u[12] = vsubq_s32(in[3 * col_num + col], in[12 * col_num + col]);
+    u[4] = vaddq_s32(in[4 * col_num + col], in[11 * col_num + col]);
+    u[11] = vsubq_s32(in[4 * col_num + col], in[11 * col_num + col]);
+    u[5] = vaddq_s32(in[5 * col_num + col], in[10 * col_num + col]);
+    u[10] = vsubq_s32(in[5 * col_num + col], in[10 * col_num + col]);
+    u[6] = vaddq_s32(in[6 * col_num + col], in[9 * col_num + col]);
+    u[9] = vsubq_s32(in[6 * col_num + col], in[9 * col_num + col]);
+    u[7] = vaddq_s32(in[7 * col_num + col], in[8 * col_num + col]);
+    u[8] = vsubq_s32(in[7 * col_num + col], in[8 * col_num + col]);
+
+    // stage 2
+    v[0] = vaddq_s32(u[0], u[7]);
+    v[7] = vsubq_s32(u[0], u[7]);
+    v[1] = vaddq_s32(u[1], u[6]);
+    v[6] = vsubq_s32(u[1], u[6]);
+    v[2] = vaddq_s32(u[2], u[5]);
+    v[5] = vsubq_s32(u[2], u[5]);
+    v[3] = vaddq_s32(u[3], u[4]);
+    v[4] = vsubq_s32(u[3], u[4]);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    v[10] = vmulq_n_s32(u[13], cospi[32]);
+    v[10] = vmlsq_n_s32(v[10], u[10], cospi[32]);
+    v[10] = vrshlq_s32(v[10], v_bit);
+
+    v[13] = vmulq_n_s32(u[10], cospi[32]);
+    v[13] = vmlaq_n_s32(v[13], u[13], cospi[32]);
+    v[13] = vrshlq_s32(v[13], v_bit);
+
+    v[11] = vmulq_n_s32(u[12], cospi[32]);
+    v[11] = vmlsq_n_s32(v[11], u[11], cospi[32]);
+    v[11] = vrshlq_s32(v[11], v_bit);
+
+    v[12] = vmulq_n_s32(u[11], cospi[32]);
+    v[12] = vmlaq_n_s32(v[12], u[12], cospi[32]);
+    v[12] = vrshlq_s32(v[12], v_bit);
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 3
+    u[0] = vaddq_s32(v[0], v[3]);
+    u[3] = vsubq_s32(v[0], v[3]);
+    u[1] = vaddq_s32(v[1], v[2]);
+    u[2] = vsubq_s32(v[1], v[2]);
+    u[4] = v[4];
+
+    u[5] = vmulq_n_s32(v[6], cospi[32]);
+    u[5] = vmlsq_n_s32(u[5], v[5], cospi[32]);
+    u[5] = vrshlq_s32(u[5], v_bit);
+
+    u[6] = vmulq_n_s32(v[5], cospi[32]);
+    u[6] = vmlaq_n_s32(u[6], v[6], cospi[32]);
+    u[6] = vrshlq_s32(u[6], v_bit);
+
+    u[7] = v[7];
+    u[8] = vaddq_s32(v[8], v[11]);
+    u[11] = vsubq_s32(v[8], v[11]);
+    u[9] = vaddq_s32(v[9], v[10]);
+    u[10] = vsubq_s32(v[9], v[10]);
+    u[12] = vsubq_s32(v[15], v[12]);
+    u[15] = vaddq_s32(v[15], v[12]);
+    u[13] = vsubq_s32(v[14], v[13]);
+    u[14] = vaddq_s32(v[14], v[13]);
+
+    // stage 4
+    u[0] = vmulq_n_s32(u[0], cospi[32]);
+    u[1] = vmulq_n_s32(u[1], cospi[32]);
+    v[0] = vaddq_s32(u[0], u[1]);
+    v[0] = vrshlq_s32(v[0], v_bit);
+
+    v[1] = vsubq_s32(u[0], u[1]);
+    v[1] = vrshlq_s32(v[1], v_bit);
+
+    v[2] = vmulq_n_s32(u[2], cospi[48]);
+    v[2] = vmlaq_n_s32(v[2], u[3], cospi[16]);
+    v[2] = vrshlq_s32(v[2], v_bit);
+
+    v[3] = vmulq_n_s32(u[3], cospi[48]);
+    v[3] = vmlsq_n_s32(v[3], u[2], cospi[16]);
+    v[3] = vrshlq_s32(v[3], v_bit);
+
+    v[4] = vaddq_s32(u[4], u[5]);
+    v[5] = vsubq_s32(u[4], u[5]);
+    v[6] = vsubq_s32(u[7], u[6]);
+    v[7] = vaddq_s32(u[7], u[6]);
+    v[8] = u[8];
+
+    v[9] = vmulq_n_s32(u[14], cospi[48]);
+    v[9] = vmlsq_n_s32(v[9], u[9], cospi[16]);
+    v[9] = vrshlq_s32(v[9], v_bit);
+
+    v[14] = vmulq_n_s32(u[9], cospi[48]);
+    v[14] = vmlaq_n_s32(v[14], u[14], cospi[16]);
+    v[14] = vrshlq_s32(v[14], v_bit);
+
+    v[10] = vmulq_n_s32(u[13], -cospi[16]);
+    v[10] = vmlsq_n_s32(v[10], u[10], cospi[48]);
+    v[10] = vrshlq_s32(v[10], v_bit);
+
+    v[13] = vmulq_n_s32(u[10], -cospi[16]);
+    v[13] = vmlaq_n_s32(v[13], u[13], cospi[48]);
+    v[13] = vrshlq_s32(v[13], v_bit);
+
+    v[11] = u[11];
+    v[12] = u[12];
+    v[15] = u[15];
+
+    // stage 5
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+
+    u[4] = vmulq_n_s32(v[4], cospi[56]);
+    u[4] = vmlaq_n_s32(u[4], v[7], cospi[8]);
+    u[4] = vrshlq_s32(u[4], v_bit);
+
+    u[7] = vmulq_n_s32(v[7], cospi[56]);
+    u[7] = vmlsq_n_s32(u[7], v[4], cospi[8]);
+    u[7] = vrshlq_s32(u[7], v_bit);
+
+    u[5] = vmulq_n_s32(v[5], cospi[24]);
+    u[5] = vmlaq_n_s32(u[5], v[6], cospi[40]);
+    u[5] = vrshlq_s32(u[5], v_bit);
+
+    u[6] = vmulq_n_s32(v[6], cospi[24]);
+    u[6] = vmlsq_n_s32(u[6], v[5], cospi[40]);
+    u[6] = vrshlq_s32(u[6], v_bit);
+
+    u[8] = vaddq_s32(v[8], v[9]);
+    u[9] = vsubq_s32(v[8], v[9]);
+    u[10] = vsubq_s32(v[11], v[10]);
+    u[11] = vaddq_s32(v[11], v[10]);
+    u[12] = vaddq_s32(v[12], v[13]);
+    u[13] = vsubq_s32(v[12], v[13]);
+    u[14] = vsubq_s32(v[15], v[14]);
+    u[15] = vaddq_s32(v[15], v[14]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = vmulq_n_s32(u[8], cospi[60]);
+    v[8] = vmlaq_n_s32(v[8], u[15], cospi[4]);
+    v[8] = vrshlq_s32(v[8], v_bit);
+
+    v[15] = vmulq_n_s32(u[15], cospi[60]);
+    v[15] = vmlsq_n_s32(v[15], u[8], cospi[4]);
+    v[15] = vrshlq_s32(v[15], v_bit);
+
+    v[9] = vmulq_n_s32(u[9], cospi[28]);
+    v[9] = vmlaq_n_s32(v[9], u[14], cospi[36]);
+    v[9] = vrshlq_s32(v[9], v_bit);
+
+    v[14] = vmulq_n_s32(u[14], cospi[28]);
+    v[14] = vmlsq_n_s32(v[14], u[9], cospi[36]);
+    v[14] = vrshlq_s32(v[14], v_bit);
+
+    v[10] = vmulq_n_s32(u[10], cospi[44]);
+    v[10] = vmlaq_n_s32(v[10], u[13], cospi[20]);
+    v[10] = vrshlq_s32(v[10], v_bit);
+
+    v[13] = vmulq_n_s32(u[13], cospi[44]);
+    v[13] = vmlsq_n_s32(v[13], u[10], cospi[20]);
+    v[13] = vrshlq_s32(v[13], v_bit);
+
+    v[11] = vmulq_n_s32(u[11], cospi[12]);
+    v[11] = vmlaq_n_s32(v[11], u[12], cospi[52]);
+    v[11] = vrshlq_s32(v[11], v_bit);
+
+    v[12] = vmulq_n_s32(u[12], cospi[12]);
+    v[12] = vmlsq_n_s32(v[12], u[11], cospi[52]);
+    v[12] = vrshlq_s32(v[12], v_bit);
+
+    out[0 * col_num + col] = v[0];
+    out[1 * col_num + col] = v[8];
+    out[2 * col_num + col] = v[4];
+    out[3 * col_num + col] = v[12];
+    out[4 * col_num + col] = v[2];
+    out[5 * col_num + col] = v[10];
+    out[6 * col_num + col] = v[6];
+    out[7 * col_num + col] = v[14];
+    out[8 * col_num + col] = v[1];
+    out[9 * col_num + col] = v[9];
+    out[10 * col_num + col] = v[5];
+    out[11 * col_num + col] = v[13];
+    out[12 * col_num + col] = v[3];
+    out[13 * col_num + col] = v[11];
+    out[14 * col_num + col] = v[7];
+    out[15 * col_num + col] = v[15];
+  }
+}
+
+static void fadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                            const int num_cols) {
+  const int32_t *cospi = cospi_arr(bit);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+  int32x4_t u[16], v[16], x, y;
+  int col;
+
+  for (col = 0; col < num_cols; ++col) {
+    // stage 0-1
+    u[0] = in[0 * num_cols + col];
+    u[1] = vnegq_s32(in[15 * num_cols + col]);
+    u[2] = vnegq_s32(in[7 * num_cols + col]);
+    u[3] = in[8 * num_cols + col];
+    u[4] = vnegq_s32(in[3 * num_cols + col]);
+    u[5] = in[12 * num_cols + col];
+    u[6] = in[4 * num_cols + col];
+    u[7] = vnegq_s32(in[11 * num_cols + col]);
+    u[8] = vnegq_s32(in[1 * num_cols + col]);
+    u[9] = in[14 * num_cols + col];
+    u[10] = in[6 * num_cols + col];
+    u[11] = vnegq_s32(in[9 * num_cols + col]);
+    u[12] = in[2 * num_cols + col];
+    u[13] = vnegq_s32(in[13 * num_cols + col]);
+    u[14] = vnegq_s32(in[5 * num_cols + col]);
+    u[15] = in[10 * num_cols + col];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+
+    x = vmulq_n_s32(u[2], cospi[32]);
+    y = vmulq_n_s32(u[3], cospi[32]);
+    v[2] = vaddq_s32(x, y);
+    v[2] = vrshlq_s32(v[2], v_bit);
+
+    v[3] = vsubq_s32(x, y);
+    v[3] = vrshlq_s32(v[3], v_bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    x = vmulq_n_s32(u[6], cospi[32]);
+    y = vmulq_n_s32(u[7], cospi[32]);
+    v[6] = vaddq_s32(x, y);
+    v[6] = vrshlq_s32(v[6], v_bit);
+
+    v[7] = vsubq_s32(x, y);
+    v[7] = vrshlq_s32(v[7], v_bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = vmulq_n_s32(u[10], cospi[32]);
+    y = vmulq_n_s32(u[11], cospi[32]);
+    v[10] = vaddq_s32(x, y);
+    v[10] = vrshlq_s32(v[10], v_bit);
+
+    v[11] = vsubq_s32(x, y);
+    v[11] = vrshlq_s32(v[11], v_bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    x = vmulq_n_s32(u[14], cospi[32]);
+    y = vmulq_n_s32(u[15], cospi[32]);
+    v[14] = vaddq_s32(x, y);
+    v[14] = vrshlq_s32(v[14], v_bit);
+
+    v[15] = vsubq_s32(x, y);
+    v[15] = vrshlq_s32(v[15], v_bit);
+
+    // stage 3
+    u[0] = vaddq_s32(v[0], v[2]);
+    u[1] = vaddq_s32(v[1], v[3]);
+    u[2] = vsubq_s32(v[0], v[2]);
+    u[3] = vsubq_s32(v[1], v[3]);
+    u[4] = vaddq_s32(v[4], v[6]);
+    u[5] = vaddq_s32(v[5], v[7]);
+    u[6] = vsubq_s32(v[4], v[6]);
+    u[7] = vsubq_s32(v[5], v[7]);
+    u[8] = vaddq_s32(v[8], v[10]);
+    u[9] = vaddq_s32(v[9], v[11]);
+    u[10] = vsubq_s32(v[8], v[10]);
+    u[11] = vsubq_s32(v[9], v[11]);
+    u[12] = vaddq_s32(v[12], v[14]);
+    u[13] = vaddq_s32(v[13], v[15]);
+    u[14] = vsubq_s32(v[12], v[14]);
+    u[15] = vsubq_s32(v[13], v[15]);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = half_btf_neon(&cospi[16], &u[4], &cospi[48], &u[5], v_bit);
+    v[7] = half_btf_neon(&cospi[16], &u[6], &cospi[48], &u[7], v_bit);
+    v[5] = half_btf_neon_m(&cospi[48], &u[4], &cospi[16], &u[5], v_bit);
+    v[6] = half_btf_neon_m(&cospi[16], &u[7], &cospi[48], &u[6], v_bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+
+    v[12] = half_btf_neon(&cospi[16], &u[12], &cospi[48], &u[13], v_bit);
+    v[15] = half_btf_neon(&cospi[16], &u[14], &cospi[48], &u[15], v_bit);
+    v[13] = half_btf_neon_m(&cospi[48], &u[12], &cospi[16], &u[13], v_bit);
+    v[14] = half_btf_neon_m(&cospi[16], &u[15], &cospi[48], &u[14], v_bit);
+
+    // stage 5
+    u[0] = vaddq_s32(v[0], v[4]);
+    u[1] = vaddq_s32(v[1], v[5]);
+    u[2] = vaddq_s32(v[2], v[6]);
+    u[3] = vaddq_s32(v[3], v[7]);
+    u[4] = vsubq_s32(v[0], v[4]);
+    u[5] = vsubq_s32(v[1], v[5]);
+    u[6] = vsubq_s32(v[2], v[6]);
+    u[7] = vsubq_s32(v[3], v[7]);
+    u[8] = vaddq_s32(v[8], v[12]);
+    u[9] = vaddq_s32(v[9], v[13]);
+    u[10] = vaddq_s32(v[10], v[14]);
+    u[11] = vaddq_s32(v[11], v[15]);
+    u[12] = vsubq_s32(v[8], v[12]);
+    u[13] = vsubq_s32(v[9], v[13]);
+    u[14] = vsubq_s32(v[10], v[14]);
+    u[15] = vsubq_s32(v[11], v[15]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = half_btf_neon(&cospi[8], &u[8], &cospi[56], &u[9], v_bit);
+    v[13] = half_btf_neon(&cospi[8], &u[12], &cospi[56], &u[13], v_bit);
+    v[9] = half_btf_neon_m(&cospi[56], &u[8], &cospi[8], &u[9], v_bit);
+    v[12] = half_btf_neon_m(&cospi[8], &u[13], &cospi[56], &u[12], v_bit);
+
+    v[10] = half_btf_neon(&cospi[40], &u[10], &cospi[24], &u[11], v_bit);
+    v[15] = half_btf_neon(&cospi[40], &u[14], &cospi[24], &u[15], v_bit);
+    v[11] = half_btf_neon_m(&cospi[24], &u[10], &cospi[40], &u[11], v_bit);
+    v[14] = half_btf_neon_m(&cospi[40], &u[15], &cospi[24], &u[14], v_bit);
+
+    // stage 7
+    u[0] = vaddq_s32(v[0], v[8]);
+    u[1] = vaddq_s32(v[1], v[9]);
+    u[2] = vaddq_s32(v[2], v[10]);
+    u[3] = vaddq_s32(v[3], v[11]);
+    u[4] = vaddq_s32(v[4], v[12]);
+    u[5] = vaddq_s32(v[5], v[13]);
+    u[6] = vaddq_s32(v[6], v[14]);
+    u[7] = vaddq_s32(v[7], v[15]);
+    u[8] = vsubq_s32(v[0], v[8]);
+    u[9] = vsubq_s32(v[1], v[9]);
+    u[10] = vsubq_s32(v[2], v[10]);
+    u[11] = vsubq_s32(v[3], v[11]);
+    u[12] = vsubq_s32(v[4], v[12]);
+    u[13] = vsubq_s32(v[5], v[13]);
+    u[14] = vsubq_s32(v[6], v[14]);
+    u[15] = vsubq_s32(v[7], v[15]);
+
+    // stage 8
+    v[0] = half_btf_neon(&cospi[2], &u[0], &cospi[62], &u[1], v_bit);
+    v[1] = half_btf_neon_m(&cospi[62], &u[0], &cospi[2], &u[1], v_bit);
+    v[2] = half_btf_neon(&cospi[10], &u[2], &cospi[54], &u[3], v_bit);
+    v[3] = half_btf_neon_m(&cospi[54], &u[2], &cospi[10], &u[3], v_bit);
+    v[4] = half_btf_neon(&cospi[18], &u[4], &cospi[46], &u[5], v_bit);
+    v[5] = half_btf_neon_m(&cospi[46], &u[4], &cospi[18], &u[5], v_bit);
+    v[6] = half_btf_neon(&cospi[26], &u[6], &cospi[38], &u[7], v_bit);
+    v[7] = half_btf_neon_m(&cospi[38], &u[6], &cospi[26], &u[7], v_bit);
+    v[8] = half_btf_neon(&cospi[34], &u[8], &cospi[30], &u[9], v_bit);
+    v[9] = half_btf_neon_m(&cospi[30], &u[8], &cospi[34], &u[9], v_bit);
+    v[10] = half_btf_neon(&cospi[42], &u[10], &cospi[22], &u[11], v_bit);
+    v[11] = half_btf_neon_m(&cospi[22], &u[10], &cospi[42], &u[11], v_bit);
+    v[12] = half_btf_neon(&cospi[50], &u[12], &cospi[14], &u[13], v_bit);
+    v[13] = half_btf_neon_m(&cospi[14], &u[12], &cospi[50], &u[13], v_bit);
+    v[14] = half_btf_neon(&cospi[58], &u[14], &cospi[6], &u[15], v_bit);
+    v[15] = half_btf_neon_m(&cospi[6], &u[14], &cospi[58], &u[15], v_bit);
+
+    // stage 9
+    out[0 * num_cols + col] = v[1];
+    out[1 * num_cols + col] = v[14];
+    out[2 * num_cols + col] = v[3];
+    out[3 * num_cols + col] = v[12];
+    out[4 * num_cols + col] = v[5];
+    out[5 * num_cols + col] = v[10];
+    out[6 * num_cols + col] = v[7];
+    out[7 * num_cols + col] = v[8];
+    out[8 * num_cols + col] = v[9];
+    out[9 * num_cols + col] = v[6];
+    out[10 * num_cols + col] = v[11];
+    out[11 * num_cols + col] = v[4];
+    out[12 * num_cols + col] = v[13];
+    out[13 * num_cols + col] = v[2];
+    out[14 * num_cols + col] = v[15];
+    out[15 * num_cols + col] = v[0];
+  }
+}
+
+static void col_txfm_16x16_rounding(int32x4_t *in, const int32x4_t *v_shift) {
+  // Note:
+  //  We split 16x16 rounding into 4 sections of 8x8 rounding,
+  //  instead of 4 columns
+  col_txfm_8x8_rounding(&in[0], v_shift);
+  col_txfm_8x8_rounding(&in[16], v_shift);
+  col_txfm_8x8_rounding(&in[32], v_shift);
+  col_txfm_8x8_rounding(&in[48], v_shift);
+}
+
+static void col_txfm_8x16_rounding(int32x4_t *in, const int32x4_t *v_shift) {
+  col_txfm_8x8_rounding(&in[0], v_shift);
+  col_txfm_8x8_rounding(&in[16], v_shift);
+}
+
+static void write_buffer_16x16(const int32x4_t *in, int32_t *output) {
+  const int size_8x8 = 16 * 4;
+  write_buffer_8x8(&in[0], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[16], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[32], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[48], output);
+}
+static void idtx16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                           int col_num) {
+  (void)bit;
+  int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
+  int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
+  int32x4_t a_low;
+
+  int num_iters = 16 * col_num;
+  for (int i = 0; i < num_iters; i++) {
+    a_low = vmulq_s32(in[i], fact);
+    a_low = vaddq_s32(a_low, offset);
+    out[i] = vshrq_n_s32(a_low, NewSqrt2Bits);
+  }
+}
+void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  int32x4_t in[64], out[64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
+  const int col_num = 4;
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case DCT_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case IDTX:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    default: assert(0);
+  }
+  (void)bd;
+}
+
+static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
+  for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
+  for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+
+typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
+                                      const int num_cols);
+
+static const fwd_transform_1d_neon col_highbd_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_neon,   // DCT_DCT
+  fadst8x8_neon,  // ADST_DCT
+  fdct8x8_neon,   // DCT_ADST
+  fadst8x8_neon,  // ADST_ADST
+  fadst8x8_neon,  // FLIPADST_DCT
+  fdct8x8_neon,   // DCT_FLIPADST
+  fadst8x8_neon,  // FLIPADST_FLIPADST
+  fadst8x8_neon,  // ADST_FLIPADST
+  fadst8x8_neon,  // FLIPADST_ADST
+  idtx8x8_neon,   // IDTX
+  fdct8x8_neon,   // V_DCT
+  idtx8x8_neon,   // H_DCT
+  fadst8x8_neon,  // V_ADST
+  idtx8x8_neon,   // H_ADST
+  fadst8x8_neon,  // V_FLIPADST
+  idtx8x8_neon    // H_FLIPADST
+};
+static const fwd_transform_1d_neon row_highbd_txfm32x8_arr[TX_TYPES] = {
+  fdct8x8_neon,   // DCT_DCT
+  NULL,           // ADST_DCT
+  NULL,           // DCT_ADST
+  NULL,           // ADST_ADST
+  NULL,           // FLIPADST_DCT
+  NULL,           // DCT_FLIPADST
+  NULL,           // FLIPADST_FLIPADST
+  NULL,           // ADST_FLIPADST
+  NULL,           // FLIPADST-ADST
+  idtx32x8_neon,  // IDTX
+  NULL,           // V_DCT
+  NULL,           // H_DCT
+  NULL,           // V_ADST
+  NULL,           // H_ADST
+  NULL,           // V_FLIPADST
+  NULL,           // H_FLIPADST
+};
+static const fwd_transform_1d_neon col_highbd_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_neon,   // DCT_DCT
+  fadst8x8_neon,  // ADST_DCT
+  fdct4x8_neon,   // DCT_ADST
+  fadst8x8_neon,  // ADST_ADST
+  fadst8x8_neon,  // FLIPADST_DCT
+  fdct4x8_neon,   // DCT_FLIPADST
+  fadst8x8_neon,  // FLIPADST_FLIPADST
+  fadst8x8_neon,  // ADST_FLIPADST
+  fadst8x8_neon,  // FLIPADST_ADST
+  idtx8x8_neon,   // IDTX
+  fdct4x8_neon,   // V_DCT
+  idtx8x8_neon,   // H_DCT
+  fadst8x8_neon,  // V_ADST
+  idtx8x8_neon,   // H_ADST
+  fadst8x8_neon,  // V_FLIPADST
+  idtx8x8_neon    // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon row_highbd_txfm8x16_arr[TX_TYPES] = {
+  fdct16x16_neon,   // DCT_DCT
+  fdct16x16_neon,   // ADST_DCT
+  fadst16x16_neon,  // DCT_ADST
+  fadst16x16_neon,  // ADST_ADST
+  fdct16x16_neon,   // FLIPADST_DCT
+  fadst16x16_neon,  // DCT_FLIPADST
+  fadst16x16_neon,  // FLIPADST_FLIPADST
+  fadst16x16_neon,  // ADST_FLIPADST
+  fadst16x16_neon,  // FLIPADST_ADST
+  idtx16x16_neon,   // IDTX
+  idtx16x16_neon,   // V_DCT
+  fdct16x16_neon,   // H_DCT
+  idtx16x16_neon,   // V_ADST
+  fadst16x16_neon,  // H_ADST
+  idtx16x16_neon,   // V_FLIPADST
+  fadst16x16_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon col_highbd_txfm8x16_arr[TX_TYPES] = {
+  fdct16x16_neon,   // DCT_DCT
+  fadst16x16_neon,  // ADST_DCT
+  fdct16x16_neon,   // DCT_ADST
+  fadst16x16_neon,  // ADST_ADST
+  fadst16x16_neon,  // FLIPADST_DCT
+  fdct16x16_neon,   // DCT_FLIPADST
+  fadst16x16_neon,  // FLIPADST_FLIPADST
+  fadst16x16_neon,  // ADST_FLIPADST
+  fadst16x16_neon,  // FLIPADST_ADST
+  idtx16x16_neon,   // IDTX
+  fdct16x16_neon,   // V_DCT
+  idtx16x16_neon,   // H_DCT
+  fadst16x16_neon,  // V_ADST
+  idtx16x16_neon,   // H_ADST
+  fadst16x16_neon,  // V_FLIPADST
+  idtx16x16_neon    // H_FLIPADST
+};
+static const fwd_transform_1d_neon row_highbd_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_neon,   // DCT_DCT
+  fdct8x8_neon,   // ADST_DCT
+  fadst8x8_neon,  // DCT_ADST
+  fadst8x8_neon,  // ADST_ADST
+  fdct8x8_neon,   // FLIPADST_DCT
+  fadst8x8_neon,  // DCT_FLIPADST
+  fadst8x8_neon,  // FLIPADST_FLIPADST
+  fadst8x8_neon,  // ADST_FLIPADST
+  fadst8x8_neon,  // FLIPADST_ADST
+  idtx8x8_neon,   // IDTX
+  idtx8x8_neon,   // V_DCT
+  fdct8x8_neon,   // H_DCT
+  idtx8x8_neon,   // V_ADST
+  fadst8x8_neon,  // H_ADST
+  idtx8x8_neon,   // V_FLIPADST
+  fadst8x8_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon row_highbd_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_neon,   // DCT_DCT
+  fdct4x8_neon,   // ADST_DCT
+  fadst8x8_neon,  // DCT_ADST
+  fadst8x8_neon,  // ADST_ADST
+  fdct4x8_neon,   // FLIPADST_DCT
+  fadst8x8_neon,  // DCT_FLIPADST
+  fadst8x8_neon,  // FLIPADST_FLIPADST
+  fadst8x8_neon,  // ADST_FLIPADST
+  fadst8x8_neon,  // FLIPADST_ADST
+  idtx8x8_neon,   // IDTX
+  idtx8x8_neon,   // V_DCT
+  fdct4x8_neon,   // H_DCT
+  idtx8x8_neon,   // V_ADST
+  fadst8x8_neon,  // H_ADST
+  idtx8x8_neon,   // V_FLIPADST
+  fadst8x8_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon row_highbd_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_neon,   // DCT_DCT
+  fdct4x4_neon,   // ADST_DCT
+  fadst4x4_neon,  // DCT_ADST
+  fadst4x4_neon,  // ADST_ADST
+  fdct4x4_neon,   // FLIPADST_DCT
+  fadst4x4_neon,  // DCT_FLIPADST
+  fadst4x4_neon,  // FLIPADST_FLIPADST
+  fadst4x4_neon,  // ADST_FLIPADST
+  fadst4x4_neon,  // FLIPADST_ADST
+  idtx4x4_neon,   // IDTX
+  idtx4x4_neon,   // V_DCT
+  fdct4x4_neon,   // H_DCT
+  idtx4x4_neon,   // V_ADST
+  fadst4x4_neon,  // H_ADST
+  idtx4x4_neon,   // V_FLIPADST
+  fadst4x4_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon col_highbd_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_neon,   // DCT_DCT
+  fadst4x4_neon,  // ADST_DCT
+  fdct4x4_neon,   // DCT_ADST
+  fadst4x4_neon,  // ADST_ADST
+  fadst4x4_neon,  // FLIPADST_DCT
+  fdct4x4_neon,   // DCT_FLIPADST
+  fadst4x4_neon,  // FLIPADST_FLIPADST
+  fadst4x4_neon,  // ADST_FLIPADST
+  fadst4x4_neon,  // FLIPADST_ADST
+  idtx4x4_neon,   // IDTX
+  fdct4x4_neon,   // V_DCT
+  idtx4x4_neon,   // H_DCT
+  fadst4x4_neon,  // V_ADST
+  idtx4x4_neon,   // H_ADST
+  fadst4x4_neon,  // V_FLIPADST
+  idtx4x4_neon    // H_FLIPADST
+};
+
+void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
+                         const int stride) {
+  int32x4_t buf0[32];
+  int32x4_t buf1[32];
+  const int32_t *cospi;
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int startidx = 0 * stride;
+  int endidx = 31 * stride;
+  // stage 0
+  // stage 1
+  buf1[0] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[31] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[1] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[30] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[2] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[29] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[3] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[28] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[4] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[27] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[5] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[26] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[6] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[25] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[7] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[24] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[8] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[23] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[9] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[22] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[10] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[21] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[11] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[20] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[12] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[19] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[13] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[18] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[14] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[17] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[15] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[16] = vsubq_s32(input[startidx], input[endidx]);
+
+  // stage 2
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = vaddq_s32(buf1[0], buf1[15]);
+  buf0[15] = vsubq_s32(buf1[0], buf1[15]);
+  buf0[1] = vaddq_s32(buf1[1], buf1[14]);
+  buf0[14] = vsubq_s32(buf1[1], buf1[14]);
+  buf0[2] = vaddq_s32(buf1[2], buf1[13]);
+  buf0[13] = vsubq_s32(buf1[2], buf1[13]);
+  buf0[3] = vaddq_s32(buf1[3], buf1[12]);
+  buf0[12] = vsubq_s32(buf1[3], buf1[12]);
+  buf0[4] = vaddq_s32(buf1[4], buf1[11]);
+  buf0[11] = vsubq_s32(buf1[4], buf1[11]);
+  buf0[5] = vaddq_s32(buf1[5], buf1[10]);
+  buf0[10] = vsubq_s32(buf1[5], buf1[10]);
+  buf0[6] = vaddq_s32(buf1[6], buf1[9]);
+  buf0[9] = vsubq_s32(buf1[6], buf1[9]);
+  buf0[7] = vaddq_s32(buf1[7], buf1[8]);
+  buf0[8] = vsubq_s32(buf1[7], buf1[8]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  buf0[18] = buf1[18];
+  buf0[19] = buf1[19];
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                    buf0[24], v_cos_bit);
+  buf0[28] = buf1[28];
+  buf0[29] = buf1[29];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 3
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = vaddq_s32(buf0[0], buf0[7]);
+  buf1[7] = vsubq_s32(buf0[0], buf0[7]);
+  buf1[1] = vaddq_s32(buf0[1], buf0[6]);
+  buf1[6] = vsubq_s32(buf0[1], buf0[6]);
+  buf1[2] = vaddq_s32(buf0[2], buf0[5]);
+  buf1[5] = vsubq_s32(buf0[2], buf0[5]);
+  buf1[3] = vaddq_s32(buf0[3], buf0[4]);
+  buf1[4] = vsubq_s32(buf0[3], buf0[4]);
+  buf1[8] = buf0[8];
+  buf1[9] = buf0[9];
+  btf_32_neon_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                    buf1[12], v_cos_bit);
+  buf1[14] = buf0[14];
+  buf1[15] = buf0[15];
+  buf1[16] = vaddq_s32(buf0[16], buf0[23]);
+  buf1[23] = vsubq_s32(buf0[16], buf0[23]);
+  buf1[17] = vaddq_s32(buf0[17], buf0[22]);
+  buf1[22] = vsubq_s32(buf0[17], buf0[22]);
+  buf1[18] = vaddq_s32(buf0[18], buf0[21]);
+  buf1[21] = vsubq_s32(buf0[18], buf0[21]);
+  buf1[19] = vaddq_s32(buf0[19], buf0[20]);
+  buf1[20] = vsubq_s32(buf0[19], buf0[20]);
+  buf1[24] = vsubq_s32(buf0[31], buf0[24]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[24]);
+  buf1[25] = vsubq_s32(buf0[30], buf0[25]);
+  buf1[30] = vaddq_s32(buf0[30], buf0[25]);
+  buf1[26] = vsubq_s32(buf0[29], buf0[26]);
+  buf1[29] = vaddq_s32(buf0[29], buf0[26]);
+  buf1[27] = vsubq_s32(buf0[28], buf0[27]);
+  buf1[28] = vaddq_s32(buf0[28], buf0[27]);
+
+  // stage 4
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = vaddq_s32(buf1[0], buf1[3]);
+  buf0[3] = vsubq_s32(buf1[0], buf1[3]);
+  buf0[1] = vaddq_s32(buf1[1], buf1[2]);
+  buf0[2] = vsubq_s32(buf1[1], buf1[2]);
+  buf0[4] = buf1[4];
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+                    v_cos_bit);
+  buf0[7] = buf1[7];
+  buf0[8] = vaddq_s32(buf1[8], buf1[11]);
+  buf0[11] = vsubq_s32(buf1[8], buf1[11]);
+  buf0[9] = vaddq_s32(buf1[9], buf1[10]);
+  buf0[10] = vsubq_s32(buf1[9], buf1[10]);
+  buf0[12] = vsubq_s32(buf1[15], buf1[12]);
+  buf0[15] = vaddq_s32(buf1[15], buf1[12]);
+  buf0[13] = vsubq_s32(buf1[14], buf1[13]);
+  buf0[14] = vaddq_s32(buf1[14], buf1[13]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+
+  btf_32_neon_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                    buf0[28], v_cos_bit);
+
+  btf_32_neon_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+
+  buf0[22] = buf1[22];
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[25] = buf1[25];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 5
+  btf_32_neon_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+                    v_cos_bit);
+
+  btf_32_neon_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+                    v_cos_bit);
+  buf1[4] = vaddq_s32(buf0[4], buf0[5]);
+  buf1[5] = vsubq_s32(buf0[4], buf0[5]);
+  buf1[6] = vsubq_s32(buf0[7], buf0[6]);
+  buf1[7] = vaddq_s32(buf0[7], buf0[6]);
+  buf1[8] = buf0[8];
+  btf_32_neon_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  buf1[11] = buf0[11];
+  buf1[12] = buf0[12];
+  buf1[15] = buf0[15];
+  buf1[16] = vaddq_s32(buf0[16], buf0[19]);
+  buf1[19] = vsubq_s32(buf0[16], buf0[19]);
+  buf1[17] = vaddq_s32(buf0[17], buf0[18]);
+  buf1[18] = vsubq_s32(buf0[17], buf0[18]);
+  buf1[20] = vsubq_s32(buf0[23], buf0[20]);
+  buf1[23] = vaddq_s32(buf0[23], buf0[20]);
+  buf1[21] = vsubq_s32(buf0[22], buf0[21]);
+  buf1[22] = vaddq_s32(buf0[22], buf0[21]);
+  buf1[24] = vaddq_s32(buf0[24], buf0[27]);
+  buf1[27] = vsubq_s32(buf0[24], buf0[27]);
+  buf1[25] = vaddq_s32(buf0[25], buf0[26]);
+  buf1[26] = vsubq_s32(buf0[25], buf0[26]);
+  buf1[28] = vsubq_s32(buf0[31], buf0[28]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[28]);
+  buf1[29] = vsubq_s32(buf0[30], buf0[29]);
+  buf1[30] = vaddq_s32(buf0[30], buf0[29]);
+
+  // stage 6
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+
+  btf_32_neon_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+                    buf0[30], v_cos_bit);
+  btf_32_neon_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+
+  buf0[8] = vaddq_s32(buf1[8], buf1[9]);
+  buf0[9] = vsubq_s32(buf1[8], buf1[9]);
+  buf0[10] = vsubq_s32(buf1[11], buf1[10]);
+  buf0[11] = vaddq_s32(buf1[11], buf1[10]);
+  buf0[12] = vaddq_s32(buf1[12], buf1[13]);
+  buf0[13] = vsubq_s32(buf1[12], buf1[13]);
+  buf0[14] = vsubq_s32(buf1[15], buf1[14]);
+  buf0[15] = vaddq_s32(buf1[15], buf1[14]);
+  buf0[16] = buf1[16];
+  buf0[19] = buf1[19];
+  buf0[20] = buf1[20];
+
+  btf_32_neon_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[27] = buf1[27];
+  buf0[28] = buf1[28];
+  buf0[31] = buf1[31];
+
+  // stage 7
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = buf0[0];
+  buf1[1] = buf0[1];
+  buf1[2] = buf0[2];
+  buf1[3] = buf0[3];
+  buf1[4] = buf0[4];
+  buf1[5] = buf0[5];
+  buf1[6] = buf0[6];
+  buf1[7] = buf0[7];
+  btf_32_neon_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  btf_32_neon_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+                    buf1[12], v_cos_bit);
+  buf1[16] = vaddq_s32(buf0[16], buf0[17]);
+  buf1[17] = vsubq_s32(buf0[16], buf0[17]);
+  buf1[18] = vsubq_s32(buf0[19], buf0[18]);
+  buf1[19] = vaddq_s32(buf0[19], buf0[18]);
+  buf1[20] = vaddq_s32(buf0[20], buf0[21]);
+  buf1[21] = vsubq_s32(buf0[20], buf0[21]);
+  buf1[22] = vsubq_s32(buf0[23], buf0[22]);
+  buf1[23] = vaddq_s32(buf0[23], buf0[22]);
+  buf1[24] = vaddq_s32(buf0[24], buf0[25]);
+  buf1[25] = vsubq_s32(buf0[24], buf0[25]);
+  buf1[26] = vsubq_s32(buf0[27], buf0[26]);
+  buf1[27] = vaddq_s32(buf0[27], buf0[26]);
+  buf1[28] = vaddq_s32(buf0[28], buf0[29]);
+  buf1[29] = vsubq_s32(buf0[28], buf0[29]);
+  buf1[30] = vsubq_s32(buf0[31], buf0[30]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[30]);
+
+  // stage 8
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  buf0[4] = buf1[4];
+  buf0[5] = buf1[5];
+  buf0[6] = buf1[6];
+  buf0[7] = buf1[7];
+  buf0[8] = buf1[8];
+  buf0[9] = buf1[9];
+  buf0[10] = buf1[10];
+  buf0[11] = buf1[11];
+  buf0[12] = buf1[12];
+  buf0[13] = buf1[13];
+  buf0[14] = buf1[14];
+  buf0[15] = buf1[15];
+  btf_32_neon_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+                    buf0[30], v_cos_bit);
+  btf_32_neon_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+  btf_32_neon_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+                    buf0[28], v_cos_bit);
+  btf_32_neon_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_neon_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+  btf_32_neon_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
+                    v_cos_bit);
+
+  startidx = 0 * stride;
+  endidx = 31 * stride;
+  // stage 9
+  output[startidx] = buf0[0];
+  output[endidx] = buf0[31];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[16];
+  output[endidx] = buf0[15];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[8];
+  output[endidx] = buf0[23];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[24];
+  output[endidx] = buf0[7];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[4];
+  output[endidx] = buf0[27];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[20];
+  output[endidx] = buf0[11];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[12];
+  output[endidx] = buf0[19];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[28];
+  output[endidx] = buf0[3];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[2];
+  output[endidx] = buf0[29];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[18];
+  output[endidx] = buf0[13];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[10];
+  output[endidx] = buf0[21];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[26];
+  output[endidx] = buf0[5];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[6];
+  output[endidx] = buf0[25];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[22];
+  output[endidx] = buf0[9];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[14];
+  output[endidx] = buf0[17];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[30];
+  output[endidx] = buf0[1];
+}
+
+void av1_fadst4_new_neon(const int32x4_t *input, int32x4_t *output,
+                         const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 4;
+  const int num_per_128 = 4;
+  const int32_t *cospi;
+  int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+  int32x4_t buf0[4];
+  int32x4_t buf1[4];
+  int col_num = txfm_size / num_per_128;
+  int col;
+  (void)stage_range;
+  cospi = cospi_arr(cos_bit);
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    int j;
+    for (j = 0; j < 4; ++j) {
+      buf0[j] = input[j * col_num + col];
+    }
+
+    // stage 1
+    stage_idx++;
+    buf1[0] = buf0[3];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[1];
+    buf1[3] = buf0[2];
+
+    // stage 2
+    stage_idx++;
+
+    btf_32_neon_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+                      v_cos_bit);
+    btf_32_neon_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], buf0[3],
+                      v_cos_bit);
+
+    // stage 3
+    stage_idx++;
+    buf1[0] = vaddq_s32(buf0[0], buf0[2]);
+    buf1[2] = vsubq_s32(buf0[0], buf0[2]);
+    buf1[1] = vaddq_s32(buf0[1], buf0[3]);
+    buf1[3] = vsubq_s32(buf0[1], buf0[3]);
+
+    // stage 4
+    stage_idx++;
+
+    cospi = cospi_arr(cos_bit);
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+
+    btf_32_neon_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
+                      v_cos_bit);
+
+    // stage 5
+    stage_idx++;
+    buf1[0] = buf0[0];
+    buf1[1] = vnegq_s32(buf0[2]);
+    buf1[2] = buf0[3];
+    buf1[3] = vnegq_s32(buf0[1]);
+
+    for (j = 0; j < 4; ++j) {
+      output[j * col_num + col] = buf1[j];
+    }
+  }
+}
+
+static void av1_fdct64_new_stage12345_neon(int32x4_t *input, const int instride,
+                                           int32x4_t *x5, const int32_t *cospi,
+                                           const int32x4_t *v_cos_bit,
+                                           int *startidx, int *endidx) {
+  int32x4_t x1[64];
+  x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
+
+  // stage 2
+  int32x4_t x2[64];
+  x2[0] = vaddq_s32(x1[0], x1[31]);
+  x2[31] = vsubq_s32(x1[0], x1[31]);
+  x2[1] = vaddq_s32(x1[1], x1[30]);
+  x2[30] = vsubq_s32(x1[1], x1[30]);
+  x2[2] = vaddq_s32(x1[2], x1[29]);
+  x2[29] = vsubq_s32(x1[2], x1[29]);
+  x2[3] = vaddq_s32(x1[3], x1[28]);
+  x2[28] = vsubq_s32(x1[3], x1[28]);
+  x2[4] = vaddq_s32(x1[4], x1[27]);
+  x2[27] = vsubq_s32(x1[4], x1[27]);
+  x2[5] = vaddq_s32(x1[5], x1[26]);
+  x2[26] = vsubq_s32(x1[5], x1[26]);
+  x2[6] = vaddq_s32(x1[6], x1[25]);
+  x2[25] = vsubq_s32(x1[6], x1[25]);
+  x2[7] = vaddq_s32(x1[7], x1[24]);
+  x2[24] = vsubq_s32(x1[7], x1[24]);
+  x2[8] = vaddq_s32(x1[8], x1[23]);
+  x2[23] = vsubq_s32(x1[8], x1[23]);
+  x2[9] = vaddq_s32(x1[9], x1[22]);
+  x2[22] = vsubq_s32(x1[9], x1[22]);
+  x2[10] = vaddq_s32(x1[10], x1[21]);
+  x2[21] = vsubq_s32(x1[10], x1[21]);
+  x2[11] = vaddq_s32(x1[11], x1[20]);
+  x2[20] = vsubq_s32(x1[11], x1[20]);
+  x2[12] = vaddq_s32(x1[12], x1[19]);
+  x2[19] = vsubq_s32(x1[12], x1[19]);
+  x2[13] = vaddq_s32(x1[13], x1[18]);
+  x2[18] = vsubq_s32(x1[13], x1[18]);
+  x2[14] = vaddq_s32(x1[14], x1[17]);
+  x2[17] = vsubq_s32(x1[14], x1[17]);
+  x2[15] = vaddq_s32(x1[15], x1[16]);
+  x2[16] = vsubq_s32(x1[15], x1[16]);
+  x2[32] = x1[32];
+  x2[33] = x1[33];
+  x2[34] = x1[34];
+  x2[35] = x1[35];
+  x2[36] = x1[36];
+  x2[37] = x1[37];
+  x2[38] = x1[38];
+  x2[39] = x1[39];
+
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
+                    *v_cos_bit);
+  x2[56] = x1[56];
+  x2[57] = x1[57];
+  x2[58] = x1[58];
+  x2[59] = x1[59];
+  x2[60] = x1[60];
+  x2[61] = x1[61];
+  x2[62] = x1[62];
+  x2[63] = x1[63];
+
+  // stage 3
+  int32x4_t x3[64];
+  x3[0] = vaddq_s32(x2[0], x2[15]);
+  x3[15] = vsubq_s32(x2[0], x2[15]);
+  x3[1] = vaddq_s32(x2[1], x2[14]);
+  x3[14] = vsubq_s32(x2[1], x2[14]);
+  x3[2] = vaddq_s32(x2[2], x2[13]);
+  x3[13] = vsubq_s32(x2[2], x2[13]);
+  x3[3] = vaddq_s32(x2[3], x2[12]);
+  x3[12] = vsubq_s32(x2[3], x2[12]);
+  x3[4] = vaddq_s32(x2[4], x2[11]);
+  x3[11] = vsubq_s32(x2[4], x2[11]);
+  x3[5] = vaddq_s32(x2[5], x2[10]);
+  x3[10] = vsubq_s32(x2[5], x2[10]);
+  x3[6] = vaddq_s32(x2[6], x2[9]);
+  x3[9] = vsubq_s32(x2[6], x2[9]);
+  x3[7] = vaddq_s32(x2[7], x2[8]);
+  x3[8] = vsubq_s32(x2[7], x2[8]);
+  x3[16] = x2[16];
+  x3[17] = x2[17];
+  x3[18] = x2[18];
+  x3[19] = x2[19];
+  btf_32_neon_type0(-cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
+                    *v_cos_bit);
+  x3[28] = x2[28];
+  x3[29] = x2[29];
+  x3[30] = x2[30];
+  x3[31] = x2[31];
+  x3[32] = vaddq_s32(x2[32], x2[47]);
+  x3[47] = vsubq_s32(x2[32], x2[47]);
+  x3[33] = vaddq_s32(x2[33], x2[46]);
+  x3[46] = vsubq_s32(x2[33], x2[46]);
+  x3[34] = vaddq_s32(x2[34], x2[45]);
+  x3[45] = vsubq_s32(x2[34], x2[45]);
+  x3[35] = vaddq_s32(x2[35], x2[44]);
+  x3[44] = vsubq_s32(x2[35], x2[44]);
+  x3[36] = vaddq_s32(x2[36], x2[43]);
+  x3[43] = vsubq_s32(x2[36], x2[43]);
+  x3[37] = vaddq_s32(x2[37], x2[42]);
+  x3[42] = vsubq_s32(x2[37], x2[42]);
+  x3[38] = vaddq_s32(x2[38], x2[41]);
+  x3[41] = vsubq_s32(x2[38], x2[41]);
+  x3[39] = vaddq_s32(x2[39], x2[40]);
+  x3[40] = vsubq_s32(x2[39], x2[40]);
+  x3[48] = vsubq_s32(x2[63], x2[48]);
+  x3[63] = vaddq_s32(x2[63], x2[48]);
+  x3[49] = vsubq_s32(x2[62], x2[49]);
+  x3[62] = vaddq_s32(x2[62], x2[49]);
+  x3[50] = vsubq_s32(x2[61], x2[50]);
+  x3[61] = vaddq_s32(x2[61], x2[50]);
+  x3[51] = vsubq_s32(x2[60], x2[51]);
+  x3[60] = vaddq_s32(x2[60], x2[51]);
+  x3[52] = vsubq_s32(x2[59], x2[52]);
+  x3[59] = vaddq_s32(x2[59], x2[52]);
+  x3[53] = vsubq_s32(x2[58], x2[53]);
+  x3[58] = vaddq_s32(x2[58], x2[53]);
+  x3[54] = vsubq_s32(x2[57], x2[54]);
+  x3[57] = vaddq_s32(x2[57], x2[54]);
+  x3[55] = vsubq_s32(x2[56], x2[55]);
+  x3[56] = vaddq_s32(x2[56], x2[55]);
+
+  // stage 4
+  int32x4_t x4[64];
+  x4[0] = vaddq_s32(x3[0], x3[7]);
+  x4[7] = vsubq_s32(x3[0], x3[7]);
+  x4[1] = vaddq_s32(x3[1], x3[6]);
+  x4[6] = vsubq_s32(x3[1], x3[6]);
+  x4[2] = vaddq_s32(x3[2], x3[5]);
+  x4[5] = vsubq_s32(x3[2], x3[5]);
+  x4[3] = vaddq_s32(x3[3], x3[4]);
+  x4[4] = vsubq_s32(x3[3], x3[4]);
+  x4[8] = x3[8];
+  x4[9] = x3[9];
+  btf_32_neon_type0(-cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
+                    *v_cos_bit);
+  x4[14] = x3[14];
+  x4[15] = x3[15];
+  x4[16] = vaddq_s32(x3[16], x3[23]);
+  x4[23] = vsubq_s32(x3[16], x3[23]);
+  x4[17] = vaddq_s32(x3[17], x3[22]);
+  x4[22] = vsubq_s32(x3[17], x3[22]);
+  x4[18] = vaddq_s32(x3[18], x3[21]);
+  x4[21] = vsubq_s32(x3[18], x3[21]);
+  x4[19] = vaddq_s32(x3[19], x3[20]);
+  x4[20] = vsubq_s32(x3[19], x3[20]);
+  x4[24] = vsubq_s32(x3[31], x3[24]);
+  x4[31] = vaddq_s32(x3[31], x3[24]);
+  x4[25] = vsubq_s32(x3[30], x3[25]);
+  x4[30] = vaddq_s32(x3[30], x3[25]);
+  x4[26] = vsubq_s32(x3[29], x3[26]);
+  x4[29] = vaddq_s32(x3[29], x3[26]);
+  x4[27] = vsubq_s32(x3[28], x3[27]);
+  x4[28] = vaddq_s32(x3[28], x3[27]);
+  x4[32] = x3[32];
+  x4[33] = x3[33];
+  x4[34] = x3[34];
+  x4[35] = x3[35];
+
+  btf_32_neon_type0(-cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x3[40], x3[55], x4[40], x4[55],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x3[41], x3[54], x4[41], x4[54],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x3[42], x3[53], x4[42], x4[53],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x3[43], x3[52], x4[43], x4[52],
+                    *v_cos_bit);
+  x4[44] = x3[44];
+  x4[45] = x3[45];
+  x4[46] = x3[46];
+  x4[47] = x3[47];
+  x4[48] = x3[48];
+  x4[49] = x3[49];
+  x4[50] = x3[50];
+  x4[51] = x3[51];
+  x4[60] = x3[60];
+  x4[61] = x3[61];
+  x4[62] = x3[62];
+  x4[63] = x3[63];
+
+  // stage 5
+  x5[0] = vaddq_s32(x4[0], x4[3]);
+  x5[3] = vsubq_s32(x4[0], x4[3]);
+  x5[1] = vaddq_s32(x4[1], x4[2]);
+  x5[2] = vsubq_s32(x4[1], x4[2]);
+  x5[4] = x4[4];
+
+  btf_32_neon_type0(-cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
+                    *v_cos_bit);
+  x5[7] = x4[7];
+  x5[8] = vaddq_s32(x4[8], x4[11]);
+  x5[11] = vsubq_s32(x4[8], x4[11]);
+  x5[9] = vaddq_s32(x4[9], x4[10]);
+  x5[10] = vsubq_s32(x4[9], x4[10]);
+  x5[12] = vsubq_s32(x4[15], x4[12]);
+  x5[15] = vaddq_s32(x4[15], x4[12]);
+  x5[13] = vsubq_s32(x4[14], x4[13]);
+  x5[14] = vaddq_s32(x4[14], x4[13]);
+  x5[16] = x4[16];
+  x5[17] = x4[17];
+
+  btf_32_neon_type0(-cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x4[20], x4[27], x5[20], x5[27],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x4[21], x4[26], x5[21], x5[26],
+                    *v_cos_bit);
+  x5[22] = x4[22];
+  x5[23] = x4[23];
+  x5[24] = x4[24];
+  x5[25] = x4[25];
+  x5[30] = x4[30];
+  x5[31] = x4[31];
+  x5[32] = vaddq_s32(x4[32], x4[39]);
+  x5[39] = vsubq_s32(x4[32], x4[39]);
+  x5[33] = vaddq_s32(x4[33], x4[38]);
+  x5[38] = vsubq_s32(x4[33], x4[38]);
+  x5[34] = vaddq_s32(x4[34], x4[37]);
+  x5[37] = vsubq_s32(x4[34], x4[37]);
+  x5[35] = vaddq_s32(x4[35], x4[36]);
+  x5[36] = vsubq_s32(x4[35], x4[36]);
+  x5[40] = vsubq_s32(x4[47], x4[40]);
+  x5[47] = vaddq_s32(x4[47], x4[40]);
+  x5[41] = vsubq_s32(x4[46], x4[41]);
+  x5[46] = vaddq_s32(x4[46], x4[41]);
+  x5[42] = vsubq_s32(x4[45], x4[42]);
+  x5[45] = vaddq_s32(x4[45], x4[42]);
+  x5[43] = vsubq_s32(x4[44], x4[43]);
+  x5[44] = vaddq_s32(x4[44], x4[43]);
+  x5[48] = vaddq_s32(x4[48], x4[55]);
+  x5[55] = vsubq_s32(x4[48], x4[55]);
+  x5[49] = vaddq_s32(x4[49], x4[54]);
+  x5[54] = vsubq_s32(x4[49], x4[54]);
+  x5[50] = vaddq_s32(x4[50], x4[53]);
+  x5[53] = vsubq_s32(x4[50], x4[53]);
+  x5[51] = vaddq_s32(x4[51], x4[52]);
+  x5[52] = vsubq_s32(x4[51], x4[52]);
+  x5[56] = vsubq_s32(x4[63], x4[56]);
+  x5[63] = vaddq_s32(x4[63], x4[56]);
+  x5[57] = vsubq_s32(x4[62], x4[57]);
+  x5[62] = vaddq_s32(x4[62], x4[57]);
+  x5[58] = vsubq_s32(x4[61], x4[58]);
+  x5[61] = vaddq_s32(x4[61], x4[58]);
+  x5[59] = vsubq_s32(x4[60], x4[59]);
+  x5[60] = vaddq_s32(x4[60], x4[59]);
+}
+
+static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
+                                int8_t cos_bit, const int instride,
+                                const int outstride) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int startidx = 0 * instride;
+  int endidx = 63 * instride;
+
+  // stage 1-2-3-4-5
+  int32x4_t x5[64];
+  av1_fdct64_new_stage12345_neon(input, instride, x5, cospi, &v_cos_bit,
+                                 &startidx, &endidx);
+
+  // stage 6
+  int32x4_t x6[64];
+  btf_32_neon_type0(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
+                    v_cos_bit);
+  x6[4] = vaddq_s32(x5[4], x5[5]);
+  x6[5] = vsubq_s32(x5[4], x5[5]);
+  x6[6] = vsubq_s32(x5[7], x5[6]);
+  x6[7] = vaddq_s32(x5[7], x5[6]);
+  x6[8] = x5[8];
+  btf_32_neon_type0(-cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x5[10], x5[13], x6[10], x6[13],
+                    v_cos_bit);
+  x6[11] = x5[11];
+  x6[12] = x5[12];
+  x6[15] = x5[15];
+  x6[16] = vaddq_s32(x5[16], x5[19]);
+  x6[19] = vsubq_s32(x5[16], x5[19]);
+  x6[17] = vaddq_s32(x5[17], x5[18]);
+  x6[18] = vsubq_s32(x5[17], x5[18]);
+  x6[20] = vsubq_s32(x5[23], x5[20]);
+  x6[23] = vaddq_s32(x5[23], x5[20]);
+  x6[21] = vsubq_s32(x5[22], x5[21]);
+  x6[22] = vaddq_s32(x5[22], x5[21]);
+  x6[24] = vaddq_s32(x5[24], x5[27]);
+  x6[27] = vsubq_s32(x5[24], x5[27]);
+  x6[25] = vaddq_s32(x5[25], x5[26]);
+  x6[26] = vsubq_s32(x5[25], x5[26]);
+  x6[28] = vsubq_s32(x5[31], x5[28]);
+  x6[31] = vaddq_s32(x5[31], x5[28]);
+  x6[29] = vsubq_s32(x5[30], x5[29]);
+  x6[30] = vaddq_s32(x5[30], x5[29]);
+  x6[32] = x5[32];
+  x6[33] = x5[33];
+
+  btf_32_neon_type0(-cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[24], -cospi[40], x5[44], x5[51], x6[44], x6[51],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[24], -cospi[40], x5[45], x5[50], x6[45], x6[50],
+                    v_cos_bit);
+
+  x6[46] = x5[46];
+  x6[47] = x5[47];
+  x6[48] = x5[48];
+  x6[49] = x5[49];
+  x6[54] = x5[54];
+  x6[55] = x5[55];
+  x6[56] = x5[56];
+  x6[57] = x5[57];
+  x6[62] = x5[62];
+  x6[63] = x5[63];
+
+  // stage 7
+  int32x4_t x7[64];
+  x7[0] = x6[0];
+  x7[1] = x6[1];
+  x7[2] = x6[2];
+  x7[3] = x6[3];
+  btf_32_neon_type1(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
+                    v_cos_bit);
+
+  x7[8] = vaddq_s32(x6[8], x6[9]);
+  x7[9] = vsubq_s32(x6[8], x6[9]);
+  x7[10] = vsubq_s32(x6[11], x6[10]);
+  x7[11] = vaddq_s32(x6[11], x6[10]);
+  x7[12] = vaddq_s32(x6[12], x6[13]);
+  x7[13] = vsubq_s32(x6[12], x6[13]);
+  x7[14] = vsubq_s32(x6[15], x6[14]);
+  x7[15] = vaddq_s32(x6[15], x6[14]);
+  x7[16] = x6[16];
+
+  btf_32_neon_type0(-cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[24], -cospi[40], x6[22], x6[25], x7[22], x7[25],
+                    v_cos_bit);
+  x7[23] = x6[23];
+  x7[24] = x6[24];
+  x7[27] = x6[27];
+  x7[28] = x6[28];
+  x7[31] = x6[31];
+
+  btf_32_neon_type0(-cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[56], -cospi[8], x5[36], x5[59], x6[36], x6[59],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[56], -cospi[8], x5[37], x5[58], x6[37], x6[58],
+                    v_cos_bit);
+  x6[38] = x5[38];
+  x6[39] = x5[39];
+  x6[40] = x5[40];
+  x6[41] = x5[41];
+
+  btf_32_neon_type1(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
+  btf_32_neon_type0(-cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[56], -cospi[8], x6[18], x6[29], x7[18], x7[29],
+                    v_cos_bit);
+  x7[19] = x6[19];
+  x7[20] = x6[20];
+
+  x7[32] = vaddq_s32(x6[32], x6[35]);
+  x7[35] = vsubq_s32(x6[32], x6[35]);
+  x7[33] = vaddq_s32(x6[33], x6[34]);
+  x7[34] = vsubq_s32(x6[33], x6[34]);
+  x7[36] = vsubq_s32(x6[39], x6[36]);
+  x7[39] = vaddq_s32(x6[39], x6[36]);
+  x7[37] = vsubq_s32(x6[38], x6[37]);
+  x7[38] = vaddq_s32(x6[38], x6[37]);
+  x7[40] = vaddq_s32(x6[40], x6[43]);
+  x7[43] = vsubq_s32(x6[40], x6[43]);
+  x7[41] = vaddq_s32(x6[41], x6[42]);
+  x7[42] = vsubq_s32(x6[41], x6[42]);
+  x7[44] = vsubq_s32(x6[47], x6[44]);
+  x7[47] = vaddq_s32(x6[47], x6[44]);
+  x7[45] = vsubq_s32(x6[46], x6[45]);
+  x7[46] = vaddq_s32(x6[46], x6[45]);
+  x7[48] = vaddq_s32(x6[48], x6[51]);
+  x7[51] = vsubq_s32(x6[48], x6[51]);
+  x7[49] = vaddq_s32(x6[49], x6[50]);
+  x7[50] = vsubq_s32(x6[49], x6[50]);
+  x7[52] = vsubq_s32(x6[55], x6[52]);
+  x7[55] = vaddq_s32(x6[55], x6[52]);
+  x7[53] = vsubq_s32(x6[54], x6[53]);
+  x7[54] = vaddq_s32(x6[54], x6[53]);
+  x7[56] = vaddq_s32(x6[56], x6[59]);
+  x7[59] = vsubq_s32(x6[56], x6[59]);
+  x7[57] = vaddq_s32(x6[57], x6[58]);
+  x7[58] = vsubq_s32(x6[57], x6[58]);
+  x7[60] = vsubq_s32(x6[63], x6[60]);
+  x7[63] = vaddq_s32(x6[63], x6[60]);
+  x7[61] = vsubq_s32(x6[62], x6[61]);
+  x7[62] = vaddq_s32(x6[62], x6[61]);
+
+  // stage 8
+  int32x4_t x8[64];
+  x8[0] = x7[0];
+  x8[1] = x7[1];
+  x8[2] = x7[2];
+  x8[3] = x7[3];
+  x8[4] = x7[4];
+  x8[5] = x7[5];
+  x8[6] = x7[6];
+  x8[7] = x7[7];
+
+  btf_32_neon_type1(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
+                    v_cos_bit);
+  x8[16] = vaddq_s32(x7[16], x7[17]);
+  x8[17] = vsubq_s32(x7[16], x7[17]);
+  x8[18] = vsubq_s32(x7[19], x7[18]);
+  x8[19] = vaddq_s32(x7[19], x7[18]);
+  x8[20] = vaddq_s32(x7[20], x7[21]);
+  x8[21] = vsubq_s32(x7[20], x7[21]);
+  x8[22] = vsubq_s32(x7[23], x7[22]);
+  x8[23] = vaddq_s32(x7[23], x7[22]);
+  x8[24] = vaddq_s32(x7[24], x7[25]);
+  x8[25] = vsubq_s32(x7[24], x7[25]);
+  x8[26] = vsubq_s32(x7[27], x7[26]);
+  x8[27] = vaddq_s32(x7[27], x7[26]);
+  x8[28] = vaddq_s32(x7[28], x7[29]);
+  x8[29] = vsubq_s32(x7[28], x7[29]);
+  x8[30] = vsubq_s32(x7[31], x7[30]);
+  x8[31] = vaddq_s32(x7[31], x7[30]);
+  x8[32] = x7[32];
+
+  btf_32_neon_type0(-cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[60], -cospi[4], x7[34], x7[61], x8[34], x8[61],
+                    v_cos_bit);
+  x8[35] = x7[35];
+  x8[36] = x7[36];
+  btf_32_neon_type0(-cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[28], -cospi[36], x7[38], x7[57], x8[38], x8[57],
+                    v_cos_bit);
+  x8[39] = x7[39];
+  x8[40] = x7[40];
+  btf_32_neon_type0(-cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[44], -cospi[20], x7[42], x7[53], x8[42], x8[53],
+                    v_cos_bit);
+  x8[43] = x7[43];
+  x8[44] = x7[44];
+  btf_32_neon_type0(-cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[12], -cospi[52], x7[46], x7[49], x8[46], x8[49],
+                    v_cos_bit);
+  x8[47] = x7[47];
+  x8[48] = x7[48];
+  x8[51] = x7[51];
+  x8[52] = x7[52];
+  x8[55] = x7[55];
+  x8[56] = x7[56];
+  x8[59] = x7[59];
+  x8[60] = x7[60];
+  x8[63] = x7[63];
+
+  // stage 9
+  int32x4_t x9[64];
+  x9[0] = x8[0];
+  x9[1] = x8[1];
+  x9[2] = x8[2];
+  x9[3] = x8[3];
+  x9[4] = x8[4];
+  x9[5] = x8[5];
+  x9[6] = x8[6];
+  x9[7] = x8[7];
+  x9[8] = x8[8];
+  x9[9] = x8[9];
+  x9[10] = x8[10];
+  x9[11] = x8[11];
+  x9[12] = x8[12];
+  x9[13] = x8[13];
+  x9[14] = x8[14];
+  x9[15] = x8[15];
+
+  btf_32_neon_type1(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
+                    v_cos_bit);
+
+  x9[32] = vaddq_s32(x8[32], x8[33]);
+  x9[33] = vsubq_s32(x8[32], x8[33]);
+  x9[34] = vsubq_s32(x8[35], x8[34]);
+  x9[35] = vaddq_s32(x8[35], x8[34]);
+  x9[36] = vaddq_s32(x8[36], x8[37]);
+  x9[37] = vsubq_s32(x8[36], x8[37]);
+  x9[38] = vsubq_s32(x8[39], x8[38]);
+  x9[39] = vaddq_s32(x8[39], x8[38]);
+  x9[40] = vaddq_s32(x8[40], x8[41]);
+  x9[41] = vsubq_s32(x8[40], x8[41]);
+  x9[42] = vsubq_s32(x8[43], x8[42]);
+  x9[43] = vaddq_s32(x8[43], x8[42]);
+  x9[44] = vaddq_s32(x8[44], x8[45]);
+  x9[45] = vsubq_s32(x8[44], x8[45]);
+  x9[46] = vsubq_s32(x8[47], x8[46]);
+  x9[47] = vaddq_s32(x8[47], x8[46]);
+  x9[48] = vaddq_s32(x8[48], x8[49]);
+  x9[49] = vsubq_s32(x8[48], x8[49]);
+  x9[50] = vsubq_s32(x8[51], x8[50]);
+  x9[51] = vaddq_s32(x8[51], x8[50]);
+  x9[52] = vaddq_s32(x8[52], x8[53]);
+  x9[53] = vsubq_s32(x8[52], x8[53]);
+  x9[54] = vsubq_s32(x8[55], x8[54]);
+  x9[55] = vaddq_s32(x8[55], x8[54]);
+  x9[56] = vaddq_s32(x8[56], x8[57]);
+  x9[57] = vsubq_s32(x8[56], x8[57]);
+  x9[58] = vsubq_s32(x8[59], x8[58]);
+  x9[59] = vaddq_s32(x8[59], x8[58]);
+  x9[60] = vaddq_s32(x8[60], x8[61]);
+  x9[61] = vsubq_s32(x8[60], x8[61]);
+  x9[62] = vsubq_s32(x8[63], x8[62]);
+  x9[63] = vaddq_s32(x8[63], x8[62]);
+
+  // stage 10
+  int32x4_t x10[64];
+  x10[0] = x9[0];
+  x10[1] = x9[1];
+  x10[2] = x9[2];
+  x10[3] = x9[3];
+  x10[4] = x9[4];
+  x10[5] = x9[5];
+  x10[6] = x9[6];
+  x10[7] = x9[7];
+  x10[8] = x9[8];
+  x10[9] = x9[9];
+  x10[10] = x9[10];
+  x10[11] = x9[11];
+  x10[12] = x9[12];
+  x10[13] = x9[13];
+  x10[14] = x9[14];
+  x10[15] = x9[15];
+  x10[16] = x9[16];
+  x10[17] = x9[17];
+  x10[18] = x9[18];
+  x10[19] = x9[19];
+  x10[20] = x9[20];
+  x10[21] = x9[21];
+  x10[22] = x9[22];
+  x10[23] = x9[23];
+  x10[24] = x9[24];
+  x10[25] = x9[25];
+  x10[26] = x9[26];
+  x10[27] = x9[27];
+  x10[28] = x9[28];
+  x10[29] = x9[29];
+  x10[30] = x9[30];
+  x10[31] = x9[31];
+  btf_32_neon_type1(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
+                    v_cos_bit);
+
+  startidx = 0 * outstride;
+  endidx = 63 * outstride;
+  // stage 11
+  output[startidx] = x10[0];
+  output[endidx] = x10[63];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[32];
+  output[endidx] = x10[31];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[16];
+  output[endidx] = x10[47];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[48];
+  output[endidx] = x10[15];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[8];
+  output[endidx] = x10[55];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[40];
+  output[endidx] = x10[23];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[24];
+  output[endidx] = x10[39];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[56];
+  output[endidx] = x10[7];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[4];
+  output[endidx] = x10[59];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[36];
+  output[endidx] = x10[27];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[20];
+  output[endidx] = x10[43];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[52];
+  output[endidx] = x10[11];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[12];
+  output[endidx] = x10[51];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[44];
+  output[endidx] = x10[19];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[28];
+  output[endidx] = x10[35];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[60];
+  output[endidx] = x10[3];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[2];
+  output[endidx] = x10[61];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[34];
+  output[endidx] = x10[29];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[18];
+  output[endidx] = x10[45];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[50];
+  output[endidx] = x10[13];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[10];
+  output[endidx] = x10[53];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[42];
+  output[endidx] = x10[21];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[26];
+  output[endidx] = x10[37];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[58];
+  output[endidx] = x10[5];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[6];
+  output[endidx] = x10[57];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[38];
+  output[endidx] = x10[25];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[22];
+  output[endidx] = x10[41];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[54];
+  output[endidx] = x10[9];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[14];
+  output[endidx] = x10[49];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[46];
+  output[endidx] = x10[17];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[30];
+  output[endidx] = x10[33];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[62];
+  output[endidx] = x10[1];
+}
+
+void av1_idtx32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
+                         const int col_num) {
+  (void)cos_bit;
+  for (int i = 0; i < 32; i++) {
+    output[i * col_num] = vshlq_n_s32(input[i * col_num], 2);
+  }
+}
+
+static const fwd_transform_1d_neon col_highbd_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct32_new_neon,  // DCT_DCT
+  NULL,                 // ADST_DCT
+  NULL,                 // DCT_ADST
+  NULL,                 // ADST_ADST
+  NULL,                 // FLIPADST_DCT
+  NULL,                 // DCT_FLIPADST
+  NULL,                 // FLIPADST_FLIPADST
+  NULL,                 // ADST_FLIPADST
+  NULL,                 // FLIPADST_ADST
+  av1_idtx32_new_neon,  // IDTX
+  NULL,                 // V_DCT
+  NULL,                 // H_DCT
+  NULL,                 // V_ADST
+  NULL,                 // H_ADST
+  NULL,                 // V_FLIPADST
+  NULL                  // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon row_highbd_txfm8x32_arr[TX_TYPES] = {
+  fdct16x16_neon,  // DCT_DCT
+  NULL,            // ADST_DCT
+  NULL,            // DCT_ADST
+  NULL,            // ADST_ADST
+  NULL,            // FLIPADST_DCT
+  NULL,            // DCT_FLIPADST
+  NULL,            // FLIPADST_FLIPADST
+  NULL,            // ADST_FLIPADST
+  NULL,            // FLIPADST_ADST
+  idtx16x16_neon,  // IDTX
+  NULL,            // V_DCT
+  NULL,            // H_DCT
+  NULL,            // V_ADST
+  NULL,            // H_ADST
+  NULL,            // V_FLIPADST
+  NULL             // H_FLIPADST
+};
+
+void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int32x4_t in[32], out[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x8_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  for (int i = 0; i < 2; i++) {
+    load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
+    col_txfm(in, in, bit, 2);
+    col_txfm_8x8_rounding(in, &v_shift1);
+    transpose_8x8(in, out + i * 16);
+  }
+
+  if (lr_flip) {
+    flip_buf_neon(in, out, 32);
+    row_txfm(in, out, bit, 2);
+  } else {
+    row_txfm(out, out, bit, 2);
+  }
+
+  for (int i = 0; i < 2; i++) {
+    transpose_8x8(out + i * 16, in);
+    av1_round_shift_rect_array_32_neon(in, in, 16, -shift[2], NewSqrt2);
+    write_buffer_16x8(in, coeff + i * 8, 16);
+  }
+}
+
+void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int32x4_t in[32], out[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x16_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x8_arr[tx_type];
+  int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+  col_txfm(in, in, bit, 2);
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_8x16_rounding(in, &v_shift1);
+  transpose_8x8(in, out);
+  transpose_8x8(in + 16, out + 16);
+
+  for (int i = 0; i < 2; i++) {
+    row_txfm(out + i * 16, out, bit, 2);
+    transpose_8x8(out, in);
+    av1_round_shift_rect_array_32_neon(in, in, 16, -shift[2], NewSqrt2);
+    write_buffer_8x8(in, coeff + i * 64);
+  }
+}
+
+static INLINE void transpose_8nx8n(const int32x4_t *input, int32x4_t *output,
+                                   const int width, const int height) {
+  const int numcol = height >> 2;
+  const int numrow = width >> 2;
+  for (int j = 0; j < numrow; j++) {
+    for (int i = 0; i < numcol; i++) {
+      TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
+                    input[i * width + j + (numrow * 1)],
+                    input[i * width + j + (numrow * 2)],
+                    input[i * width + j + (numrow * 3)],
+                    output[j * height + i + (numcol * 0)],
+                    output[j * height + i + (numcol * 1)],
+                    output[j * height + i + (numcol * 2)],
+                    output[j * height + i + (numcol * 3)]);
+    }
+  }
+}
+
+void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int32x4_t in[16];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
+  const int txw_idx = get_txw_idx(TX_4X16);
+  const int txh_idx = get_txh_idx(TX_4X16);
+  const int txfm_size_col = tx_size_wide[TX_4X16];
+  const int txfm_size_row = tx_size_high[TX_4X16];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x16_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x4_arr[tx_type];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // col transform
+  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  load_buffer_4x16(input, in, stride, ud_flip, lr_flip, &v_shift0);
+  col_txfm(in, outcoeff128, bitcol, 1);
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_8x8_rounding(outcoeff128, &v_shift1);
+  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < txfm_size_col; i++) {
+    row_txfm(in + i, outcoeff128 + i * txfm_size_col, bitrow, txfm_size_col);
+  }
+}
+
+void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int32x4_t in[16];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
+  const int txw_idx = get_txw_idx(TX_16X4);
+  const int txh_idx = get_txh_idx(TX_16X4);
+  const int txfm_size_col = tx_size_wide[TX_16X4];
+  const int txfm_size_row = tx_size_high[TX_16X4];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x4_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // col transform
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  load_buffer_16x4(input, in, stride, ud_flip, lr_flip, &v_shift0);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    col_txfm(in + i * txfm_size_row, outcoeff128 + i * txfm_size_row, bitcol,
+             1);
+  }
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_8x8_rounding(outcoeff128, &v_shift1);
+
+  // row transform
+  row_txfm(outcoeff128, in, bitrow, 1);
+  transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col);
+}
+
+void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int32x4_t in[128];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
+  const int txw_idx = get_txw_idx(TX_16X32);
+  const int txh_idx = get_txh_idx(TX_16X32);
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x32_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x32_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+  // column transform
+  load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+  load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);
+
+  for (int i = 0; i < 4; i++) {
+    col_txfm((in + i), (in + i), bitcol, 4);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(&in[0], &v_shift);
+  col_txfm_16x16_rounding(&in[64], &v_shift);
+  transpose_8nx8n(in, outcoef128, 16, 32);
+
+  // row transform
+  row_txfm(outcoef128, in, bitrow, 8);
+  transpose_8nx8n(in, outcoef128, 32, 16);
+  av1_round_shift_rect_array_32_neon(outcoef128, outcoef128, 128, -shift[2],
+                                     NewSqrt2);
+}
+
+void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)tx_type;
+  (void)bd;
+
+  int32x4_t in[512];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
+  const int txw_idx = get_txw_idx(TX_32X64);
+  const int txh_idx = get_txh_idx(TX_32X64);
+  const int txfm_size_col = tx_size_wide[TX_32X64];
+  const int txfm_size_row = tx_size_high[TX_32X64];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int num_row = txfm_size_row >> 2;
+  const int num_col = txfm_size_col >> 2;
+
+  // column transform
+  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
+  for (int i = 0; i < num_col; i++) {
+    av1_fdct64_new_neon((in + i), (in + i), bitcol, num_col, num_col);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  for (int i = 0; i < num_col; i++) {
+    col_txfm_16x16_rounding((in + i * txfm_size_row), &v_shift);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < num_row; i++) {
+    av1_fdct32_new_neon((outcoef128 + i), (in + i), bitrow, num_row);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
+  av1_round_shift_rect_array_32_neon(outcoef128, outcoef128, 512, -shift[2],
+                                     NewSqrt2);
+}
+
+void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)tx_type;
+  int32x4_t in[512];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
+  const int txw_idx = get_txw_idx(TX_64X32);
+  const int txh_idx = get_txh_idx(TX_64X32);
+  const int txfm_size_col = tx_size_wide[TX_64X32];
+  const int txfm_size_row = tx_size_high[TX_64X32];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int num_row = txfm_size_row >> 2;
+  const int num_col = txfm_size_col >> 2;
+
+  // column transform
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  for (int i = 0; i < 32; i++) {
+    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0,
+                    &v_shift0);
+    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
+                    &v_shift0);
+    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
+                    &v_shift0);
+    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
+                    &v_shift0);
+  }
+
+  for (int i = 0; i < num_col; i++) {
+    av1_fdct32_new_neon((in + i), (in + i), bitcol, num_col);
+  }
+
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  for (int i = 0; i < num_row; i++) {
+    col_txfm_16x16_rounding((in + i * txfm_size_col), &v_shift1);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < num_row; i++) {
+    av1_fdct64_new_neon((outcoef128 + i), (in + i), bitrow, num_row, num_row);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col >> 1);
+  av1_round_shift_rect_array_32_neon(outcoef128, outcoef128, 512 >> 1,
+                                     -shift[2], NewSqrt2);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  int32x4_t in[128];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const fwd_transform_1d_neon col_txfm = row_highbd_txfm8x32_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = col_highbd_txfm8x32_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+  // column transform
+  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
+  col_txfm(in, in, bitcol, 8);
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(&in[0], &v_shift);
+  col_txfm_16x16_rounding(&in[64], &v_shift);
+  transpose_8nx8n(in, outcoef128, 32, 16);
+
+  // row transform
+  for (int i = 0; i < 4; i++) {
+    row_txfm((outcoef128 + i), (in + i), bitrow, 4);
+  }
+  transpose_8nx8n(in, outcoef128, 16, 32);
+  av1_round_shift_rect_array_32_neon(outcoef128, outcoef128, 128, -shift[2],
+                                     NewSqrt2);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  int32x4_t in[64];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+  const int txw_idx = get_txw_idx(TX_8X32);
+  const int txh_idx = get_txh_idx(TX_8X32);
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x32_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm32x8_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+  const int txfm_size_col = tx_size_wide[TX_8X32];
+  const int txfm_size_row = tx_size_high[TX_8X32];
+  const int num_col = txfm_size_col >> 2;
+
+  // column transform
+  load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
+  load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
+                   stride, 0, 0, shift[0]);
+
+  for (int i = 0; i < num_col; i++) {
+    col_txfm((in + i), (in + i), bitcol, num_col);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(in, &v_shift);
+  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < txfm_size_col; i += 2) {
+    row_txfm((outcoef128 + i), (in + i), bitrow, txfm_size_col);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  int32x4_t in[64];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+  const int txw_idx = get_txw_idx(TX_32X8);
+  const int txh_idx = get_txh_idx(TX_32X8);
+  const fwd_transform_1d_neon col_txfm = row_highbd_txfm32x8_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = col_highbd_txfm8x32_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+  const int txfm_size_col = tx_size_wide[TX_32X8];
+  const int txfm_size_row = tx_size_high[TX_32X8];
+  const int num_col = txfm_size_row >> 2;
+
+  // column transform
+  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
+  for (int i = 0; i < txfm_size_row; i += 2) {
+    col_txfm((in + i), (in + i), bitcol, txfm_size_row);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(&in[0], &v_shift);
+  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < num_col; i++) {
+    row_txfm((outcoef128 + i), (in + i), bitrow, num_col);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  int32x4_t in[8];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+  const int txw_idx = get_txw_idx(TX_4X8);
+  const int txh_idx = get_txh_idx(TX_4X8);
+  const int txfm_size_col = tx_size_wide[TX_4X8];
+  const int txfm_size_row = tx_size_high[TX_4X8];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x8_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x4_arr[tx_type];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  load_buffer_4x8(input, in, stride, ud_flip, lr_flip, &v_shift0);
+  col_txfm(in, in, bitcol, 1);
+  int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_4x8_rounding(in, &v_shift1);
+  transpose_8nx8n(in, outcoeff128, txfm_size_col, txfm_size_row);
+
+  for (int i = 0; i < 2; i++) {
+    row_txfm(outcoeff128 + i, in + i * txfm_size_col, bitrow, 2);
+  }
+  av1_round_shift_rect_array_32_neon(in, outcoeff128, txfm_size_row, -shift[2],
+                                     NewSqrt2);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  int32x4_t in[8];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+  const int txw_idx = get_txw_idx(TX_8X4);
+  const int txh_idx = get_txh_idx(TX_8X4);
+  const int txfm_size_col = tx_size_wide[TX_8X4];
+  const int txfm_size_row = tx_size_high[TX_8X4];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x4_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x8_arr[tx_type];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // col tranform
+  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  load_buffer_8x4(input, in, stride, ud_flip, lr_flip, &v_shift0);
+  for (int i = 0; i < 2; i++) {
+    col_txfm(in + i * txfm_size_row, in + i * txfm_size_row, bitcol, 1);
+  }
+  int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_4x8_rounding(in, &v_shift1);
+
+  // row tranform
+  row_txfm(in, outcoeff128, bitrow, 1);
+  av1_round_shift_rect_array_32_neon(outcoeff128, in, txfm_size_col, -shift[2],
+                                     NewSqrt2);
+  transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  int32x4_t in[256];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
+  const int txw_idx = get_txw_idx(TX_16X64);
+  const int txh_idx = get_txh_idx(TX_16X64);
+  const int txfm_size_col = tx_size_wide[TX_16X64];
+  const int txfm_size_row = tx_size_high[TX_16X64];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int num_col = txfm_size_col >> 2;
+  // col tranform
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  for (int i = 0; i < txfm_size_row; i += num_col) {
+    load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
+                    ud_flip, lr_flip, &v_shift0);
+  }
+
+  for (int i = 0; i < num_col; i++) {
+    av1_fdct64_new_neon(in + i, outcoeff128 + i, bitcol, num_col, num_col);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(outcoeff128, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 64, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 128, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 192, &v_shift);
+
+  transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
+  fdct16x16_neon(in, in, bitrow, 8);
+  transpose_8nx8n(in, outcoeff128, 32, txfm_size_col);
+  memset(coeff + txfm_size_col * 32, 0, txfm_size_col * 32 * sizeof(*coeff));
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  int32x4_t in[256];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
+  const int txw_idx = get_txw_idx(TX_64X16);
+  const int txh_idx = get_txh_idx(TX_64X16);
+  const int txfm_size_col = tx_size_wide[TX_64X16];
+  const int txfm_size_row = tx_size_high[TX_64X16];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // col tranform
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  for (int i = 0; i < txfm_size_row; i++) {
+    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
+                    ud_flip, lr_flip, &v_shift0);
+  }
+
+  fdct16x16_neon(in, outcoeff128, bitcol, txfm_size_row);
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(outcoeff128, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 64, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 128, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 192, &v_shift);
+
+  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+  for (int i = 0; i < 4; i++) {
+    av1_fdct64_new_neon(in + i, in + i, bitrow, 4, 4);
+  }
+  transpose_8nx8n(in, outcoeff128, txfm_size_row, 32);
+  (void)bd;
+}
+
+static void fdct64_new_neon(int32x4_t *input, int32x4_t *output,
+                            const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 64;
+  const int num_per_128 = 4;
+  int col_num = txfm_size / num_per_128;
+  (void)stage_range;
+  for (int col = 0; col < col_num; col++) {
+    av1_fdct64_new_neon((input + col), (output + col), cos_bit, col_num,
+                        col_num);
+  }
+}
+
+static void fdct32_new_neon(int32x4_t *input, int32x4_t *output,
+                            const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 32;
+  const int num_per_128 = 4;
+  int col_num = txfm_size / num_per_128;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    av1_fdct32_new_neon((input + col), (output + col), cos_bit, col_num);
+  }
+}
+
+static void idtx32x32_neon(int32x4_t *input, int32x4_t *output,
+                           const int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+
+  for (int i = 0; i < 8; i++) {
+    av1_idtx32_new_neon(&input[i * 32], &output[i * 32], cos_bit, 1);
+  }
+}
+
+typedef void (*TxfmFuncNEON)(int32x4_t *input, int32x4_t *output,
+                             const int8_t cos_bit, const int8_t *stage_range);
+
+static INLINE TxfmFuncNEON fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT32: return fdct32_new_neon; break;
+    case TXFM_TYPE_DCT64: return fdct64_new_neon; break;
+    case TXFM_TYPE_IDENTITY32: return idtx32x32_neon; break;
+    default: assert(0);
+  }
+  return NULL;
+}
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+  int r, c;
+  for (r = 0; r < txfm1d_size; r++) {
+    for (c = 0; c < txfm1d_size; c++) {
+      output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+    }
+  }
+}
+
+static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
+                                                 int32x4_t *output,
+                                                 const int size,
+                                                 const int bit) {
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  for (int i = 0; i < size; i++) output[i] = vrshlq_s32(input[i], v_bit);
+}
+
+static INLINE void transpose_32_4x4(int stride, const int32x4_t *input,
+                                    int32x4_t *output) {
+  int32x4x2_t temp01 = vzipq_s32(input[0 * stride], input[2 * stride]);
+  int32x4x2_t temp23 = vzipq_s32(input[1 * stride], input[3 * stride]);
+
+  const int32x4x2_t output01 = vzipq_s32(temp01.val[0], temp23.val[0]);
+  const int32x4x2_t output23 = vzipq_s32(temp01.val[1], temp23.val[1]);
+
+  output[0 * stride] = output01.val[0];
+  output[1 * stride] = output01.val[1];
+  output[2 * stride] = output23.val[0];
+  output[3 * stride] = output23.val[1];
+}
+
+static INLINE void transpose_32(int txfm_size, const int32x4_t *input,
+                                int32x4_t *output) {
+  const int num_per_128 = 4;
+  const int row_size = txfm_size;
+  const int col_size = txfm_size / num_per_128;
+  int r, c;
+
+  // transpose each 4x4 block internally
+  for (r = 0; r < row_size; r += 4) {
+    for (c = 0; c < col_size; c++) {
+      transpose_32_4x4(col_size, &input[r * col_size + c],
+                       &output[c * 4 * col_size + r / 4]);
+    }
+  }
+}
+
+static INLINE void fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+                                         const int stride,
+                                         const TXFM_2D_FLIP_CFG *cfg,
+                                         int32_t *txfm_buf) {
+  assert(cfg->tx_size < TX_SIZES);
+  const int txfm_size = tx_size_wide[cfg->tx_size];
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncNEON txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  int32x4_t *buf_128 = (int32x4_t *)txfm_buf;
+  int32x4_t *out_128 = (int32x4_t *)output;
+
+  const int num_per_128 = 4;
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+  int col_num = txfm_size / num_per_128;
+
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+                                                        txfm_size);
+  /*col wise transform*/
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+
+  /*row wise transform*/
+  for (int col = 0; col < (col_num >> 1); col++) {
+    av1_fdct64_new_neon((buf_128 + col), (out_128 + col), cos_bit_row, col_num,
+                        (col_num >> 1));
+  }
+
+  txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+  av1_round_shift_array_32_neon(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_8nx8n(buf_128, out_128, 32, 32);
+}
+
+static INLINE void fwd_txfm2d_neon(const int16_t *input, int32_t *output,
+                                   const int stride,
+                                   const TXFM_2D_FLIP_CFG *cfg,
+                                   int32_t *txfm_buf) {
+  assert(cfg->tx_size < TX_SIZES);
+  const int txfm_size = tx_size_wide[cfg->tx_size];
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->stage_range_row;
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncNEON txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFuncNEON txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+  int32x4_t *buf_128 = (int32x4_t *)txfm_buf;
+  int32x4_t *out_128 = (int32x4_t *)output;
+  int num_per_128 = 4;
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+                                                        txfm_size);
+  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[0]);
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+  txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+  av1_round_shift_array_32_neon(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_32(txfm_size, buf_128, out_128);
+}
+
+void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+  (void)bd;
+  fwd_txfm2d_neon(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+  (void)bd;
+  fwd_txfm2d_64x64_neon(input, output, stride, &cfg, txfm_buf);
+}
diff --git a/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
new file mode 100644
index 0000000..0ad1131
--- /dev/null
+++ b/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
+  int32x4x2_t b0 =
+      vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+  int16x4x2_t c0 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+  int16x4x2_t c1 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+  out[0] = c0.val[0];
+  out[1] = c0.val[1];
+  out[2] = c1.val[0];
+  out[3] = c1.val[1];
+}
+
+void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) {
+  // Load the 4x4 source in transposed form.
+  int16x4_t a1, b1, c1, d1, e;
+  a1 = vld1_s16(&input[0]);
+  b1 = vld1_s16(&input[1 * stride]);
+  c1 = vld1_s16(&input[2 * stride]);
+  d1 = vld1_s16(&input[3 * stride]);
+
+  // WHT.
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  int16x8_t x[2];
+  x[0] = vcombine_s16(a1, c1);
+  x[1] = vcombine_s16(d1, b1);
+
+  int16x4_t s[4];
+  transpose4x4(x, s);
+
+  a1 = s[0];
+  b1 = s[1];
+  c1 = s[2];
+  d1 = s[3];
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  x[0] = vcombine_s16(a1, c1);
+  x[1] = vcombine_s16(d1, b1);
+
+  transpose4x4(x, s);
+
+  vst1q_s32(&output[0], vshll_n_s16(s[0], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[4], vshll_n_s16(s[1], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[8], vshll_n_s16(s[2], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[12], vshll_n_s16(s[3], UNIT_QUANT_SHIFT));
+}
+
+void av1_highbd_fwht4x4_neon(const int16_t *input, tran_low_t *output,
+                             int stride) {
+  av1_fwht4x4_neon(input, output, stride);
+}
diff --git a/av1/encoder/arm/neon/ml_neon.c b/av1/encoder/arm/neon/ml_neon.c
new file mode 100644
index 0000000..fcff3a9
--- /dev/null
+++ b/av1/encoder/arm/neon/ml_neon.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+
+static void nn_activate8(float32x4_t *out_h, float32x4_t *out_l,
+                         const float32x4_t *zero) {
+  *out_h = vmaxq_f32(*out_h, *zero);
+  *out_l = vmaxq_f32(*out_l, *zero);
+}
+
+static void nn_activate4(float32x4_t *x, const float32x4_t *zero) {
+  *x = vmaxq_f32(*x, *zero);
+}
+
+#define CLAMP_0(x) (x = x > 0 ? x : 0)
+
+static void nn_propagate_8to1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t vadd = zero;
+  float total = *layer_bias;
+
+  for (int in = 0; in < num_inputs; in += 8) {
+    const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+    const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+    const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+    const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+    vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+    vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+  }
+#if defined(__aarch64__)
+  total += vaddvq_f32(vadd);
+#else
+  float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+  vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+  total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+  if (!output_layer) CLAMP_0(total);
+  *output_nodes = total;
+}
+
+static void nn_propagate_xto1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes) {
+  float32x4_t vadd = vdupq_n_f32(0);
+
+  float total = *layer_bias;
+  int j = num_inputs;
+  int in = 0;
+  while (j > 7) {
+    const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+    const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+    const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+    const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+    vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+    vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+    in += 8;
+    j -= 8;
+  }
+
+#if defined(__aarch64__)
+  total += vaddvq_f32(vadd);
+
+#else
+  float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+  vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+  total += vget_lane_f32(vadd_lo, 0);
+#endif
+  for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+  *output_nodes = CLAMP_0(total);
+}
+
+static void nn_propagate_xsto1(int num_inputs, const float *const inputs,
+                               const float *const weights,
+                               const float *layer_bias,
+                               float *const output_nodes) {
+  float total = *layer_bias;
+#if defined(__aarch64__)
+  const float32x4_t v_inputs = vld1q_f32(inputs);
+  const float32x4_t v_weights = vld1q_f32(weights);
+  const float32x4_t vadd = vmulq_f32(v_inputs, v_weights);
+  total += vaddvq_f32(vadd);
+  int in = 4;
+#else
+  int in = 0;
+#endif
+  for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+  *output_nodes = CLAMP_0(total);
+}
+
+static void nn_propagate_4to1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t vadd = zero;
+  float total = *layer_bias;
+
+  for (int in = 0; in < num_inputs; in += 4) {
+    const float32x4_t v_inputs = vld1q_f32(&inputs[in]);
+    const float32x4_t v_weights = vld1q_f32(&weights[in]);
+    vadd = vmlaq_f32(vadd, v_inputs, v_weights);
+  }
+
+#if defined(__aarch64__)
+  total += vaddvq_f32(vadd);
+#else
+  float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+  vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+  total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+  if (!output_layer) CLAMP_0(total);
+  *output_nodes = total;
+}
+
+static void nn_propagate_4to4(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  float32x4_t outputs = vld1q_f32(layer_bias);
+  const float32x4_t zero = vdupq_n_f32(0);
+
+  float32x4_t mul0[2] = { zero, zero };
+  float32x4_t mul1[2] = { zero, zero };
+  for (int in = 0; in < num_inputs; in += 4) {
+    const float32x4_t v_input = vld1q_f32(&inputs[in]);
+
+    for (int i = 0; i < 2; i++) {
+      const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]);
+      mul0[i] = vmlaq_f32(mul0[i], weight0, v_input);
+      const float32x4_t weight1 =
+          vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]);
+      mul1[i] = vmlaq_f32(mul1[i], weight1, v_input);
+    }
+  }
+  for (int i = 0; i < 2; i++)
+#if defined(__aarch64__)
+    mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
+  const float32x4_t hh = vpaddq_f32(mul0[0], mul0[1]);
+#else
+    mul0[i] =
+        vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])),
+                     vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i])));
+  const float32x4_t hh =
+      vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])),
+                   vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1])));
+#endif
+
+  outputs = vaddq_f32(outputs, hh);
+  if (!output_layer) nn_activate4(&outputs, &zero);
+  vst1q_f32(output_nodes, outputs);
+}
+
+static void nn_propagate_4to8(const int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  float32x4_t out_h = vld1q_f32(&layer_bias[4]);
+  float32x4_t out_l = vld1q_f32(layer_bias);
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t mul0[4] = { zero, zero, zero, zero };
+  float32x4_t mul1[4] = { zero, zero, zero, zero };
+
+  for (int in = 0; in < num_inputs; in += 4) {
+    const float32x4_t v_input = vld1q_f32(&inputs[in]);
+    for (int i = 0; i < 4; i++) {
+      const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]);
+      const float32x4_t weight1 =
+          vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]);
+      mul0[i] = vmlaq_f32(mul0[i], v_input, weight0);
+      mul1[i] = vmlaq_f32(mul1[i], v_input, weight1);
+    }
+  }
+  for (int i = 0; i < 4; i++)
+#if defined(__aarch64__)
+    mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
+  const float32x4_t hh0 = vpaddq_f32(mul0[0], mul0[1]);
+  const float32x4_t hh1 = vpaddq_f32(mul0[2], mul0[3]);
+#else
+    mul0[i] =
+        vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])),
+                     vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i])));
+  const float32x4_t hh0 =
+      vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])),
+                   vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1])));
+  const float32x4_t hh1 =
+      vcombine_f32(vpadd_f32(vget_low_f32(mul0[2]), vget_high_f32(mul0[2])),
+                   vpadd_f32(vget_low_f32(mul0[3]), vget_high_f32(mul0[3])));
+#endif
+
+  out_h = vaddq_f32(out_h, hh1);
+  out_l = vaddq_f32(out_l, hh0);
+
+  if (!output_layer) nn_activate8(&out_h, &out_l, &zero);
+  vst1q_f32(&output_nodes[4], out_h);
+  vst1q_f32(output_nodes, out_l);
+}
+
+static void nn_propagate_8to4(const int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  float32x4_t outputs = vld1q_f32(layer_bias);
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t add[4] = { zero, zero, zero, zero };
+  for (int in = 0; in < num_inputs; in += 8) {
+    const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+    const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+
+    for (int i = 0; i < 4; i++) {
+      const float32x4_t weight_l = vld1q_f32(&weights[in + i * num_inputs]);
+      const float32x4_t weight_h = vld1q_f32(&weights[in + i * num_inputs + 4]);
+      add[i] = vmlaq_f32(add[i], inputs_l, weight_l);
+      add[i] = vmlaq_f32(add[i], inputs_h, weight_h);
+    }
+  }
+#if defined(__aarch64__)
+  const float32x4_t hadd_h = vpaddq_f32(add[2], add[3]);
+  const float32x4_t hadd_l = vpaddq_f32(add[0], add[1]);
+  const float32x4_t haddhadd = vpaddq_f32(hadd_l, hadd_h);
+#else
+  const float32x4_t hadd_h =
+      vcombine_f32(vpadd_f32(vget_low_f32(add[2]), vget_high_f32(add[2])),
+                   vpadd_f32(vget_low_f32(add[3]), vget_high_f32(add[3])));
+  const float32x4_t hadd_l =
+      vcombine_f32(vpadd_f32(vget_low_f32(add[0]), vget_high_f32(add[0])),
+                   vpadd_f32(vget_low_f32(add[1]), vget_high_f32(add[1])));
+  const float32x4_t haddhadd =
+      vcombine_f32(vpadd_f32(vget_low_f32(hadd_l), vget_high_f32(hadd_l)),
+                   vpadd_f32(vget_low_f32(hadd_h), vget_high_f32(hadd_h)));
+#endif
+
+  outputs = vaddq_f32(outputs, haddhadd);
+  if (!output_layer) nn_activate4(&outputs, &zero);
+  vst1q_f32(output_nodes, outputs);
+}
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_neon(const float *input_nodes,
+                         const NN_CONFIG *const nn_config, int reduce_prec,
+                         float *const output) {
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  int buf_index = 0;
+  int num_inputs = nn_config->num_inputs;
+  // Hidden layers, except the final iteration is the output layer.
+  for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+    const float *layer_weights = nn_config->weights[layer];
+    const float *layer_bias = nn_config->bias[layer];
+    bool output_layer = (layer == nn_config->num_hidden_layers);
+    float *const output_nodes = output_layer ? output : buf[buf_index];
+    const int num_outputs = output_layer ? nn_config->num_outputs
+                                         : nn_config->num_hidden_nodes[layer];
+
+    if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+      for (int out = 0; out < num_outputs; out += 8) {
+        nn_propagate_4to8(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out += 4) {
+        nn_propagate_8to4(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out += 4) {
+        nn_propagate_4to4(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 8 == 0) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_8to1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_4to1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs > 8) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_xto1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out]);
+      }
+    } else if (num_inputs >= 4) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_xsto1(num_inputs, input_nodes,
+                           &layer_weights[out * num_inputs], &layer_bias[out],
+                           &output_nodes[out]);
+      }
+    } else {
+      for (int node = 0; node < num_outputs; ++node) {
+        float val = layer_bias[node];
+        for (int i = 0; i < num_inputs; ++i)
+          val += layer_weights[node * num_inputs + i] * input_nodes[i];
+        // ReLU as activation function.
+        val = val > 0.0f ? val : 0.0f;  // Could use AOMMAX().
+        output_nodes[node] = val;
+      }
+    }
+    input_nodes = output_nodes;
+    num_inputs = num_outputs;
+    buf_index = 1 - buf_index;
+  }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
diff --git a/av1/encoder/arm/neon/picksrt_neon.c b/av1/encoder/arm/neon/picksrt_neon.c
new file mode 100644
index 0000000..34595e8
--- /dev/null
+++ b/av1/encoder/arm/neon/picksrt_neon.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <math.h>
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/restoration.h"
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+int64_t av1_lowbd_pixel_proj_error_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int i, j, k;
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const int32x4_t zero = vdupq_n_s32(0);
+  uint64x2_t sum64 = vreinterpretq_u64_s32(zero);
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+
+  int64_t err = 0;
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    for (i = 0; i < height; ++i) {
+      int32x4_t err0 = zero;
+      for (j = 0; j <= width - 8; j += 8) {
+        const uint8x8_t d0 = vld1_u8(&dat[j]);
+        const uint8x8_t s0 = vld1_u8(&src[j]);
+        const int16x8_t flt0_16b =
+            vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])),
+                         vqmovn_s32(vld1q_s32(&flt0[j + 4])));
+        const int16x8_t flt1_16b =
+            vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])),
+                         vqmovn_s32(vld1q_s32(&flt1[j + 4])));
+        const int16x8_t u0 =
+            vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS));
+        const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0);
+        const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, u0);
+        const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u);
+        const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u);
+        const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u);
+        const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u);
+
+        int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]);
+        v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]);
+        int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]);
+        v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]);
+        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
+        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
+        const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1),
+                                       vreinterpretq_s16_u16(vsubl_u8(d0, s0)));
+        const int16x4_t e0_lo = vget_low_s16(e0);
+        const int16x4_t e0_hi = vget_high_s16(e0);
+        err0 = vmlal_s16(err0, e0_lo, e0_lo);
+        err0 = vmlal_s16(err0, e0_hi, e0_hi);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
+        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
+    }
+
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+    for (i = 0; i < height; ++i) {
+      int32x4_t err0 = zero;
+      for (j = 0; j <= width - 8; j += 8) {
+        const uint8x8_t d0 = vld1_u8(&dat[j]);
+        const uint8x8_t s0 = vld1_u8(&src[j]);
+        const uint16x8_t d0s0 = vsubl_u8(d0, s0);
+        const uint16x8x2_t d0w =
+            vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero));
+
+        const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]);
+        const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]);
+
+        int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active);
+        v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]),
+                         xq_active << SGRPROJ_RST_BITS);
+        int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active);
+        v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]),
+                         xq_active << SGRPROJ_RST_BITS);
+        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
+        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
+        const int16x8_t e0 =
+            vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0));
+        const int16x4_t e0_lo = vget_low_s16(e0);
+        const int16x4_t e0_hi = vget_high_s16(e0);
+        err0 = vmlal_s16(err0, e0_lo, e0_lo);
+        err0 = vmlal_s16(err0, e0_hi, e0_hi);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
+        int32_t v = xq_active * (flt[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
+    }
+  } else {
+    uint32x4_t err0 = vreinterpretq_u32_s32(zero);
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j <= width - 16; j += 16) {
+        const uint8x16_t d = vld1q_u8(&dat[j]);
+        const uint8x16_t s = vld1q_u8(&src[j]);
+        const uint8x16_t diff = vabdq_u8(d, s);
+        const uint8x8_t diff0 = vget_low_u8(diff);
+        const uint8x8_t diff1 = vget_high_u8(diff);
+        err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0));
+        err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1));
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t e = dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+    }
+    sum64 = vpaddlq_u32(err0);
+  }
+#if defined(__aarch64__)
+  err += vaddvq_u64(sum64);
+#else
+  err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0);
+#endif  // __aarch64__
+  return err;
+}
diff --git a/av1/encoder/arm/neon/quantize_neon.c b/av1/encoder/arm/neon/quantize_neon.c
index c2f50a2..4eadbbc 100644
--- a/av1/encoder/arm/neon/quantize_neon.c
+++ b/av1/encoder/arm/neon/quantize_neon.c
@@ -213,3 +213,892 @@
   }
 #endif  // __aarch64__
 }
+
+void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 1;
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+  int16x8_t round = vdupq_n_s16(rounding[1]);
+  int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+  dequant = vsetq_lane_s16(dequant_ptr[0], dequant, 0);
+
+  int16x8_t coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+
+  int16x8_t abs = vabsq_s16(coeff);
+  uint16x8_t check = vcgeq_s16(abs, vshrq_n_s16(dequant, 2));
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(check)), 0);
+  if (nz_check) {
+    const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    round = vsetq_lane_s16(rounding[0], round, 0);
+    quant = vsetq_lane_s16(quant_ptr[0], quant, 0);
+
+    abs = vqaddq_s16(abs, round);
+    int16x8_t temp = vqdmulhq_s16(abs, quant);
+    int16x8_t qcoeff_temp = vsubq_s16(veorq_s16(temp, coeff_sign), coeff_sign);
+    abs = vreinterpretq_s16_u16(
+        vshrq_n_u16(vreinterpretq_u16_s16(vmulq_s16(temp, dequant)), 1));
+    int16x8_t dqcoeff_temp = vsubq_s16(veorq_s16(abs, coeff_sign), coeff_sign);
+
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(check, qcoeff_temp, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+    coeff_nz_mask =
+        vbslq_s16(check, dqcoeff_temp, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    round = vsetq_lane_s16(rounding[1], round, 0);
+    quant = vsetq_lane_s16(quant_ptr[1], quant, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(abs, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, check);
+    check = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(check, v_iscan, v_eobmax_76543210);
+  }
+
+  dequant = vsetq_lane_s16(dequant_ptr[1], dequant, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    abs = vabsq_s16(coeff);
+    check = vcgeq_s16(abs, vshrq_n_s16(dequant, 2));
+
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(check)), 0);
+    if (nz_check) {
+      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+
+      abs = vqaddq_s16(abs, round);
+      int16x8_t temp = vqdmulhq_s16(abs, quant);
+      int16x8_t qcoeff_temp =
+          vsubq_s16(veorq_s16(temp, coeff_sign), coeff_sign);
+      abs = vreinterpretq_s16_u16(
+          vshrq_n_u16(vreinterpretq_u16_s16(vmulq_s16(temp, dequant)), 1));
+      int16x8_t dqcoeff_temp =
+          vsubq_s16(veorq_s16(abs, coeff_sign), coeff_sign);
+
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(check, qcoeff_temp, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+      coeff_nz_mask = vbslq_s16(check, dqcoeff_temp,
+                                load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(abs, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, check);
+      check = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(check, v_iscan, v_eobmax_76543210);
+    }
+  }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210) + 1;
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
+  }
+#endif  // __aarch64__
+}
+
+void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 2;
+  const int16x8_t v_log_scale =
+      vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));
+
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+  int16x8_t round = vdupq_n_s16(rounding[1]);
+  int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+  dequant = vsetq_lane_s16(dequant_ptr[0], dequant, 0);
+
+  int16x8_t coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t abs = vabsq_s16(coeff);
+  uint16x8_t check = vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(abs), 1),
+                               vshrq_n_u16(vreinterpretq_u16_s16(dequant), 2));
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(check)), 0);
+  if (nz_check) {
+    const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    round = vsetq_lane_s16(rounding[0], round, 0);
+    quant = vsetq_lane_s16(quant_ptr[0], quant, 0);
+    abs = vqaddq_s16(abs, round);
+    int16x8_t temp =
+        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(abs, quant), 1),
+                  vreinterpretq_s16_u16(vshrq_n_u16(
+                      vreinterpretq_u16_s16(vmulq_s16(abs, quant)), 14)));
+    int16x8_t qcoeff_temp = vsubq_s16(veorq_s16(temp, coeff_sign), coeff_sign);
+
+    abs = vreinterpretq_s16_u16(vshlq_u16(
+        vreinterpretq_u16_s16(vmulq_s16(temp, dequant)), v_log_scale));
+    abs = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(temp, dequant), 13), abs);
+    int16x8_t dqcoeff_temp = vsubq_s16(veorq_s16(abs, coeff_sign), coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(check, qcoeff_temp, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+    coeff_nz_mask =
+        vbslq_s16(check, dqcoeff_temp, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    round = vsetq_lane_s16(rounding[1], round, 0);
+    quant = vsetq_lane_s16(quant_ptr[1], quant, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(abs, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, check);
+    check = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(check, v_iscan, v_eobmax_76543210);
+  }
+
+  dequant = vsetq_lane_s16(dequant_ptr[1], dequant, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    abs = vabsq_s16(coeff);
+    check = vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(abs), 1),
+                      vshrq_n_u16(vreinterpretq_u16_s16(dequant), 2));
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(check)), 0);
+    if (nz_check) {
+      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      abs = vqaddq_s16(abs, round);
+      int16x8_t temp =
+          vorrq_s16(vshlq_n_s16(vqdmulhq_s16(abs, quant), 1),
+                    vreinterpretq_s16_u16(vshrq_n_u16(
+                        vreinterpretq_u16_s16(vmulq_s16(abs, quant)), 14)));
+
+      int16x8_t qcoeff_temp =
+          vsubq_s16(veorq_s16(temp, coeff_sign), coeff_sign);
+
+      abs = vreinterpretq_s16_u16(vshlq_u16(
+          vreinterpretq_u16_s16(vmulq_s16(temp, dequant)), v_log_scale));
+      abs = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(temp, dequant), 13), abs);
+
+      int16x8_t dqcoeff_temp =
+          vsubq_s16(veorq_s16(abs, coeff_sign), coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(check, qcoeff_temp, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+      coeff_nz_mask = vbslq_s16(check, dqcoeff_temp,
+                                load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(abs, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, check);
+
+      check = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(check, v_iscan, v_eobmax_76543210);
+    }
+  }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210) + 1;
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
+  }
+#endif  // __aarch64__
+}
+
+void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+
+  uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+    int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+    vcond = vcgeq_s16(v_abs, vzbins);
+
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+      int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+
+      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210) + 1;
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
+  }
+#endif  // __aarch64__
+}
+
+#define QM_MULL_SHIFT(x0, x1)                                              \
+  vreinterpretq_s16_u16(vorrq_u16(                                         \
+      vreinterpretq_u16_s16(vshlq_n_s16(                                   \
+          vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
+      vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))
+
+static void aom_quantize_b_helper_16x16_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  (void)scan;
+
+  uint16x8_t vwt, viwt;
+  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+  uint16x8_t vcond;
+  if (qm_ptr == NULL) {
+    vcond = vcgeq_s16(v_abs, vzbins);
+  } else {
+    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+  }
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+    int16x8_t vtmp2;
+    if (qm_ptr == NULL) {
+      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    } else {
+      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+      vtmp2 = vaddq_s16(vtmp2, vtmp);
+    }
+
+    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+    if (iqm_ptr != NULL) {
+      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+      vdequant = QM_MULL_SHIFT(vdequant, viwt);
+    }
+    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+
+    if (qm_ptr == NULL) {
+      vcond = vcgeq_s16(v_abs, vzbins);
+    } else {
+      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+    }
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+      int16x8_t vtmp2;
+      if (qm_ptr == NULL) {
+        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+      } else {
+        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+        vtmp2 = vaddq_s16(vtmp2, vtmp);
+      }
+
+      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+      if (iqm_ptr != NULL) {
+        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+        vdequant = QM_MULL_SHIFT(vdequant, viwt);
+      }
+      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210) + 1;
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
+  }
+#endif  // __aarch64__
+}
+
+static void aom_quantize_b_helper_32x32_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  (void)scan;
+
+  uint16x8_t vwt, viwt;
+  const int log_scale = 1;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+  const int16x8_t v_log_scale = v_eobmax_76543210;
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+  uint16x8_t vcond;
+  if (qm_ptr == NULL) {
+    vcond = vcgeq_s16(v_abs, vzbins);
+  } else {
+    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+  }
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+    int16x8_t vtmp2;
+    if (qm_ptr == NULL) {
+      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    } else {
+      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+      vtmp2 = vaddq_s16(vtmp2, vtmp);
+    }
+
+    vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+    if (iqm_ptr != NULL) {
+      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+      vdequant = QM_MULL_SHIFT(vdequant, viwt);
+    }
+    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+
+    if (qm_ptr == NULL) {
+      vcond = vcgeq_s16(v_abs, vzbins);
+    } else {
+      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+    }
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+      int16x8_t vtmp2;
+      if (qm_ptr == NULL) {
+        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+      } else {
+        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+        vtmp2 = vaddq_s16(vtmp2, vtmp);
+      }
+      vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+      if (iqm_ptr != NULL) {
+        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+        vdequant = QM_MULL_SHIFT(vdequant, viwt);
+      }
+      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210) + 1;
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
+  }
+#endif  // __aarch64__
+}
+
+static void aom_quantize_b_helper_64x64_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  (void)scan;
+
+  uint16x8_t vwt, viwt;
+  const int log_scale = 2;
+  const int16x8_t v_log_scale =
+      vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));
+
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+  int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+  uint16x8_t vcond;
+  if (qm_ptr == NULL) {
+    vcond = vcgeq_s16(v_abs, vzbins);
+  } else {
+    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+  }
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+    int16x8_t vtmp2;
+    if (qm_ptr == NULL) {
+      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    } else {
+      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+      vtmp2 = vaddq_s16(vtmp2, vtmp);
+    }
+
+    int16x8_t ones =
+        vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+    vtmp2 =
+        vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+    if (iqm_ptr != NULL) {
+      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+      vdequant = QM_MULL_SHIFT(vdequant, viwt);
+    }
+    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+    v_deq_abs =
+        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+
+    if (qm_ptr == NULL) {
+      vcond = vcgeq_s16(v_abs, vzbins);
+    } else {
+      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+    }
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+      int16x8_t vtmp2;
+      if (qm_ptr == NULL) {
+        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+      } else {
+        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+        vtmp2 = vaddq_s16(vtmp2, vtmp);
+      }
+
+      int16x8_t ones =
+          vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+      vtmp2 =
+          vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+      if (iqm_ptr != NULL) {
+        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+        vdequant = QM_MULL_SHIFT(vdequant, viwt);
+      }
+      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+      v_deq_abs =
+          vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210) + 1;
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
+  }
+#endif  // __aarch64__
+}
+
+void aom_quantize_b_helper_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  switch (log_scale) {  // log_scale for AV1 encoder can be only 0, 1, 2
+    case 0:
+      aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                       iscan, qm_ptr, iqm_ptr);
+      break;
+    case 1:
+      aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                       iscan, qm_ptr, iqm_ptr);
+      break;
+    case 2:
+      aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                       iscan, qm_ptr, iqm_ptr);
+      break;
+  }
+}
+
+void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             NULL, NULL, 2);
+}
diff --git a/av1/encoder/arm/neon/rdopt_neon.c b/av1/encoder/arm/neon/rdopt_neon.c
new file mode 100644
index 0000000..1786b27
--- /dev/null
+++ b/av1/encoder/arm/neon/rdopt_neon.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/rdopt.h"
+#include "config/av1_rtcd.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+                                          int32x4_t *xy_sum_32,
+                                          int32x4_t *xz_sum_32,
+                                          int32x4_t *x_sum_32,
+                                          int32x4_t *x2_sum_32) {
+  // Pixels in this 4x4   [ a b c d ]
+  // are referred to as:  [ e f g h ]
+  //                      [ i j k l ]
+  //                      [ m n o p ]
+
+  const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride));
+  const int16x4_t pixelsa_2_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16));
+  const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride));
+  const int16x4_t pixelsb_2_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16));
+  const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride));
+  const int16x4_t pixelsa_1_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16));
+  const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride));
+  const int16x4_t pixelsb_1_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16));
+
+  const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli);
+
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli);
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli);
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli);
+
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli);
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli);
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli);
+
+  // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
+  // (sum up every element in slli_a and swap_b)
+  *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a);
+  *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli);
+
+  // Also sum their squares
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli);
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli);
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli);
+}
+
+void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride,
+                                          int width, int height, float *hcorr,
+                                          float *vcorr) {
+  // The following notation is used:
+  // x - current pixel
+  // y - right neighbour pixel
+  // z - below neighbour pixel
+  // w - down-right neighbour pixel
+  int64_t xy_sum = 0, xz_sum = 0;
+  int64_t x_sum = 0, x2_sum = 0;
+  int32x4_t zero = vdupq_n_s32(0);
+  int64x2_t v_x_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero);
+  // Process horizontal and vertical correlations through the body in 4x4
+  // blocks.  This excludes the final row and column and possibly one extra
+  // column depending how 3 divides into width and height
+
+  for (int i = 0; i <= height - 4; i += 3) {
+    int32x4_t xy_sum_32 = zero;
+    int32x4_t xz_sum_32 = zero;
+    int32x4_t x_sum_32 = zero;
+    int32x4_t x2_sum_32 = zero;
+    for (int j = 0; j <= width - 4; j += 3) {
+      horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+                             &xz_sum_32, &x_sum_32, &x2_sum_32);
+    }
+    v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32);
+    v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32);
+    v_x_sum = vpadalq_s32(v_x_sum, x_sum_32);
+    v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32);
+  }
+#if defined(__aarch64__)
+  xy_sum = vaddvq_s64(v_xy_sum);
+  xz_sum = vaddvq_s64(v_xz_sum);
+  x2_sum = vaddvq_s64(v_x2_sum);
+  x_sum = vaddvq_s64(v_x_sum);
+#else
+  xy_sum = vget_lane_s64(
+      vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0);
+  xz_sum = vget_lane_s64(
+      vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0);
+  x2_sum = vget_lane_s64(
+      vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0);
+  x_sum =
+      vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0);
+#endif
+  // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+  int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+  // Do we have 2 rows remaining or just the one?  Note that width and height
+  // are powers of 2, so each modulo 3 must be 1 or 2.
+  if (height % 3 == 1) {  // Just horiz corrs on the final row
+    const int16_t x0 = diff[(height - 1) * stride];
+    x_sum += x0;
+    x_finalrow += x0;
+    x2_sum += x0 * x0;
+    x2_finalrow += x0 * x0;
+    if (width >= 8) {
+      int32x4_t v_y_sum = zero;
+      int32x4_t v_y2_sum = zero;
+      int32x4_t v_xy_sum_a = zero;
+      int k = width - 1;
+      int j = 0;
+      while ((k - 8) > 0) {
+        const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]);
+        const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
+        const int16x4_t v_x_lo = vget_low_s16(v_x);
+        const int16x4_t v_x_hi = vget_high_s16(v_x);
+        const int16x4_t v_y_lo = vget_low_s16(v_y);
+        const int16x4_t v_y_hi = vget_high_s16(v_y);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+        v_y_sum = vpadalq_s16(v_y_sum, v_y);
+        k -= 8;
+        j += 8;
+      }
+
+      const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride] + j);
+      const int16x8_t v_x =
+          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
+                    vreinterpretq_s16_s32(zero), 1);
+      const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
+      const int16x4_t v_x_lo = vget_low_s16(v_x);
+      const int16x4_t v_x_hi = vget_high_s16(v_x);
+      const int16x4_t v_y_lo = vget_low_s16(v_y);
+      const int16x4_t v_y_hi = vget_high_s16(v_y);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+      const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y);
+      const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
+#if defined(__aarch64__)
+      const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
+      xy_sum += vaddvq_s64(v_xy_sum2);
+      const int32_t y = vaddvq_s32(v_y_sum_a);
+      const int64_t y2 = vaddvq_s64(v_y2_sum_a);
+#else
+      xy_sum += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
+      const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a);
+      const int64_t y =
+          vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0);
+      const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum);
+      int64_t y2 = vget_lane_s64(
+          vadd_s64(vget_low_s64(v_y2_sum_b), vget_high_s64(v_y2_sum_b)), 0);
+#endif
+      x_sum += y;
+      x2_sum += y2;
+      x_finalrow += y;
+      x2_finalrow += y2;
+    } else {
+      for (int j = 0; j < width - 1; ++j) {
+        const int16_t x = diff[(height - 1) * stride + j];
+        const int16_t y = diff[(height - 1) * stride + j + 1];
+        xy_sum += x * y;
+        x_sum += y;
+        x2_sum += y * y;
+        x_finalrow += y;
+        x2_finalrow += y * y;
+      }
+    }
+  } else {  // Two rows remaining to do
+    const int16_t x0 = diff[(height - 2) * stride];
+    const int16_t z0 = diff[(height - 1) * stride];
+    x_sum += x0 + z0;
+    x2_sum += x0 * x0 + z0 * z0;
+    x_finalrow += z0;
+    x2_finalrow += z0 * z0;
+    if (width >= 8) {
+      int32x4_t v_y2_sum = zero;
+      int32x4_t v_w2_sum = zero;
+      int32x4_t v_xy_sum_a = zero;
+      int32x4_t v_xz_sum_a = zero;
+      int32x4_t v_x_sum_a = zero;
+      int32x4_t v_w_sum = zero;
+      int k = width - 1;
+      int j = 0;
+      while ((k - 8) > 0) {
+        const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]);
+        const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]);
+        const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]);
+        const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
+
+        const int16x4_t v_x_lo = vget_low_s16(v_x);
+        const int16x4_t v_y_lo = vget_low_s16(v_y);
+        const int16x4_t v_z_lo = vget_low_s16(v_z);
+        const int16x4_t v_w_lo = vget_low_s16(v_w);
+        const int16x4_t v_x_hi = vget_high_s16(v_x);
+        const int16x4_t v_y_hi = vget_high_s16(v_y);
+        const int16x4_t v_z_hi = vget_high_s16(v_z);
+        const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+        v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+        v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+        v_w_sum = vpadalq_s16(v_w_sum, v_w);
+        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+        k -= 8;
+        j += 8;
+      }
+      const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride] + j);
+      const int16x8_t v_x =
+          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
+                    vreinterpretq_s16_s32(zero), 1);
+      const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
+      const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride] + j);
+      const int16x8_t v_z =
+          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7),
+                    vreinterpretq_s16_s32(zero), 1);
+      const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1);
+
+      const int16x4_t v_x_lo = vget_low_s16(v_x);
+      const int16x4_t v_y_lo = vget_low_s16(v_y);
+      const int16x4_t v_z_lo = vget_low_s16(v_z);
+      const int16x4_t v_w_lo = vget_low_s16(v_w);
+      const int16x4_t v_x_hi = vget_high_s16(v_x);
+      const int16x4_t v_y_hi = vget_high_s16(v_y);
+      const int16x4_t v_z_hi = vget_high_s16(v_z);
+      const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+      v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+      v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+      v_w_sum = vpadalq_s16(v_w_sum, v_w);
+      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+#if defined(__aarch64__)
+      xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a));
+      xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a));
+      x_sum += vaddvq_s32(v_x_sum_a);
+      x_finalrow += vaddvq_s32(v_w_sum);
+      int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum));
+      int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum));
+#else
+      const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
+      xy_sum += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
+      const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a);
+      xz_sum += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0);
+      const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a);
+      x_sum += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0);
+      const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum);
+      x_finalrow += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0);
+      const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
+      int64_t y2 = vget_lane_s64(
+          vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0);
+      const int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum);
+      int64_t w2 = vget_lane_s64(
+          vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0);
+#endif
+      x2_sum += y2 + w2;
+      x2_finalrow += w2;
+    } else {
+      for (int j = 0; j < width - 1; ++j) {
+        const int16_t x = diff[(height - 2) * stride + j];
+        const int16_t y = diff[(height - 2) * stride + j + 1];
+        const int16_t z = diff[(height - 1) * stride + j];
+        const int16_t w = diff[(height - 1) * stride + j + 1];
+
+        // Horizontal and vertical correlations for the penultimate row:
+        xy_sum += x * y;
+        xz_sum += x * z;
+
+        // Now just horizontal correlations for the final row:
+        xy_sum += z * w;
+
+        x_sum += y + w;
+        x2_sum += y * y + w * w;
+        x_finalrow += w;
+        x2_finalrow += w * w;
+      }
+    }
+  }
+
+  // Do we have 2 columns remaining or just the one?
+  if (width % 3 == 1) {  // Just vert corrs on the final col
+    const int16_t x0 = diff[width - 1];
+    x_sum += x0;
+    x_finalcol += x0;
+    x2_sum += x0 * x0;
+    x2_finalcol += x0 * x0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 1];
+      xz_sum += x * z;
+      x_finalcol += z;
+      x2_finalcol += z * z;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z;
+        x2_sum += z * z;
+      }
+    }
+  } else {  // Two cols remaining
+    const int16_t x0 = diff[width - 2];
+    const int16_t y0 = diff[width - 1];
+    x_sum += x0 + y0;
+    x2_sum += x0 * x0 + y0 * y0;
+    x_finalcol += y0;
+    x2_finalcol += y0 * y0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 2];
+      const int16_t y = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 2];
+      const int16_t w = diff[(i + 1) * stride + width - 1];
+
+      // Horizontal and vertical correlations for the penultimate col:
+      // Skip these on the last iteration of this loop if we also had two
+      // rows remaining, otherwise the final horizontal and vertical correlation
+      // get erroneously processed twice
+      if (i < height - 2 || height % 3 == 1) {
+        xy_sum += x * y;
+        xz_sum += x * z;
+      }
+
+      x_finalcol += w;
+      x2_finalcol += w * w;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z + w;
+        x2_sum += z * z + w * w;
+      }
+
+      // Now just vertical correlations for the final column:
+      xz_sum += y * w;
+    }
+  }
+
+  // Calculate the simple sums and squared-sums
+  int64_t x_firstrow = 0, x_firstcol = 0;
+  int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+  if (width >= 8) {
+    int32x4_t v_x_firstrow = zero;
+    int32x4_t v_x2_firstrow = zero;
+    for (int j = 0; j < width; j += 8) {
+      const int16x8_t v_diff = vld1q_s16(diff + j);
+      const int16x4_t v_diff_lo = vget_low_s16(v_diff);
+      const int16x4_t v_diff_hi = vget_high_s16(v_diff);
+      v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff);
+      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo);
+      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi);
+    }
+#if defined(__aarch64__)
+    x_firstrow += vaddvq_s32(v_x_firstrow);
+    x2_firstrow += vaddvq_s32(v_x2_firstrow);
+#else
+    const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow);
+    x_firstrow += vget_lane_s64(
+        vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)),
+        0);
+    const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow);
+    x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64),
+                                          vget_high_s64(v_x2_firstrow_64)),
+                                 0);
+#endif
+  } else {
+    for (int j = 0; j < width; ++j) {
+      x_firstrow += diff[j];
+      x2_firstrow += diff[j] * diff[j];
+    }
+  }
+  for (int i = 0; i < height; ++i) {
+    x_firstcol += diff[i * stride];
+    x2_firstcol += diff[i * stride] * diff[i * stride];
+  }
+
+  int64_t xhor_sum = x_sum - x_finalcol;
+  int64_t xver_sum = x_sum - x_finalrow;
+  int64_t y_sum = x_sum - x_firstcol;
+  int64_t z_sum = x_sum - x_firstrow;
+  int64_t x2hor_sum = x2_sum - x2_finalcol;
+  int64_t x2ver_sum = x2_sum - x2_finalrow;
+  int64_t y2_sum = x2_sum - x2_firstcol;
+  int64_t z2_sum = x2_sum - x2_firstrow;
+
+  aom_clear_system_state();
+
+  const float num_hor = (float)(height * (width - 1));
+  const float num_ver = (float)((height - 1) * width);
+
+  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+  if (xhor_var_n > 0 && y_var_n > 0) {
+    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+    *hcorr = *hcorr < 0 ? 0 : *hcorr;
+  } else {
+    *hcorr = 1.0;
+  }
+  if (xver_var_n > 0 && z_var_n > 0) {
+    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+    *vcorr = *vcorr < 0 ? 0 : *vcorr;
+  } else {
+    *vcorr = 1.0;
+  }
+}
diff --git a/av1/encoder/av1_ml_partition_models.h b/av1/encoder/av1_ml_partition_models.h
new file mode 100644
index 0000000..2572b13
--- /dev/null
+++ b/av1/encoder/av1_ml_partition_models.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+#define AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(kyslov): Replace with proper weights after training AV1 models
+
+#define FEATURES 6
+static const float av1_var_part_nn_weights_64_layer0[FEATURES * 8] = {
+  0.35755366f,  0.86281112f,  -0.20871686f, 0.0409634f,   0.97305766f,
+  0.75510254f,  0.04860447f,  0.77095283f,  -0.44105278f, -0.3755049f,
+  -0.08456618f, 1.1821136f,   -0.73956301f, 1.30016453f,  0.45566902f,
+  0.4742967f,   0.44213975f,  0.4876028f,   0.26720522f,  -0.34429858f,
+  -0.25148252f, -0.49623932f, -0.46747941f, -0.36656624f, 0.10213375f,
+  0.60262819f,  -0.54788715f, -0.27272022f, 1.0995462f,   -0.36338376f,
+  -0.64836313f, 0.16057039f,  1.02782791f,  0.9985311f,   0.90607883f,
+  0.80570411f,  -0.07750863f, -0.74006402f, 1.72839526f,  1.72355343f,
+  1.69288916f,  1.59102043f,  0.14140216f,  -1.47262839f, 0.4262519f,
+  -0.33805936f, -0.02449707f, 0.67203692f
+};
+
+static const float av1_var_part_nn_bias_64_layer0[8] = {
+  0.39995694f, 0.65593756f, 1.12876737f,  1.28790576f,
+  0.53468556f, 0.3177908f,  -0.74388266f, -1.81131248f
+};
+
+static const float av1_var_part_nn_weights_64_layer1[8] = {
+  -1.31174053f, 0.69696917f, 0.78721456f, 0.45326379f,
+  0.79258322f,  1.74626188f, -5.41831f,   3.33887435f
+};
+
+static const float av1_var_part_nn_bias_64_layer1[1] = { -0.90951047f };
+
+static const float av1_var_part_means_64[FEATURES] = {
+  5.36750249f, 11.58023127f, 0.25550964f, 0.23809917f, 0.24650665f, 0.22117687f
+};
+static const float av1_var_part_vars_64[FEATURES] = {
+  0.89599769f, 2.2686018f, 0.02568608f, 0.02523411f, 0.02443085f, 0.01922085f
+};
+
+static const NN_CONFIG av1_var_part_nnconfig_64 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_var_part_nn_weights_64_layer0,
+      av1_var_part_nn_weights_64_layer1,
+  },
+  {
+      av1_var_part_nn_bias_64_layer0,
+      av1_var_part_nn_bias_64_layer1,
+  },
+};
+
+static const float av1_var_part_nn_weights_32_layer0[FEATURES * 8] = {
+  0.97886049f,  -1.66262011f, 0.94902798f,  0.7080922f,   0.91181186f,
+  0.35222601f,  -0.04428585f, 0.42086472f,  -0.0206325f,  -0.77937809f,
+  -0.70947522f, -1.24463119f, 0.23739497f,  -1.34327359f, 0.01024804f,
+  0.4544633f,   -0.96907661f, 0.67279522f,  0.23180693f,  1.54063368f,
+  -0.15700707f, 0.18597331f,  0.34167589f,  0.40736558f,  0.69213366f,
+  -1.33584593f, 1.21190814f,  1.26725267f,  1.21284802f,  1.26611399f,
+  0.17546514f,  -0.30248399f, -1.32589316f, -1.37432674f, -1.37423023f,
+  -1.26890855f, 0.12166347f,  -0.94565678f, -1.47475267f, -0.69279948f,
+  -0.10166587f, -0.23489881f, 0.57123565f,  0.80051137f,  -1.28411946f,
+  -1.36576732f, -1.30257508f, -1.30575106f
+};
+
+static const float av1_var_part_nn_bias_32_layer0[8] = {
+  -1.6301435f, 0.61879037f, -1.68612662f, 1.66960165f,
+  -0.0838243f, 0.32253287f, -0.65755282f, 0.96661531f
+};
+
+static const float av1_var_part_nn_weights_32_layer1[8] = {
+  1.99257161f,  0.7331492f,  1.33539961f,  1.13501456f,
+  -2.21154528f, 1.85858542f, -0.85565298f, -1.96410246f
+};
+
+static const float av1_var_part_nn_bias_32_layer1[1] = { -0.14880827f };
+
+static const float av1_var_part_means_32[FEATURES] = {
+  5.36360686f, 9.88421868f, 0.23543671f, 0.23621205f, 0.23409667f, 0.22855539f
+};
+
+static const float av1_var_part_vars_32[FEATURES] = {
+  0.89077225f, 2.32312894f, 0.02167654f, 0.02392842f, 0.02466495f, 0.02047641f
+};
+
+static const NN_CONFIG av1_var_part_nnconfig_32 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_var_part_nn_weights_32_layer0,
+      av1_var_part_nn_weights_32_layer1,
+  },
+  {
+      av1_var_part_nn_bias_32_layer0,
+      av1_var_part_nn_bias_32_layer1,
+  },
+};
+
+static const float av1_var_part_nn_weights_16_layer0[FEATURES * 8] = {
+  0.45118305f,  -0.22068295f, 0.4604435f,   -0.1446326f,  -0.15765035f,
+  0.42260198f,  -0.0945916f,  0.49544996f,  0.62781567f,  -0.41564372f,
+  -0.39103292f, 0.44407624f,  0.48382613f,  -0.85424238f, -0.00961433f,
+  0.25383582f,  0.14403897f,  0.00901859f,  -0.83201967f, -0.19323284f,
+  0.59271213f,  0.69487457f,  0.6897112f,   0.62768521f,  0.9204492f,
+  -1.42448347f, -0.16491054f, -0.10114424f, -0.1069687f,  -0.11289049f,
+  0.26290832f,  -0.41850393f, 0.17239733f,  0.41770622f,  0.43725942f,
+  0.19362467f,  -0.35955731f, -0.899446f,   0.49726389f,  0.66569571f,
+  0.65893982f,  0.53199654f,  -0.1158694f,  -0.26472603f, 0.4155923f,
+  0.15059544f,  0.09596755f,  0.26247133f
+};
+
+static const float av1_var_part_nn_bias_16_layer0[8] = {
+  1.64486321f, -0.11851574f, 1.29322833f,  -0.61193136f,
+  0.33027532f, 1.04197232f,  -0.80716674f, 0.88681233f
+};
+
+static const float av1_var_part_nn_weights_16_layer1[8] = {
+  -1.02832118f, 0.72800106f, -0.42904783f, 1.44490586f,
+  -1.03888227f, -0.9023916f, -1.51543102f, -0.43059521f
+};
+
+static const float av1_var_part_nn_bias_16_layer1[1] = { -0.85087946f };
+
+static const float av1_var_part_means_16[FEATURES] = {
+  5.32551326f, 8.218448f, 0.21954822f, 0.22808377f, 0.23019798f, 0.22320699f
+};
+
+static const float av1_var_part_vars_16[FEATURES] = { 0.86806032f, 2.39938956f,
+                                                      0.01958579f, 0.02437927f,
+                                                      0.02420755f, 0.0192003f };
+
+static const NN_CONFIG av1_var_part_nnconfig_16 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_var_part_nn_weights_16_layer0,
+      av1_var_part_nn_weights_16_layer1,
+  },
+  {
+      av1_var_part_nn_bias_16_layer0,
+      av1_var_part_nn_bias_16_layer1,
+  },
+};
+
+#undef FEATURES
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
diff --git a/av1/encoder/av1_multi_thread.c b/av1/encoder/av1_multi_thread.c
deleted file mode 100644
index d170b0c..0000000
--- a/av1/encoder/av1_multi_thread.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/ethread.h"
-#include "av1/encoder/av1_multi_thread.h"
-
-void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows) {
-  struct AV1Common *cm = &cpi->common;
-  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
-  int tile_row, tile_col;
-  const int tile_cols = cm->tiles.cols;
-  const int tile_rows = cm->tiles.rows;
-
-  multi_thread_ctxt->allocated_tile_cols = tile_cols;
-  multi_thread_ctxt->allocated_tile_rows = tile_rows;
-  multi_thread_ctxt->allocated_sb_rows = max_sb_rows;
-
-  // Allocate memory for row based multi-threading
-  for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows;
-       tile_row++) {
-    for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
-         tile_col++) {
-      TileDataEnc *this_tile =
-          &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
-                          tile_col];
-      av1_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_sb_rows);
-      if (cpi->oxcf.cdf_update_mode)
-        CHECK_MEM_ERROR(
-            cm, this_tile->row_ctx,
-            (FRAME_CONTEXT *)aom_memalign(
-                16,
-                AOMMAX(1, (av1_get_sb_cols_in_tile(cm, this_tile->tile_info) -
-                           1)) *
-                    sizeof(*this_tile->row_ctx)));
-    }
-  }
-}
-
-void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
-  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
-  int tile_col;
-  int tile_row;
-
-  // Free row based multi-threading sync memory
-  for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows;
-       tile_row++) {
-    for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
-         tile_col++) {
-      TileDataEnc *this_tile =
-          &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
-                          tile_col];
-      av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
-      if (cpi->oxcf.cdf_update_mode) aom_free(this_tile->row_ctx);
-    }
-  }
-  multi_thread_ctxt->allocated_sb_rows = 0;
-  multi_thread_ctxt->allocated_tile_cols = 0;
-  multi_thread_ctxt->allocated_tile_rows = 0;
-}
diff --git a/av1/encoder/av1_multi_thread.h b/av1/encoder/av1_multi_thread.h
deleted file mode 100644
index 2a1cc7d..0000000
--- a/av1/encoder/av1_multi_thread.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_AV1_MULTI_THREAD_H
-#define AV1_ENCODER_AV1_MULTI_THREAD_H
-
-#include "av1/encoder/encoder.h"
-
-void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows);
-
-void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
-
-#endif  // AV1_ENCODER_AV1_MULTI_THREAD_H
diff --git a/av1/encoder/av1_noise_estimate.c b/av1/encoder/av1_noise_estimate.c
new file mode 100644
index 0000000..0e6ee15
--- /dev/null
+++ b/av1/encoder/av1_noise_estimate.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/encoder.h"
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) {
+  return (!cpi->use_svc ||
+          (cpi->use_svc &&
+           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+}
+#endif
+
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
+  ne->enabled = 0;
+  ne->level = (width * height < 1280 * 720) ? kLowLow : kLow;
+  ne->value = 0;
+  ne->count = 0;
+  ne->thresh = 90;
+  ne->last_w = 0;
+  ne->last_h = 0;
+  if (width * height >= 1920 * 1080) {
+    ne->thresh = 200;
+  } else if (width * height >= 1280 * 720) {
+    ne->thresh = 140;
+  } else if (width * height >= 640 * 360) {
+    ne->thresh = 115;
+  }
+  ne->num_frames_estimate = 15;
+  ne->adapt_thresh = (3 * ne->thresh) >> 1;
+}
+
+static int enable_noise_estimation(AV1_COMP *const cpi) {
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  const int resize_pending =
+      (resize_pending_params->width && resize_pending_params->height &&
+       (cpi->common.width != resize_pending_params->width ||
+        cpi->common.height != resize_pending_params->height));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (cpi->common.seq_params.use_highbitdepth) return 0;
+#endif
+// Enable noise estimation if denoising is on.
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+      cpi->common.width >= 320 && cpi->common.height >= 180)
+    return 1;
+#endif
+  // Only allow noise estimate under certain encoding mode.
+  // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original.
+  // Not enabled for SVC mode and screen_content_mode.
+  // Not enabled for low resolutions.
+  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
+      resize_pending == 0 && !cpi->use_svc &&
+      cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+      cpi->common.width * cpi->common.height >= 640 * 360)
+    return 1;
+  else
+    return 0;
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+                       const YV12_BUFFER_CONFIG *const src) {
+  const uint8_t *srcbuf = src->y_buffer;
+  uint8_t *destbuf = dest->y_buffer;
+
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+
+  for (int r = 0; r < dest->y_height; ++r) {
+    memcpy(destbuf, srcbuf, dest->y_width);
+    destbuf += dest->y_stride;
+    srcbuf += src->y_stride;
+  }
+}
+#endif  // CONFIG_AV1_TEMPORAL_DENOISING
+
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
+  int noise_level = kLowLow;
+  if (ne->value > (ne->thresh << 1)) {
+    noise_level = kHigh;
+  } else {
+    if (ne->value > ne->thresh)
+      noise_level = kMedium;
+    else if (ne->value > (ne->thresh >> 1))
+      noise_level = kLow;
+    else
+      noise_level = kLowLow;
+  }
+  return noise_level;
+}
+
+void av1_update_noise_estimate(AV1_COMP *const cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  // Estimate of noise level every frame_period frames.
+  int frame_period = 8;
+  int thresh_consec_zeromv = 2;
+  int frame_counter = cm->current_frame.frame_number;
+  // Estimate is between current source and last source.
+  YV12_BUFFER_CONFIG *last_source = cpi->last_source;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
+    last_source = &cpi->denoiser.last_source;
+    // Tune these thresholds for different resolutions when denoising is
+    // enabled.
+    if (cm->width > 640 && cm->width <= 1920) {
+      thresh_consec_zeromv = 2;
+    }
+  }
+#endif
+  ne->enabled = enable_noise_estimation(cpi);
+  if (cpi->svc.number_spatial_layers > 1)
+    frame_counter = cpi->svc.current_superframe;
+  if (!ne->enabled || frame_counter % frame_period != 0 ||
+      last_source == NULL ||
+      (cpi->svc.number_spatial_layers == 1 &&
+       (ne->last_w != cm->width || ne->last_h != cm->height))) {
+#if CONFIG_AV1_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+      copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+    if (last_source != NULL) {
+      ne->last_w = cm->width;
+      ne->last_h = cm->height;
+    }
+    return;
+  } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 &&
+             cpi->rc.frames_since_key > cpi->svc.number_spatial_layers &&
+             cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+             cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) {
+    // Force noise estimation to 0 and denoiser off if content has high motion.
+    ne->level = kLowLow;
+    ne->count = 0;
+    ne->num_frames_estimate = 10;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+        cpi->svc.current_superframe > 1) {
+      av1_denoiser_set_noise_level(cpi, ne->level);
+      copy_frame(&cpi->denoiser.last_source, cpi->source);
+    }
+#endif
+    return;
+  } else {
+    unsigned int bin_size = 100;
+    unsigned int hist[MAX_VAR_HIST_BINS] = { 0 };
+    unsigned int hist_avg[MAX_VAR_HIST_BINS];
+    unsigned int max_bin = 0;
+    unsigned int max_bin_count = 0;
+    unsigned int bin_cnt;
+    int bsize = BLOCK_16X16;
+    // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
+    // been encoded as zero/small mv at least x consecutive frames, compute
+    // the variance to update estimate of noise in the source.
+    const uint8_t *src_y = cpi->source->y_buffer;
+    const int src_ystride = cpi->source->y_stride;
+    const uint8_t *last_src_y = last_source->y_buffer;
+    const int last_src_ystride = last_source->y_stride;
+    const uint8_t *src_u = cpi->source->u_buffer;
+    const uint8_t *src_v = cpi->source->v_buffer;
+    const int src_uvstride = cpi->source->uv_stride;
+    int mi_row, mi_col;
+    int num_low_motion = 0;
+    int frame_low_motion = 1;
+    for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) {
+      for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += 2) {
+        int bl_index =
+            (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+        if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+          num_low_motion++;
+      }
+    }
+    if (num_low_motion <
+        (((3 * (mi_params->mi_rows * mi_params->mi_cols) >> 2)) >> 3))
+      frame_low_motion = 0;
+    for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) {
+        // 16x16 blocks, 1/4 sample of frame.
+        if (mi_row % 8 == 0 && mi_col % 8 == 0 &&
+            mi_row < mi_params->mi_rows - 3 &&
+            mi_col < mi_params->mi_cols - 3) {
+          int bl_index =
+              (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+          int bl_index1 = bl_index + 1;
+          int bl_index2 = bl_index + (mi_params->mi_cols >> 1);
+          int bl_index3 = bl_index2 + 1;
+          int consec_zeromv =
+              AOMMIN(cpi->consec_zero_mv[bl_index],
+                     AOMMIN(cpi->consec_zero_mv[bl_index1],
+                            AOMMIN(cpi->consec_zero_mv[bl_index2],
+                                   cpi->consec_zero_mv[bl_index3])));
+          // Only consider blocks that are likely steady background. i.e, have
+          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+          // 4 sub-blocks for 16x16 block. And exclude this frame if
+          // high_source_sad is true (i.e., scene/content change).
+          if (frame_low_motion && consec_zeromv > thresh_consec_zeromv &&
+              !cpi->rc.high_source_sad) {
+            unsigned int sse;
+            // Compute variance between co-located blocks from current and
+            // last input frames.
+            unsigned int variance = cpi->fn_ptr[bsize].vf(
+                src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+            unsigned int hist_index = variance / bin_size;
+            if (hist_index < MAX_VAR_HIST_BINS)
+              hist[hist_index]++;
+            else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1))
+              hist[MAX_VAR_HIST_BINS - 1]++;  // Account for the tail
+          }
+        }
+        src_y += 4;
+        last_src_y += 4;
+        src_u += 2;
+        src_v += 2;
+      }
+      src_y += (src_ystride << 2) - (mi_params->mi_cols << 2);
+      last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2);
+      src_u += (src_uvstride << 1) - (mi_params->mi_cols << 1);
+      src_v += (src_uvstride << 1) - (mi_params->mi_cols << 1);
+    }
+    ne->last_w = cm->width;
+    ne->last_h = cm->height;
+    // Adjust histogram to account for effect that histogram flattens
+    // and shifts to zero as scene darkens.
+    if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) {
+      hist[0] = 0;
+      hist[1] >>= 2;
+      hist[2] >>= 2;
+      hist[3] >>= 2;
+      hist[4] >>= 1;
+      hist[5] >>= 1;
+      hist[6] = 3 * hist[6] >> 1;
+      hist[MAX_VAR_HIST_BINS - 1] >>= 1;
+    }
+
+    // Average hist[] and find largest bin
+    for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) {
+      if (bin_cnt == 0)
+        hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3;
+      else if (bin_cnt == MAX_VAR_HIST_BINS - 1)
+        hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2;
+      else if (bin_cnt == MAX_VAR_HIST_BINS - 2)
+        hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] +
+                             (hist[bin_cnt + 1] >> 1) + 2) >>
+                            2;
+      else
+        hist_avg[bin_cnt] =
+            (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >>
+            2;
+
+      if (hist_avg[bin_cnt] > max_bin_count) {
+        max_bin_count = hist_avg[bin_cnt];
+        max_bin = bin_cnt;
+      }
+    }
+    // Scale by 40 to work with existing thresholds
+    ne->value = (int)((3 * ne->value + max_bin * 40) >> 2);
+    // Quickly increase VNR strength when the noise level increases suddenly.
+    if (ne->level < kMedium && ne->value > ne->adapt_thresh) {
+      ne->count = ne->num_frames_estimate;
+    } else {
+      ne->count++;
+    }
+    if (ne->count == ne->num_frames_estimate) {
+      // Reset counter and check noise level condition.
+      ne->num_frames_estimate = 30;
+      ne->count = 0;
+      ne->level = av1_noise_estimate_extract_level(ne);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+      if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+        av1_denoiser_set_noise_level(cpi, ne->level);
+#endif
+    }
+  }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+    copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+}
diff --git a/av1/encoder/av1_noise_estimate.h b/av1/encoder/av1_noise_estimate.h
new file mode 100644
index 0000000..8553066
--- /dev/null
+++ b/av1/encoder/av1_noise_estimate.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+#define AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_VAR_HIST_BINS 20
+
+typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL;
+
+typedef struct noise_estimate {
+  int enabled;
+  NOISE_LEVEL level;
+  int value;
+  int thresh;
+  int adapt_thresh;
+  int count;
+  int last_w;
+  int last_h;
+  int num_frames_estimate;
+} NOISE_ESTIMATE;
+
+struct AV1_COMP;
+
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height);
+
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
+void av1_update_noise_estimate(struct AV1_COMP *const cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 569784a..ff62c45 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -586,7 +586,7 @@
     const int qrounding_factor = q == 0 ? 64 : 48;
 
     for (i = 0; i < 2; ++i) {
-      int qrounding_factor_fp = 64;
+      const int qrounding_factor_fp = 64;
       // y quantizer with TX scale
       quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
                          : av1_ac_quant_QTX(q, 0, bit_depth);
@@ -637,7 +637,8 @@
       quants->u_zbin[q][i] = quants->u_zbin[q][1];
       quants->u_round[q][i] = quants->u_round[q][1];
       deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
-      quants->v_quant[q][i] = quants->u_quant[q][1];
+
+      quants->v_quant[q][i] = quants->v_quant[q][1];
       quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
       quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
       quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
@@ -667,11 +668,11 @@
   const QUANTS *const quants = &cpi->enc_quant_dequant_params.quants;
   const Dequants *const dequants = &cpi->enc_quant_dequant_params.dequants;
 
-  const int current_qindex =
-      AOMMAX(0, AOMMIN(QINDEX_RANGE - 1,
-                       cm->delta_q_info.delta_q_present_flag
-                           ? quant_params->base_qindex + xd->delta_qindex
-                           : quant_params->base_qindex));
+  const int current_qindex = AOMMAX(
+      0,
+      AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+                                   ? quant_params->base_qindex + x->delta_qindex
+                                   : quant_params->base_qindex));
   const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
   const int rdmult =
       av1_compute_rd_mult(cpi, qindex + quant_params->y_dc_delta_q);
@@ -726,12 +727,12 @@
   memcpy(&xd->plane[2].seg_iqmatrix[segment_id],
          quant_params->giqmatrix[qmlevel_v][2],
          sizeof(quant_params->giqmatrix[qmlevel_v][2]));
-  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->qindex = qindex;
 
-  set_error_per_bit(x, rdmult);
-
-  av1_initialize_me_consts(cpi, x, qindex);
+  MvCosts *mv_costs = &x->mv_costs;
+  av1_set_error_per_bit(mv_costs, rdmult);
+  av1_set_sad_per_bit(cpi, mv_costs, qindex);
 }
 
 void av1_frame_init_quantizer(AV1_COMP *cpi) {
@@ -741,16 +742,26 @@
 }
 
 void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel,
-                       int q) {
+                       int q, int enable_chroma_deltaq) {
   // quantizer has to be reinitialized with av1_init_quantizer() if any
   // delta_q changes.
   CommonQuantParams *quant_params = &cm->quant_params;
   quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);
+
   quant_params->y_dc_delta_q = 0;
-  quant_params->u_dc_delta_q = 0;
-  quant_params->u_ac_delta_q = 0;
-  quant_params->v_dc_delta_q = 0;
-  quant_params->v_ac_delta_q = 0;
+  if (enable_chroma_deltaq) {
+    // TODO(aomedia:2717): need to design better delta
+    quant_params->u_dc_delta_q = 2;
+    quant_params->u_ac_delta_q = 2;
+    quant_params->v_dc_delta_q = 2;
+    quant_params->v_ac_delta_q = 2;
+  } else {
+    quant_params->u_dc_delta_q = 0;
+    quant_params->u_ac_delta_q = 0;
+    quant_params->v_dc_delta_q = 0;
+    quant_params->v_ac_delta_q = 0;
+  }
+
   quant_params->qmatrix_level_y =
       aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel);
   quant_params->qmatrix_level_u =
diff --git a/av1/encoder/av1_quantize.h b/av1/encoder/av1_quantize.h
index 40fb4be..ad96197 100644
--- a/av1/encoder/av1_quantize.h
+++ b/av1/encoder/av1_quantize.h
@@ -109,7 +109,7 @@
                         aom_bit_depth_t bit_depth);
 
 void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel,
-                       int max_qmlevel, int q);
+                       int max_qmlevel, int q, int enable_chroma_deltaq);
 
 int av1_quantizer_to_qindex(int quantizer);
 
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index daa8ce1..2bde02b 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -60,9 +60,11 @@
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void loop_restoration_write_sb_coeffs(
     const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
     aom_writer *const w, int plane, FRAME_COUNTS *counts);
+#endif
 
 static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
                                              const MB_MODE_INFO *mi,
@@ -145,8 +147,8 @@
                                            int blk_row, int blk_col,
                                            aom_writer *w) {
   FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
-  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
-  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+  const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -158,9 +160,9 @@
 
   const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
                                          xd->left_txfm_context + blk_row,
-                                         mbmi->sb_type, tx_size);
+                                         mbmi->bsize, tx_size);
   const int txb_size_index =
-      av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col);
+      av1_get_txb_size_index(mbmi->bsize, blk_row, blk_col);
   const int write_txfm_partition =
       tx_size == mbmi->inter_tx_size[txb_size_index];
   if (write_txfm_partition) {
@@ -195,7 +197,7 @@
 static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd,
                                               aom_writer *w) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   if (block_signals_txsize(bsize)) {
     const TX_SIZE tx_size = mbmi->tx_size;
@@ -218,11 +220,11 @@
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int skip = mi->skip;
-    const int ctx = av1_get_skip_context(xd);
+    const int skip_txfm = mi->skip_txfm;
+    const int ctx = av1_get_skip_txfm_context(xd);
     FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-    aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2);
-    return skip;
+    aom_write_symbol(w, skip_txfm, ec_ctx->skip_txfm_cdfs[ctx], 2);
+    return skip_txfm;
   }
 }
 
@@ -234,7 +236,7 @@
     return 0;
   }
   const int skip_mode = mi->skip_mode;
-  if (!is_comp_ref_allowed(mi->sb_type)) {
+  if (!is_comp_ref_allowed(mi->bsize)) {
     assert(!skip_mode);
     return 0;
   }
@@ -278,11 +280,11 @@
     case SIMPLE_TRANSLATION: break;
     case OBMC_CAUSAL:
       aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
-                       xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2);
+                       xd->tile_ctx->obmc_cdf[mbmi->bsize], 2);
       break;
     default:
       aom_write_symbol(w, mbmi->motion_mode,
-                       xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
+                       xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
                        MOTION_MODES);
   }
 }
@@ -339,22 +341,25 @@
   }
 }
 
-static AOM_INLINE void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp,
-                                       int n, int num) {
-  const TOKENEXTRA *p = *tp;
+static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp,
+                                       int n, int num, MapCdf map_pb_cdf) {
+  const TokenExtra *p = *tp;
+  const int palette_size_idx = n - PALETTE_MIN_SIZE;
   write_uniform(w, n, p->token);  // The first color index.
   ++p;
   --num;
   for (int i = 0; i < num; ++i) {
-    aom_write_symbol(w, p->token, p->color_map_cdf, n);
+    assert((p->color_ctx >= 0) && (p->color_ctx < CDF_SIZE(PALETTE_COLORS)));
+    aom_cdf_prob *color_map_cdf = map_pb_cdf[palette_size_idx][p->color_ctx];
+    aom_write_symbol(w, p->token, color_map_cdf, n);
     ++p;
   }
   *tp = p;
 }
 
 static AOM_INLINE void pack_txb_tokens(
-    aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TOKENEXTRA **tp,
-    const TOKENEXTRA *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+    aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp,
+    const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
     int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block,
     int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) {
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
@@ -364,7 +369,7 @@
 
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
@@ -439,9 +444,12 @@
   }
 }
 
-static AOM_INLINE void write_segment_id(
-    AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, aom_writer *w,
-    const struct segmentation *seg, struct segmentation_probs *segp, int skip) {
+static AOM_INLINE void write_segment_id(AV1_COMP *cpi,
+                                        const MB_MODE_INFO *const mbmi,
+                                        aom_writer *w,
+                                        const struct segmentation *seg,
+                                        struct segmentation_probs *segp,
+                                        int skip_txfm) {
   if (!seg->enabled || !seg->update_map) return;
 
   AV1_COMMON *const cm = &cpi->common;
@@ -451,15 +459,15 @@
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
 
-  if (skip) {
-    // Still need to transmit tx size for intra blocks even if skip is
+  if (skip_txfm) {
+    // Still need to transmit tx size for intra blocks even if skip_txfm is
     // true. Changing segment_id may make the tx size become invalid, e.g
     // changing from lossless to lossy.
     assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment);
 
-    set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
-                           mbmi->sb_type, mi_row, mi_col, pred);
-    set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->sb_type,
+    set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+                           mi_row, mi_col, pred);
+    set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->bsize,
                            mi_row, mi_col, pred);
     /* mbmi is read only but we need to update segment_id */
     ((MB_MODE_INFO *)mbmi)->segment_id = pred;
@@ -470,7 +478,7 @@
       av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
   aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
   aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
-  set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->sb_type,
+  set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
                          mi_row, mi_col, mbmi->segment_id);
 }
 
@@ -498,7 +506,7 @@
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
-      if (is_comp_ref_allowed(mbmi->sb_type))
+      if (is_comp_ref_allowed(mbmi->bsize))
         aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2);
     } else {
       assert((!is_compound) ==
@@ -587,7 +595,7 @@
     aom_writer *w) {
   if (av1_filter_intra_allowed(cm, mbmi)) {
     aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra,
-                     xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2);
+                     xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2);
     if (mbmi->filter_intra_mode_info.use_filter_intra) {
       const FILTER_INTRA_MODE mode =
           mbmi->filter_intra_mode_info.filter_intra_mode;
@@ -753,7 +761,7 @@
                                                const MB_MODE_INFO *const mbmi,
                                                aom_writer *w) {
   const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
   const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
@@ -796,7 +804,7 @@
   if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 &&
       ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) ||
        (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
-      !mbmi->skip &&
+      !mbmi->skip_txfm &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
     const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
@@ -928,7 +936,7 @@
       }
       if (pred_flag) {
         set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
-                               mbmi->sb_type, mi_row, mi_col, mbmi->segment_id);
+                               mbmi->bsize, mi_row, mi_col, mbmi->segment_id);
       }
     } else {
       write_segment_id(cpi, mbmi, w, seg, segp, 0);
@@ -947,7 +955,7 @@
     MACROBLOCK *const x = &cpi->td.mb;
     MACROBLOCKD *const xd = &x->e_mbd;
     const MB_MODE_INFO *const mbmi = xd->mi[0];
-    const BLOCK_SIZE bsize = mbmi->sb_type;
+    const BLOCK_SIZE bsize = mbmi->bsize;
     const int super_block_upper_left =
         ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
         ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
@@ -956,10 +964,10 @@
         super_block_upper_left) {
       assert(mbmi->current_qindex > 0);
       const int reduced_delta_qindex =
-          (mbmi->current_qindex - xd->current_qindex) /
+          (mbmi->current_qindex - xd->current_base_qindex) /
           delta_q_info->delta_q_res;
       write_delta_qindex(xd, reduced_delta_qindex, w);
-      xd->current_qindex = mbmi->current_qindex;
+      xd->current_base_qindex = mbmi->current_qindex;
       if (delta_q_info->delta_lf_present_flag) {
         if (delta_q_info->delta_lf_multi) {
           const int frame_lf_count =
@@ -992,7 +1000,7 @@
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const PREDICTION_MODE mode = mbmi->mode;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
 
   // Y mode.
   if (is_keyframe) {
@@ -1084,7 +1092,7 @@
   const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame;
   const PREDICTION_MODE mode = mbmi->mode;
   const int segment_id = mbmi->segment_id;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int allow_hp = cm->features.allow_high_precision_mv;
   const int is_inter = is_inter_block(mbmi);
   const int is_compound = has_second_ref(mbmi);
@@ -1094,7 +1102,7 @@
 
   write_skip_mode(cm, xd, segment_id, mbmi, w);
 
-  assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
+  assert(IMPLIES(mbmi->skip_mode, mbmi->skip_txfm));
   const int skip =
       mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
 
@@ -1278,7 +1286,7 @@
 static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) {
   printf("\nmi->mi_row == %d\n", mi->mi_row);
   printf("&& mi->mi_col == %d\n", mi->mi_col);
-  printf("&& mi->sb_type == %d\n", mi->sb_type);
+  printf("&& mi->bsize == %d\n", mi->bsize);
   printf("&& mi->tx_size == %d\n", mi->tx_size);
   printf("&& mi->mode == %d\n", mi->mode);
 }
@@ -1324,7 +1332,7 @@
 #define FRAME_TO_CHECK 11
     if (cm->current_frame.frame_number == FRAME_TO_CHECK &&
         cm->show_frame == 1) {
-      const BLOCK_SIZE bsize = mbmi->sb_type;
+      const BLOCK_SIZE bsize = mbmi->bsize;
 
       int_mv mv[2] = { 0 };
       const int is_comp_ref = has_second_ref(mbmi);
@@ -1390,12 +1398,12 @@
 
 static AOM_INLINE void write_inter_txb_coeff(
     AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi,
-    aom_writer *w, const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+    aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end,
     TOKEN_STATS *token_stats, const int row, const int col, int *block,
     const int plane) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(bsize < BLOCK_SIZES_ALL);
   const int ss_x = pd->subsampling_x;
   const int ss_y = pd->subsampling_y;
@@ -1425,19 +1433,19 @@
 }
 
 static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w,
-                                      const TOKENEXTRA **tok,
-                                      const TOKENEXTRA *const tok_end) {
+                                      const TokenExtra **tok,
+                                      const TokenExtra *const tok_end) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
 
-  assert(!mbmi->skip);
+  assert(!mbmi->skip_txfm);
 
   const int is_inter = is_inter_block(mbmi);
   if (!is_inter) {
-    av1_write_coeffs_mb(cm, x, w, bsize);
+    av1_write_intra_coeffs_mb(cm, x, w, bsize);
   } else {
     int block[MAX_MB_PLANE] = { 0 };
     assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
@@ -1468,7 +1476,7 @@
     }
 #if CONFIG_RD_DEBUG
     for (int plane = 0; plane < num_planes; ++plane) {
-      if (mbmi->sb_type >= BLOCK_8X8 &&
+      if (mbmi->bsize >= BLOCK_8X8 &&
           rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
         dump_mode_info(mbmi);
         assert(0);
@@ -1479,12 +1487,13 @@
 }
 
 static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
-                                     aom_writer *w, const TOKENEXTRA **tok,
-                                     const TOKENEXTRA *const tok_end,
+                                     aom_writer *w, const TokenExtra **tok,
+                                     const TokenExtra *const tok_end,
                                      int mi_row, int mi_col) {
   const AV1_COMMON *cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+  FRAME_CONTEXT *tile_ctx = xd->tile_ctx;
   const int grid_idx = mi_row * mi_params->mi_stride + mi_col;
   xd->mi = mi_params->mi_grid_base + grid_idx;
   cpi->td.mb.mbmi_ext_frame =
@@ -1495,7 +1504,7 @@
   xd->tx_type_map_stride = mi_params->mi_stride;
 
   const MB_MODE_INFO *mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(bsize <= cm->seq_params.sb_size ||
          (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL));
 
@@ -1517,21 +1526,23 @@
     if (palette_size_plane > 0) {
       assert(mbmi->use_intrabc == 0);
       assert(av1_allow_palette(cm->features.allow_screen_content_tools,
-                               mbmi->sb_type));
+                               mbmi->bsize));
       assert(!plane || xd->is_chroma_ref);
       int rows, cols;
-      av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
+      av1_get_block_dimensions(mbmi->bsize, plane, xd, NULL, NULL, &rows,
                                &cols);
       assert(*tok < tok_end);
-      pack_map_tokens(w, tok, palette_size_plane, rows * cols);
+      MapCdf map_pb_cdf = plane ? tile_ctx->palette_uv_color_index_cdf
+                                : tile_ctx->palette_y_color_index_cdf;
+      pack_map_tokens(w, tok, palette_size_plane, rows * cols, map_pb_cdf);
     }
   }
 
   const int is_inter_tx = is_inter_block(mbmi);
-  const int skip = mbmi->skip;
+  const int skip_txfm = mbmi->skip_txfm;
   const int segment_id = mbmi->segment_id;
   if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
-      !(is_inter_tx && skip) && !xd->lossless[segment_id]) {
+      !(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) {
     if (is_inter_tx) {  // This implies skip flag is 0.
       const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
       const int txbh = tx_size_high_unit[max_tx_size];
@@ -1548,12 +1559,17 @@
       set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd);
     }
   } else {
-    set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, skip && is_inter_tx,
-                  xd);
+    set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
+                  skip_txfm && is_inter_tx, xd);
   }
 
-  if (!mbmi->skip) {
+  if (!mbmi->skip_txfm) {
+    int start = aom_tell_size(w);
+
     write_tokens_b(cpi, w, tok, tok_end);
+
+    const int end = aom_tell_size(w);
+    cpi->rc.coefficient_size += end - start;
   }
 }
 
@@ -1596,7 +1612,7 @@
 
 static AOM_INLINE void write_modes_sb(
     AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w,
-    const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row,
+    const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row,
     int mi_col, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
@@ -1610,6 +1626,7 @@
 
   if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
+#if !CONFIG_REALTIME_ONLY
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; ++plane) {
     int rcol0, rcol1, rrow0, rrow1;
@@ -1627,6 +1644,7 @@
       }
     }
   }
+#endif
 
   write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
   switch (partition) {
@@ -1709,7 +1727,7 @@
   av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd);
 
   if (cpi->common.delta_q_info.delta_q_present_flag) {
-    xd->current_qindex = cpi->common.quant_params.base_qindex;
+    xd->current_base_qindex = cpi->common.quant_params.base_qindex;
     if (cpi->common.delta_q_info.delta_lf_present_flag) {
       av1_reset_loop_filter_delta(xd, num_planes);
     }
@@ -1719,10 +1737,10 @@
        mi_row += cm->seq_params.mib_size) {
     const int sb_row_in_tile =
         (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
-    const TOKENEXTRA *tok =
-        cpi->tplist[tile_row][tile_col][sb_row_in_tile].start;
-    const TOKENEXTRA *tok_end =
-        tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count;
+    const TokenExtra *tok =
+        cpi->token_info.tplist[tile_row][tile_col][sb_row_in_tile].start;
+    const TokenExtra *tok_end =
+        tok + cpi->token_info.tplist[tile_row][tile_col][sb_row_in_tile].count;
 
     av1_zero_left_context(xd);
 
@@ -1732,7 +1750,7 @@
       write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
                      cm->seq_params.sb_size);
     }
-    assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop);
+    assert(tok == tok_end);
   }
 }
 
@@ -1807,6 +1825,7 @@
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void write_wiener_filter(int wiener_win,
                                            const WienerInfo *wiener_info,
                                            WienerInfo *ref_wiener_info,
@@ -1888,7 +1907,7 @@
     aom_writer *const w, int plane, FRAME_COUNTS *counts) {
   const RestorationInfo *rsi = cm->rst_info + plane;
   RestorationType frame_rtype = rsi->frame_restoration_type;
-  if (frame_rtype == RESTORE_NONE) return;
+  assert(frame_rtype != RESTORE_NONE);
 
   (void)counts;
   assert(!cm->features.all_lossless);
@@ -1933,6 +1952,7 @@
     }
   }
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 // Only write out the ref delta section if any of the elements
 // will signal a delta.
@@ -3077,7 +3097,7 @@
     aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag);
     if (delta_q_info->delta_q_present_flag) {
       aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2);
-      xd->current_qindex = quant_params->base_qindex;
+      xd->current_base_qindex = quant_params->base_qindex;
       if (features->allow_intrabc)
         assert(delta_q_info->delta_lf_present_flag == 0);
       else
@@ -3492,7 +3512,7 @@
     total_size += frame_header_size;
 
     // (yunqing) This test ensures the correctness of large scale tile coding.
-    if (cpi->oxcf.ext_tile_debug) {
+    if (cpi->oxcf.tile_cfg.enable_ext_tile_debug) {
       char fn[20] = "./fh";
       fn[4] = cm->current_frame.frame_number / 100 + '0';
       fn[5] = (cm->current_frame.frame_number % 100) / 10 + '0';
@@ -3864,7 +3884,7 @@
   // The TD is now written outside the frame encode loop
 
   // write sequence header obu if KEY_FRAME, preceded by 4-byte size
-  if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
+  if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
     obu_header_size =
         av1_write_obu_header(level_params, OBU_SEQUENCE_HEADER, 0, data);
 
@@ -3885,7 +3905,8 @@
 
   const int write_frame_header =
       (cpi->num_tg > 1 || encode_show_existing_frame(cm));
-  struct aom_write_bit_buffer saved_wb;
+  struct aom_write_bit_buffer saved_wb = { NULL, 0 };
+  size_t length_field = 0;
   if (write_frame_header) {
     // Write Frame Header OBU.
     fh_info.frame_header = data;
@@ -3894,26 +3915,24 @@
     obu_payload_size =
         write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
 
-    const size_t length_field_size =
-        obu_memmove(obu_header_size, obu_payload_size, data);
+    length_field = obu_memmove(obu_header_size, obu_payload_size, data);
     if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
     }
 
     fh_info.obu_header_byte_offset = 0;
-    fh_info.total_length =
-        obu_header_size + obu_payload_size + length_field_size;
+    fh_info.total_length = obu_header_size + obu_payload_size + length_field;
     data += fh_info.total_length;
-
-    // Since length_field_size is determined adaptively after frame header
-    // encoding, saved_wb must be adjusted accordingly.
-    saved_wb.bit_buffer += length_field_size;
   }
 
   if (encode_show_existing_frame(cm)) {
     data_size = 0;
   } else {
+    // Since length_field is determined adaptively after frame header
+    // encoding, saved_wb must be adjusted accordingly.
+    saved_wb.bit_buffer += length_field;
+
     //  Each tile group obu will be preceded by 4-byte size of the tile group
     //  obu
     data_size = write_tiles_in_tg_obus(
diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h
index 45151e2..df35ecc 100644
--- a/av1/encoder/bitstream.h
+++ b/av1/encoder/bitstream.h
@@ -35,6 +35,11 @@
 int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
                             uint8_t *dest);
 
+/*!\brief Pack the bitstream for one frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ */
 int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
                        int *const largest_tile_id);
 
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 5a74567..d77c281 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -9,11 +9,16 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*! \file
+ * Declares various structs used to encode the current partition block.
+ */
 #ifndef AOM_AV1_ENCODER_BLOCK_H_
 #define AOM_AV1_ENCODER_BLOCK_H_
 
+#include "av1/common/blockd.h"
 #include "av1/common/entropymv.h"
 #include "av1/common/entropy.h"
+#include "av1/common/enums.h"
 #include "av1/common/mvref_common.h"
 
 #include "av1/encoder/enc_enums.h"
@@ -27,471 +32,1154 @@
 extern "C" {
 #endif
 
-#define MC_FLOW_BSIZE_1D 16
-#define MC_FLOW_NUM_PELS (MC_FLOW_BSIZE_1D * MC_FLOW_BSIZE_1D)
-#define MAX_MC_FLOW_BLK_IN_SB (MAX_SB_SIZE / MC_FLOW_BSIZE_1D)
+//! Minimum linear dimension of a tpl block
+#define MIN_TPL_BSIZE_1D 16
+//! Maximum number of tpl block in a super block
+#define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D)
+//! Number of intra winner modes kept
 #define MAX_WINNER_MODE_COUNT_INTRA 3
+//! Number of inter winner modes kept
 #define MAX_WINNER_MODE_COUNT_INTER 1
+//! Number of txfm hash records kept for the partition block.
+#define RD_RECORD_BUFFER_LEN 8
+//! Number of txfm hash records kept for the txfm block.
+#define TX_SIZE_RD_RECORD_BUFFER_LEN 256
+
+/*! \brief Superblock level encoder info
+ *
+ * SuperblockEnc stores superblock level information used by the encoder for
+ * more efficient encoding. Currently this is mostly used to store TPL data
+ * for the current superblock.
+ */
 typedef struct {
+  //! Maximum partition size for the sb.
+  BLOCK_SIZE min_partition_size;
+  //! Minimum partition size for the sb.
+  BLOCK_SIZE max_partition_size;
+
+  /*****************************************************************************
+   * \name TPL Info
+   *
+   * Information gathered from tpl_model at tpl block precision for the
+   * superblock to speed up the encoding process..
+   ****************************************************************************/
+  /**@{*/
+  //! Number of TPL blocks in this superblock.
+  int tpl_data_count;
+  //! TPL's estimate of inter cost for each tpl block.
+  int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+  //! TPL's estimate of tpl cost for each tpl block.
+  int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+  //! Motion vectors found by TPL model for each tpl block.
+  int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME];
+  //! TPL's stride for the arrays in this struct.
+  int tpl_stride;
+  /**@}*/
+} SuperBlockEnc;
+
+/*! \brief Stores the best performing modes.
+ */
+typedef struct {
+  //! The mbmi used to reconstruct the winner mode.
   MB_MODE_INFO mbmi;
+  //! Rdstats of the winner mode.
   RD_STATS rd_cost;
+  //! Rdcost of the winner mode
   int64_t rd;
+  //! Luma rate of the winner mode.
   int rate_y;
+  //! Chroma rate of the winner mode.
   int rate_uv;
-  uint8_t color_index_map[64 * 64];
+  //! The color map needed to reconstruct palette mode.
+  uint8_t color_index_map[MAX_SB_SQUARE];
+  //! The current winner mode.
   THR_MODES mode_index;
 } WinnerModeStats;
 
-typedef struct {
-  unsigned int sse;
-  int sum;
-  unsigned int var;
-} DIFF;
-
-enum {
-  NO_TRELLIS_OPT,          // No trellis optimization
-  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
-  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode pass
-  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
-} UENUM1BYTE(TRELLIS_OPT_TYPE);
-
+/*! \brief Each source plane of the current macroblock
+ *
+ * This struct also stores the txfm buffers and quantizer settings.
+ */
 typedef struct macroblock_plane {
+  //! Stores source - pred so the txfm can be computed later
   DECLARE_ALIGNED(32, int16_t, src_diff[MAX_SB_SQUARE]);
+  //! Dequantized coefficients
+  tran_low_t *dqcoeff;
+  //! Quantized coefficients
   tran_low_t *qcoeff;
+  //! Transformed coefficients
   tran_low_t *coeff;
+  //! Location of the end of qcoeff (end of block).
   uint16_t *eobs;
+  //! Contexts used to code the transform coefficients.
   uint8_t *txb_entropy_ctx;
+  //! A buffer containing the source frame.
   struct buf_2d src;
 
-  // Quantizer setings
-  // These are used/accessed only in the quantization process
-  // RDO does not / must not depend on any of these values
-  // All values below share the coefficient scale/shift used in TX
+  /*! \name Quantizer Settings
+   *
+   * \attention These are used/accessed only in the quantization process.
+   * RDO does not and *must not* depend on any of these values.
+   * All values below share the coefficient scale/shift used in TX.
+   */
+  /**@{*/
+  //! Quantization step size used by AV1_XFORM_QUANT_FP.
   const int16_t *quant_fp_QTX;
+  //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_FP.
   const int16_t *round_fp_QTX;
+  //! Quantization step size used by AV1_XFORM_QUANT_B.
   const int16_t *quant_QTX;
-  const int16_t *quant_shift_QTX;
-  const int16_t *zbin_QTX;
+  //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_B.
   const int16_t *round_QTX;
+  //! Scale factor to shift coefficients toward zero. Only used by QUANT_B.
+  const int16_t *quant_shift_QTX;
+  //! Size of the quantization bin around 0. Only Used by QUANT_B
+  const int16_t *zbin_QTX;
+  //! Dequantizer
   const int16_t *dequant_QTX;
+  /**@}*/
 } MACROBLOCK_PLANE;
 
+/*! \brief Costs for encoding the coefficients within a level.
+ *
+ * Covers everything including txb_skip, eob, dc_sign,
+ */
 typedef struct {
+  //! Cost to skip txfm for the current txfm block.
   int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+  /*! \brief Cost for encoding the base_eob of a level.
+   *
+   * Decoder uses base_eob to derive the base_level as base_eob := base_eob+1.
+   */
   int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+  /*! \brief Cost for encoding the base level of a coefficient.
+   *
+   * Decoder derives coeff_base as coeff_base := base_eob + 1.
+   */
   int base_cost[SIG_COEF_CONTEXTS][8];
+  /*! \brief Cost for encoding the last non-zero coefficient.
+   *
+   * Eob is derived from eob_extra at the decoder as eob := eob_extra + 1
+   */
   int eob_extra_cost[EOB_COEF_CONTEXTS][2];
+  //! Cost for encoding the dc_sign
   int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+  //! Cost for encoding an increment to the coefficient
   int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
 } LV_MAP_COEFF_COST;
 
+/*! \brief Costs for encoding the eob.
+ */
 typedef struct {
+  //! eob_cost.
   int eob_cost[2][11];
 } LV_MAP_EOB_COST;
 
+/*! \brief Stores the transforms coefficients for the whole superblock.
+ */
 typedef struct {
-  tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
-  uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
-  // Transform block entropy contexts.
-  // Bits 0~3: txb_skip_ctx; bits 4~5: dc_sign_ctx.
-  uint8_t entropy_ctx[MAX_MB_PLANE]
-                     [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+  //! The transformed coefficients.
+  tran_low_t *tcoeff[MAX_MB_PLANE];
+  //! Where the transformed coefficients end.
+  uint16_t *eobs[MAX_MB_PLANE];
+  /*! \brief Transform block entropy contexts.
+   *
+   * Each element is used as a bit field.
+   * - Bits 0~3: txb_skip_ctx
+   * - Bits 4~5: dc_sign_ctx.
+   */
+  uint8_t *entropy_ctx[MAX_MB_PLANE];
 } CB_COEFF_BUFFER;
 
+/*! \brief Extended mode info derived from mbmi.
+ */
 typedef struct {
   // TODO(angiebird): Reduce the buffer size according to sb_type
+  //! The reference mv list for the current block.
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+  //! The weights used to compute the ref mvs.
   uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
-  int_mv global_mvs[REF_FRAMES];
-  int16_t mode_context[MODE_CTX_REF_FRAMES];
+  //! Number of ref mvs in the drl.
   uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  //! Global mvs
+  int_mv global_mvs[REF_FRAMES];
+  //! Context used to encode the current mode.
+  int16_t mode_context[MODE_CTX_REF_FRAMES];
 } MB_MODE_INFO_EXT;
 
-// Structure to store best mode information at frame level. This
-// frame level information will be used during bitstream preparation stage.
+/*! \brief Stores best extended mode information at frame level.
+ *
+ * The frame level in here is used in bitstream preparation stage. The
+ * information in \ref MB_MODE_INFO_EXT are copied to this struct to save
+ * memory.
+ */
 typedef struct {
+  //! \copydoc MB_MODE_INFO_EXT::ref_mv_stack
   CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE];
+  //! \copydoc MB_MODE_INFO_EXT::weight
   uint16_t weight[USABLE_REF_MV_STACK_SIZE];
-  // TODO(Ravi/Remya): Reduce the buffer size of global_mvs
-  int_mv global_mvs[REF_FRAMES];
-  int cb_offset;
-  int16_t mode_context;
+  //! \copydoc MB_MODE_INFO_EXT::ref_mv_count
   uint8_t ref_mv_count;
+  // TODO(Ravi/Remya): Reduce the buffer size of global_mvs
+  //! \copydoc MB_MODE_INFO_EXT::global_mvs
+  int_mv global_mvs[REF_FRAMES];
+  //! \copydoc MB_MODE_INFO_EXT::mode_context
+  int16_t mode_context;
+  //! Offset of current coding block's coeff buffer relative to the sb.
+  uint16_t cb_offset[PLANE_TYPES];
 } MB_MODE_INFO_EXT_FRAME;
 
+/*! \brief Txfm search results for a partition
+ */
 typedef struct {
-  uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
-  int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
-} PALETTE_BUFFER;
-
-typedef struct {
+  //! Txfm size used if the current mode is intra mode.
   TX_SIZE tx_size;
+  //! Txfm sizes used if the current mode is inter mode.
   TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+  //! Map showing which txfm block skips the txfm process.
   uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  //! Map showing the txfm types for each blcok.
   uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  //! Rd_stats for the whole partition block.
   RD_STATS rd_stats;
+  //! Hash value of the current record.
   uint32_t hash_value;
 } MB_RD_INFO;
 
-#define RD_RECORD_BUFFER_LEN 8
+/*! \brief Hash records of txfm search results for the partition block.
+ */
 typedef struct {
+  //! Circular buffer that stores the txfm search results.
   MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];  // Circular buffer.
+  //! Index to insert the newest \ref TXB_RD_INFO.
   int index_start;
+  //! Number of info stored in this record.
   int num;
-  CRC32C crc_calculator;  // Hash function.
+  //! Hash function
+  CRC32C crc_calculator;
 } MB_RD_RECORD;
 
+/*! \brief Txfm search results for a tx block.
+ */
 typedef struct {
+  //! Distortion after the txfm process
   int64_t dist;
+  //! SSE of the prediction before the txfm process
   int64_t sse;
+  //! Rate used to encode the txfm.
   int rate;
+  //! Location of the end of non-zero entries.
   uint16_t eob;
+  //! Transform type used on the current block.
   TX_TYPE tx_type;
+  //! Unknown usage
   uint16_t entropy_context;
+  //! Context used to code the coefficients.
   uint8_t txb_entropy_ctx;
+  //! Whether the current info block contains  valid info
   uint8_t valid;
-  uint8_t fast;  // This is not being used now.
+  //! Unused
+  uint8_t fast;
+  //! Whether trellis optimization is done.
   uint8_t perform_block_coeff_opt;
 } TXB_RD_INFO;
 
-#define TX_SIZE_RD_RECORD_BUFFER_LEN 256
+/*! \brief Hash records of txfm search result for each tx block.
+ */
 typedef struct {
+  //! The hash values.
   uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];
+  //! The txfm search results
   TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN];
+  //! Index to insert the newest \ref TXB_RD_INFO.
   int index_start;
+  //! Number of info stored in this record.
   int num;
 } TXB_RD_RECORD;
 
-typedef struct tx_size_rd_info_node {
-  TXB_RD_INFO *rd_info_array;  // Points to array of size TX_TYPES.
-  struct tx_size_rd_info_node *children[4];
-} TXB_RD_INFO_NODE;
-
-// Simple translation rd state for prune_comp_search_by_single_result
-typedef struct {
-  RD_STATS rd_stats;
-  RD_STATS rd_stats_y;
-  RD_STATS rd_stats_uv;
-  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t skip;
-  uint8_t disable_skip;
-  uint8_t early_skipped;
-} SimpleRDState;
-
-// 4: NEAREST, NEW, NEAR, GLOBAL
-#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
-
+//! Number of compound rd stats
 #define MAX_COMP_RD_STATS 64
+/*! \brief Rdcost stats in compound mode.
+ */
 typedef struct {
+  //! Rate of the compound modes.
   int32_t rate[COMPOUND_TYPES];
+  //! Distortion of the compound modes.
   int64_t dist[COMPOUND_TYPES];
+  //! Estimated rate of the compound modes.
   int32_t model_rate[COMPOUND_TYPES];
+  //! Estimated distortion of the compound modes.
   int64_t model_dist[COMPOUND_TYPES];
+  //! Rate need to send the mask type.
   int comp_rs2[COMPOUND_TYPES];
+  //! Motion vector for each predictor.
   int_mv mv[2];
+  //! Ref frame for each predictor.
   MV_REFERENCE_FRAME ref_frames[2];
+  //! Current prediction mode.
   PREDICTION_MODE mode;
+  //! Current interpolation filter.
   int_interpfilters filter;
+  //! Refmv index in the drl.
   int ref_mv_idx;
+  //! Whether the predictors are GLOBALMV.
   int is_global[2];
+  //! Current parameters for interinter mode.
   INTERINTER_COMPOUND_DATA interinter_comp;
 } COMP_RD_STATS;
 
-// Struct for buffers used by av1_compound_type_rd() function.
-// For sizes and alignment of these arrays, refer to
-// alloc_compound_type_rd_buffers() function.
+/*! \brief Contains buffers used to speed up rdopt for obmc.
+ *
+ * See the comments for calc_target_weighted_pred for details.
+ */
 typedef struct {
+  /*! \brief A new source weighted with the above and left predictors.
+   *
+   * Used to efficiently construct multiple obmc predictors during rdopt.
+   */
+  int32_t *wsrc;
+  /*! \brief A new mask constructed from the original horz/vert mask.
+   *
+   * \copydetails wsrc
+   */
+  int32_t *mask;
+  /*! \brief Prediction from the up predictor.
+   *
+   * Used to build the obmc predictor.
+   */
+  uint8_t *above_pred;
+  /*! \brief Prediction from the up predictor.
+   *
+   * \copydetails above_pred
+   */
+  uint8_t *left_pred;
+} OBMCBuffer;
+
+/*! \brief Contains color maps used in palette mode.
+ */
+typedef struct {
+  //! The best color map found.
+  uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+  //! A temporary buffer used for k-means clustering.
+  int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+} PALETTE_BUFFER;
+
+/*! \brief Contains buffers used by av1_compound_type_rd()
+ *
+ * For sizes and alignment of these arrays, refer to
+ * alloc_compound_type_rd_buffers() function.
+ */
+typedef struct {
+  //! First prediction.
   uint8_t *pred0;
+  //! Second prediction.
   uint8_t *pred1;
-  int16_t *residual1;          // src - pred1
-  int16_t *diff10;             // pred1 - pred0
-  uint8_t *tmp_best_mask_buf;  // backup of the best segmentation mask
+  //! Source - first prediction.
+  int16_t *residual1;
+  //! Second prediction - first prediction.
+  int16_t *diff10;
+  //! Backup of the best segmentation mask.
+  uint8_t *tmp_best_mask_buf;
 } CompoundTypeRdBuffers;
 
-enum {
-  MV_COST_ENTROPY,    // Use the entropy rate of the mv as the cost
-  MV_COST_L1_LOWRES,  // Use the l1 norm of the mv as the cost (<480p)
-  MV_COST_L1_MIDRES,  // Use the l1 norm of the mv as the cost (>=480p)
-  MV_COST_L1_HDRES,   // Use the l1 norm of the mv as the cost (>=720p)
-  MV_COST_NONE        // Use 0 as as cost irrespective of the current mv
-} UENUM1BYTE(MV_COST_TYPE);
-
-struct inter_modes_info;
-typedef struct macroblock MACROBLOCK;
-struct macroblock {
-  struct macroblock_plane plane[MAX_MB_PLANE];
-
-  // Determine if one would go with reduced complexity transform block
-  // search model to select prediction modes, or full complexity model
-  // to select transform kernel.
-  int rd_model;
-
-  // prune_comp_search_by_single_result (3:MAX_REF_MV_SEARCH)
-  SimpleRDState simple_rd_state[SINGLE_REF_MODES][3];
-
-  // Inter macroblock RD search info.
-  MB_RD_RECORD mb_rd_record;
-
-  // Inter transform block RD search info. for square TX sizes.
-  TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)];
-  TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)];
-  TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)];
-  TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)];
-
-  // Intra transform block RD search info. for square TX sizes.
-  TXB_RD_RECORD txb_rd_record_intra;
-
-  MACROBLOCKD e_mbd;
-  MB_MODE_INFO_EXT *mbmi_ext;
-  MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame;
-  // Array of mode stats for winner mode processing
-  WinnerModeStats winner_mode_stats[AOMMAX(MAX_WINNER_MODE_COUNT_INTRA,
-                                           MAX_WINNER_MODE_COUNT_INTER)];
-  int winner_mode_count;
-  int skip_block;
-  int qindex;
-
-  // The equivalent error at the current rdmult of one whole bit (not one
-  // bitcost unit).
-  int errorperbit;
-  // The equivalend SAD error of one (whole) bit at the current quantizer
-  // for large blocks.
-  int sadperbit;
-  int rdmult;
-  int mb_energy;
-  int sb_energy_level;
-
-  unsigned int txb_split_count;
-#if CONFIG_SPEED_STATS
-  unsigned int tx_search_count;
-#endif  // CONFIG_SPEED_STATS
-
-  // These are set to their default values at the beginning, and then adjusted
-  // further in the encoding process.
-  BLOCK_SIZE min_partition_size;
-  BLOCK_SIZE max_partition_size;
-
-  unsigned int max_mv_context[REF_FRAMES];
-  unsigned int source_variance;
-  unsigned int simple_motion_pred_sse;
-  unsigned int pred_sse[REF_FRAMES];
-  int pred_mv_sad[REF_FRAMES];
-  int best_pred_mv_sad;
-
-  int nmv_vec_cost[MV_JOINTS];
-  int nmv_costs[2][MV_VALS];
-  int nmv_costs_hp[2][MV_VALS];
-  int *nmvcost[2];
-  int *nmvcost_hp[2];
-  int **mv_cost_stack;
-
-  int32_t *wsrc_buf;
-  int32_t *mask_buf;
-  uint8_t *above_pred_buf;
-  uint8_t *left_pred_buf;
-
-  PALETTE_BUFFER *palette_buffer;
-  CompoundTypeRdBuffers comp_rd_buffer;
-
-  CONV_BUF_TYPE *tmp_conv_dst;
-  uint8_t *tmp_obmc_bufs[2];
-
-  FRAME_CONTEXT *row_ctx;
-  // This context will be used to update color_map_cdf pointer which would be
-  // used during pack bitstream. For single thread and tile-multithreading case
-  // this ponter will be same as xd->tile_ctx, but for the case of row-mt:
-  // xd->tile_ctx will point to a temporary context while tile_pb_ctx will point
-  // to the accurate tile context.
-  FRAME_CONTEXT *tile_pb_ctx;
-
-  struct inter_modes_info *inter_modes_info;
-
-  // Contains the hash table, hash function, and buffer used for intrabc
-  IntraBCHashInfo intrabc_hash_info;
-
-  // These define limits to motion vector components to prevent them
-  // from extending outside the UMV borders
-  FullMvLimits mv_limits;
-
-  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-
-  // Force the coding block to skip transform and quantization.
-  int force_skip;
-  int skip_cost[SKIP_CONTEXTS][2];
-
-  int skip_mode;  // 0: off; 1: on
-  int skip_mode_cost[SKIP_CONTEXTS][2];
-
-  LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
-  LV_MAP_EOB_COST eob_costs[7][2];
-  uint16_t cb_offset;
-
-  // mode costs
-  int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
-
-  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
-  int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
-  int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
-  int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
-  int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
-
-  int comp_inter_cost[COMP_INTER_CONTEXTS][2];
-  int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
-  int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
-                        [CDF_SIZE(COMP_REFERENCE_TYPES)];
-  int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
-                       [CDF_SIZE(2)];
-  // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or
-  // GOLDEN_FRAME) in bidir-comp mode.
-  int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
-  // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or
-  // BWDREF_FRAME) in bidir-comp mode.
-  int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
-  int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
-  int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
-  int wedge_idx_cost[BLOCK_SIZES_ALL][16];
-  int interintra_cost[BLOCK_SIZE_GROUPS][2];
-  int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
-  int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
-  int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
-  int motion_mode_cost1[BLOCK_SIZES_ALL][2];
-  int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
-  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  int filter_intra_cost[BLOCK_SIZES_ALL][2];
-  int filter_intra_mode_cost[FILTER_INTRA_MODES];
-  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-  int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
-  int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
-  int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
-  int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                          [PALETTE_COLORS];
-  int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                           [PALETTE_COLORS];
-  int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
-  int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
-  // The rate associated with each alpha codeword
-  int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
-  int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
-  int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
-  int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
-  int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
-                         [TX_TYPES];
-  int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
-  int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
-  int wiener_restore_cost[2];
-  int sgrproj_restore_cost[2];
-  int intrabc_cost[2];
-
-  // Used to store sub partition's choices.
-  MV pred_mv[REF_FRAMES];
-
-  // Ref frames that are selected by square partition blocks within a super-
-  // block, in MI resolution. They can be used to prune ref frames for
-  // rectangular blocks.
-  int picked_ref_frames_mask[32 * 32];
-
-  // use default transform and skip transform type search for intra modes
-  int use_default_intra_tx_type;
-  // use default transform and skip transform type search for inter modes
-  int use_default_inter_tx_type;
-  int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
-  int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
-  int must_find_valid_partition;
-  int recalc_luma_mc_data;  // Flag to indicate recalculation of MC data during
-                            // interpolation filter search
-  int prune_mode;
-  uint32_t tx_domain_dist_threshold;
-  int use_transform_domain_distortion;
-  // The likelihood of an edge existing in the block (using partial Canny edge
-  // detection). For reference, 556 is the value returned for a solid
-  // vertical black/white edge.
-  uint16_t edge_strength;
-  // The strongest edge strength seen along the x/y axis.
-  uint16_t edge_strength_x;
-  uint16_t edge_strength_y;
-  uint8_t compound_idx;
-
-  // [Saved stat index]
-  COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
-  int comp_rd_stats_idx;
-
-  CB_COEFF_BUFFER *cb_coef_buff;
-
-  // Threshold used to decide the applicability of R-D optimization of
-  // quantized coeffs
-  uint32_t coeff_opt_dist_threshold;
-
+/*! \brief Holds some parameters related to partitioning schemes in AV1.
+ */
+// TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE
+typedef struct {
 #if !CONFIG_REALTIME_ONLY
+  // The following 4 parameters are used for cnn-based partitioning on intra
+  // frame.
+  /*! \brief Current index on the partition block quad tree.
+   *
+   * Used to index into the cnn buffer for partition decision.
+   */
   int quad_tree_idx;
+  //! Whether the CNN buffer contains valid output.
   int cnn_output_valid;
+  //! A buffer used by our segmentation CNN for intra-frame partitioning.
   float cnn_buffer[CNN_OUT_BUF_SIZE];
+  //! log of the quantization parameter of the ancestor BLOCK_64X64.
   float log_q;
 #endif
-  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
-  // 0 - 128x128
-  // 1-2 - 128x64
-  // 3-4 - 64x128
-  // 5-8 - 64x64
-  // 9-16 - 64x32
-  // 17-24 - 32x64
-  // 25-40 - 32x32
-  // 41-104 - 16x16
-  uint8_t variance_low[105];
-  uint8_t content_state_sb;
-  // Strong color activity detection. Used in REALTIME coding mode to enhance
-  // the visual quality at the boundary of moving color objects.
-  uint8_t color_sensitivity[2];
-  int nonrd_prune_ref_frame_search;
 
-  // Used to control the tx size search evaluation for mode processing
-  // (normal/winner mode)
-  int tx_size_search_method;
-  // This tx_mode_search_type is used internally by the encoder, and is not
-  // written to the bitstream. It determines what kind of tx_mode should be
-  // searched. For example, we might set it to TX_MODE_LARGEST to find a good
-  // candidate, then use TX_MODE_SELECT on it
+  /*! \brief Variance of the subblocks in the superblock.
+   *
+   * This is used by rt mode for variance based partitioning.
+   * The indices corresponds to the following block sizes:
+   * -   0    - 128x128
+   * -  1-2   - 128x64
+   * -  3-4   -  64x128
+   * -  5-8   -  64x64
+   * -  9-16  -  64x32
+   * - 17-24  -  32x64
+   * - 25-40  -  32x32
+   * - 41-104 -  16x16
+   */
+  uint8_t variance_low[105];
+} PartitionSearchInfo;
+
+/*! \brief Defines the parameters used to perform txfm search.
+ *
+ * For the most part, this determines how various speed features are used.
+ */
+typedef struct {
+  /*! \brief Whether to limit the intra txfm search type to the default txfm.
+   *
+   * This could either be a result of either sequence parameter or speed
+   * features.
+   */
+  int use_default_intra_tx_type;
+  /*! \brief Whether to limit the inter txfm search type to the default txfm.
+   *
+   * \copydetails use_default_intra_tx_type
+   */
+  int use_default_inter_tx_type;
+
+  //! Whether to prune 2d transforms based on 1d transform results.
+  int prune_2d_txfm_mode;
+
+  /*! \brief Variable from \ref WinnerModeParams based on current eval mode.
+   *
+   * See the documentation for \ref WinnerModeParams for more detail.
+   */
+  unsigned int coeff_opt_dist_threshold;
+  //! \copydoc coeff_opt_dist_threshold
+  unsigned int coeff_opt_satd_threshold;
+  //! \copydoc coeff_opt_dist_threshold
+  unsigned int tx_domain_dist_threshold;
+  //! \copydoc coeff_opt_dist_threshold
+  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+  //! \copydoc coeff_opt_dist_threshold
+  unsigned int use_transform_domain_distortion;
+  //! \copydoc coeff_opt_dist_threshold
+  unsigned int skip_txfm_level;
+
+  /*! \brief How to search for the optimal tx_size
+   *
+   * If ONLY_4X4, use TX_4X4; if TX_MODE_LARGEST, use the largest tx_size for
+   * the current partition block; if TX_MODE_SELECT, search through the whole
+   * tree.
+   *
+   * \attention
+   * Although this looks suspicious similar to a bitstream element, this
+   * tx_mode_search_type is only used internally by the encoder, and is *not*
+   * written to the bitstream. It determines what kind of tx_mode would be
+   * searched. For example, we might set it to TX_MODE_LARGEST to find a good
+   * candidate, then code it as TX_MODE_SELECT.
+   */
   TX_MODE tx_mode_search_type;
 
-  // Used to control aggressiveness of skip flag prediction for mode processing
-  // (normal/winner mode)
-  unsigned int predict_skip_level;
+  /*!
+   * Flag to enable/disable DC block prediction.
+   */
+  unsigned int predict_dc_level;
+} TxfmSearchParams;
 
-  // Copy out this SB's TPL block stats.
-  int valid_cost_b;
-  int64_t inter_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
-  int64_t intra_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
-  int_mv mv_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]
-             [INTER_REFS_PER_FRAME];
-  int cost_stride;
+/*!\cond */
+#define MAX_NUM_8X8_TXBS ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1))
+#define MAX_NUM_16X16_TXBS ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2))
+#define MAX_NUM_32X32_TXBS ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3))
+#define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4))
+/*!\endcond */
 
-  // The type of mv cost used during motion search
-  MV_COST_TYPE mv_cost_type;
+/*! \brief Stores various encoding/search decisions related to txfm search.
+ *
+ * This struct contains a cache of previous txfm results, and some buffers for
+ * the current txfm decision.
+ */
+typedef struct {
+  //! Whether to skip transform and quantization on a partition block level.
+  int skip_txfm;
 
-  uint8_t search_ref_frame[REF_FRAMES];
+  /*! \brief Whether to skip transform and quantization on a txfm block level.
+   *
+   * Skips transform and quantization on a transform block level inside the
+   * current partition block. Each element of this array is used as a bit-field.
+   * So for example, the we are skipping on the luma plane, then the last bit
+   * would be set to 1.
+   */
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
-#if CONFIG_AV1_HIGHBITDEPTH
-  void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride);
-  void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride,
-                       int eob);
-#else
-  void (*fwd_txfm4x4)(const int16_t *input, int16_t *output, int stride);
-  void (*inv_txfm_add)(const int16_t *input, uint8_t *dest, int stride,
-                       int eob);
-#endif
-};
+  /*! \brief Transform types inside the partition block
+   *
+   * Keeps a record of what kind of transform to use for each of the transform
+   * block inside the partition block.
+   * \attention The buffer here is *never* directly used. Instead, this just
+   * allocates the memory for MACROBLOCKD::tx_type_map during rdopt on the
+   * partition block. So if we need to save memory, we could move the allocation
+   * to pick_sb_mode instead.
+   */
+  uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
-// Only consider full SB, MC_FLOW_BSIZE_1D = 16.
-static INLINE int tpl_blocks_in_sb(BLOCK_SIZE bsize) {
-  switch (bsize) {
-    case BLOCK_64X64: return 16;
-    case BLOCK_128X128: return 64;
-    default: assert(0);
-  }
-  return -1;
-}
+  /** \name Txfm hash records
+   * Hash records of the transform search results based on the residue. There
+   * are two main types here:
+   * - MB_RD_RECORD: records a whole *partition block*'s inter-mode txfm result.
+   *   Since this operates on the partition block level, this can give us a
+   *   whole txfm partition tree.
+   * - TXB_RD_RECORD: records a txfm search result within a transform blcok
+   *   itself. This operates on txb level only and onlyt appplies to square
+   *   txfms.
+   */
+  /**@{*/
+  //! Txfm hash record for the whole coding block.
+  MB_RD_RECORD mb_rd_record;
 
+  //! Inter mode txfm hash record for TX_8X8 blocks.
+  TXB_RD_RECORD txb_rd_record_8X8[MAX_NUM_8X8_TXBS];
+  //! Inter mode txfm hash record for TX_16X16 blocks.
+  TXB_RD_RECORD txb_rd_record_16X16[MAX_NUM_16X16_TXBS];
+  //! Inter mode txfm hash record for TX_32X32 blocks.
+  TXB_RD_RECORD txb_rd_record_32X32[MAX_NUM_32X32_TXBS];
+  //! Inter mode txfm hash record for TX_64X64 blocks.
+  TXB_RD_RECORD txb_rd_record_64X64[MAX_NUM_64X64_TXBS];
+  //! Intra mode txfm hash record for square tx blocks.
+  TXB_RD_RECORD txb_rd_record_intra;
+  /**@}*/
+
+  /*! \brief Number of txb splits.
+   *
+   * Keep track of how many times we've used split tx partition for transform
+   * blocks. Somewhat misleadingly, this parameter doesn't actually keep track
+   * of the count of the current block. Instead, it's a cumulative count across
+   * of the whole frame. The main usage is that if txb_split_count is zero, then
+   * we can signal TX_MODE_LARGEST at frame level.
+   */
+  // TODO(chiyotsai@google.com): Move this to a more appropriate location such
+  // as ThreadData.
+  unsigned int txb_split_count;
+#if CONFIG_SPEED_STATS
+  //! For debugging. Used to check how many txfm searches we are doing.
+  unsigned int tx_search_count;
+#endif  // CONFIG_SPEED_STATS
+} TxfmSearchInfo;
+#undef MAX_NUM_8X8_TXBS
+#undef MAX_NUM_16X16_TXBS
+#undef MAX_NUM_32X32_TXBS
+#undef MAX_NUM_64X64_TXBS
+
+/*! \brief Holds the entropy costs for various modes sent to the bitstream.
+ *
+ * \attention This does not include the costs for mv and transformed
+ * coefficients.
+ */
+typedef struct {
+  /*****************************************************************************
+   * \name Partition Costs
+   ****************************************************************************/
+  /**@{*/
+  //! Cost for coding the partition.
+  int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Intra Costs: General
+   ****************************************************************************/
+  /**@{*/
+  //! Luma mode cost for inter frame.
+  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  //! Luma mode cost for intra frame.
+  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  //! Chroma mode cost
+  int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+  //! filter_intra_cost
+  int filter_intra_cost[BLOCK_SIZES_ALL][2];
+  //! filter_intra_mode_cost
+  int filter_intra_mode_cost[FILTER_INTRA_MODES];
+  //! angle_delta_cost
+  int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+
+  //! Rate rate associated with each alpha codeword
+  int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Intra Costs: Screen Contents
+   ****************************************************************************/
+  /**@{*/
+  //! intrabc_cost
+  int intrabc_cost[2];
+
+  //! palette_y_size_cost
+  int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  //! palette_uv_size_cost
+  int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  //! palette_y_color_cost
+  int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                          [PALETTE_COLORS];
+  //! palette_uv_color_cost
+  int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                           [PALETTE_COLORS];
+  //! palette_y_mode_cost
+  int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+  //! palette_uv_mode_cost
+  int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: MV Modes
+   ****************************************************************************/
+  /**@{*/
+  //! skip_mode_cost
+  int skip_mode_cost[SKIP_MODE_CONTEXTS][2];
+  //! newmv_mode_cost
+  int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+  //! zeromv_mode_cost
+  int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
+  //! refmv_mode_cost
+  int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+  //! drl_mode_cost0
+  int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: Ref Frame Types
+   ****************************************************************************/
+  /**@{*/
+  //! single_ref_cost
+  int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+  //! comp_inter_cost
+  int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+  //! comp_ref_type_cost
+  int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
+                        [CDF_SIZE(COMP_REFERENCE_TYPES)];
+  //! uni_comp_ref_cost
+  int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+                       [CDF_SIZE(2)];
+  /*! \brief Cost for signaling ref_frame[0] in bidir-comp mode
+   *
+   * Includes LAST_FRAME, LAST2_FRAME, LAST3_FRAME, and GOLDEN_FRAME.
+   */
+  int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
+  /*! \brief Cost for signaling ref_frame[1] in bidir-comp mode
+   *
+   * Includes ALTREF_FRAME, ALTREF2_FRAME, and BWDREF_FRAME.
+   */
+  int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: Compound Types
+   ****************************************************************************/
+  /**@{*/
+  //! intra_inter_cost
+  int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+  //! inter_compound_mode_cost
+  int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+  //! compound_type_cost
+  int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+  //! wedge_idx_cost
+  int wedge_idx_cost[BLOCK_SIZES_ALL][16];
+  //! interintra_cost
+  int interintra_cost[BLOCK_SIZE_GROUPS][2];
+  //! wedge_interintra_cost
+  int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
+  //! interintra_mode_cost
+  int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: Compound Masks
+   ****************************************************************************/
+  /**@{*/
+  //! comp_idx_cost
+  int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
+  //! comp_group_idx_cost
+  int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: Motion Modes/Filters
+   ****************************************************************************/
+  /**@{*/
+  //! motion_mode_cost
+  int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
+  //! motion_mode_cost1
+  int motion_mode_cost1[BLOCK_SIZES_ALL][2];
+  //! switchable_interp_costs
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Txfm Mode Costs
+   ****************************************************************************/
+  /**@{*/
+  //! skip_txfm_cost
+  int skip_txfm_cost[SKIP_CONTEXTS][2];
+  //! tx_size_cost
+  int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+  //! txfm_partition_cost
+  int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
+  //! inter_tx_type_costs
+  int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  //! intra_tx_type_costs
+  int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                         [TX_TYPES];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Restoration Mode Costs
+   ****************************************************************************/
+  /**@{*/
+  //! switchable_restore_cost
+  int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+  //! wiener_restore_cost
+  int wiener_restore_cost[2];
+  //! sgrproj_restore_cost
+  int sgrproj_restore_cost[2];
+  /**@}*/
+} ModeCosts;
+
+/*! \brief Holds mv costs for encoding and motion search.
+ */
+typedef struct {
+  /*****************************************************************************
+   * \name Rate to Distortion Multipliers
+   ****************************************************************************/
+  /**@{*/
+  //! A multiplier that converts mv cost to l2 error.
+  int errorperbit;
+  //! A multiplier that converts mv cost to l1 error.
+  int sadperbit;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Encoding Costs
+   * Here are the entropy costs needed to encode a given mv.
+   * \ref nmv_cost_alloc and \ref nmv_cost_hp_alloc are two arrays that holds
+   * the memory for holding the mv cost. But since the motion vectors can be
+   * negative, we shift them to the middle and store the resulting pointer in
+   * \ref nmv_cost and \ref nmv_cost_hp for easier referencing. Finally, \ref
+   * mv_cost_stack points to the \ref nmv_cost with the mv precision we are
+   * currently working with. In essence, only \ref mv_cost_stack is needed for
+   * motion search, the other can be considered private.
+   ****************************************************************************/
+  /**@{*/
+  //! Costs for coding the zero components.
+  int nmv_joint_cost[MV_JOINTS];
+
+  //! Allocates memory for 1/4-pel motion vector costs.
+  int nmv_cost_alloc[2][MV_VALS];
+  //! Allocates memory for 1/8-pel motion vector costs.
+  int nmv_cost_hp_alloc[2][MV_VALS];
+  //! Points to the middle of \ref nmv_cost_alloc
+  int *nmv_cost[2];
+  //! Points to the middle of \ref nmv_cost_hp_alloc
+  int *nmv_cost_hp[2];
+  //! Points to the nmv_cost_hp in use.
+  int **mv_cost_stack;
+  /**@}*/
+} MvCosts;
+
+/*! \brief Holds the costs needed to encode the coefficients
+ */
+typedef struct {
+  //! Costs for coding the coefficients.
+  LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+  //! Costs for coding the eobs.
+  LV_MAP_EOB_COST eob_costs[7][2];
+} CoeffCosts;
+
+/*!\cond */
+// 4: NEAREST, NEW, NEAR, GLOBAL
+#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
+/*!\endcond */
+struct inter_modes_info;
+
+/*! \brief Holds the motion samples for warp motion model estimation
+ */
+typedef struct {
+  //! Number of samples.
+  int num;
+  //! Sample locations in current frame.
+  int pts[16];
+  //! Sample location in the reference frame.
+  int pts_inref[16];
+} WARP_SAMPLE_INFO;
+
+/*!\cond */
+typedef enum {
+  kInvalid = 0,
+  kLowSad = 1,
+  kMedSad = 2,
+  kHighSad = 3
+} SOURCE_SAD;
+
+typedef struct {
+  SOURCE_SAD source_sad;
+  int lighting_change;
+  int low_sumdiff;
+} CONTENT_STATE_SB;
+/*!\endcond */
+
+/*! \brief Encoder's parameters related to the current coding block.
+ *
+ * This struct contains most of the information the encoder needs to encode the
+ * current coding block. This includes the src and pred buffer, a copy of the
+ * decoder's view of the current block, the txfm coefficients. This struct also
+ * contains various buffers and data used to speed up the encoding process.
+ */
+typedef struct macroblock {
+  /*****************************************************************************
+   * \name Source, Buffers and Decoder
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Each of the encoding plane.
+   *
+   * An array holding the src buffer for each of plane of the current block. It
+   * also contains the txfm and quantized txfm coefficients.
+   */
+  struct macroblock_plane plane[MAX_MB_PLANE];
+
+  /*! \brief Decoder's view of current coding block.
+   *
+   * Contains the encoder's copy of what the decoder sees in the current block.
+   * Most importantly, this struct contains pointers to mbmi that is used in
+   * final bitstream packing.
+   */
+  MACROBLOCKD e_mbd;
+
+  /*! \brief Derived coding information.
+   *
+   * Contains extra information not transmitted in the bitstream but are
+   * derived. For example, this contains the stack of ref_mvs.
+   */
+  MB_MODE_INFO_EXT mbmi_ext;
+
+  /*! \brief Finalized mbmi_ext for the whole frame.
+   *
+   * Contains the finalized info in mbmi_ext that gets used at the frame level
+   * for bitstream packing.
+   */
+  MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame;
+
+  //! Entropy context for the current row.
+  FRAME_CONTEXT *row_ctx;
+  /*! \brief Entropy context for the current tile.
+   *
+   * This context will be used to update color_map_cdf pointer which would be
+   * used during pack bitstream. For single thread and tile-multithreading case
+   * this pointer will be same as xd->tile_ctx, but for the case of row-mt:
+   * xd->tile_ctx will point to a temporary context while tile_pb_ctx will point
+   * to the accurate tile context.
+   */
+  FRAME_CONTEXT *tile_pb_ctx;
+
+  /*! \brief Buffer of transformed coefficients
+   *
+   * Points to cb_coef_buff in the AV1_COMP struct, which contains the finalized
+   * coefficients. This is here to conveniently copy the best coefficients to
+   * frame level for bitstream packing. Since CB_COEFF_BUFFER is allocated on a
+   * superblock level, we need to combine it with cb_offset to get the proper
+   * position for the current coding block.
+   */
+  CB_COEFF_BUFFER *cb_coef_buff;
+  //! Offset of current coding block's coeff buffer relative to the sb.
+  uint16_t cb_offset[PLANE_TYPES];
+
+  //! Modified source and masks used for fast OBMC search.
+  OBMCBuffer obmc_buffer;
+  //! Buffer to store the best palette map.
+  PALETTE_BUFFER *palette_buffer;
+  //! Buffer used for compound_type_rd().
+  CompoundTypeRdBuffers comp_rd_buffer;
+  //! Buffer to store convolution during averaging process in compound mode.
+  CONV_BUF_TYPE *tmp_conv_dst;
+
+  /*! \brief Temporary buffer to hold prediction.
+   *
+   * Points to a buffer that is used to hold temporary prediction results. This
+   * is used in two ways:
+   * - This is a temporary buffer used to pingpong the prediction in
+   *   handle_inter_mode.
+   * - xd->tmp_obmc_bufs also points to this buffer, and is used in ombc
+   *   prediction.
+   */
+  uint8_t *tmp_pred_bufs[2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Rdopt Costs
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Quantization index for the current partition block.
+   *
+   * This is used to as the index to find quantization parameter for luma and
+   * chroma transformed coefficients.
+   */
+  int qindex;
+
+  /*! \brief Difference between frame-level qindex and current qindex.
+   *
+   *  This is used to track whether a non-zero delta for qindex is used at least
+   *  once in the current frame.
+   */
+  int delta_qindex;
+
+  /*! \brief Rate-distortion multiplier.
+   *
+   * The rd multiplier used to determine the rate-distortion trade-off. This is
+   * roughly proportional to the inverse of q-index for a given frame, but this
+   * can be manipulated for better rate-control. For example, in tune_ssim
+   * mode, this is scaled by a factor related to the variance of the current
+   * block.
+   */
+  int rdmult;
+
+  //! Energy in the current source coding block. Used to calculate \ref rdmult
+  int mb_energy;
+  //! Energy in the current source superblock. Used to calculate \ref rdmult
+  int sb_energy_level;
+
+  //! The rate needed to signal a mode to the bitstream.
+  ModeCosts mode_costs;
+
+  //! The rate needed to encode a new motion vector to the bitstream and some
+  //! multipliers for motion search.
+  MvCosts mv_costs;
+
+  //! The rate needed to signal the txfm coefficients to the bitstream.
+  CoeffCosts coeff_costs;
+  /**@}*/
+
+  /******************************************************************************
+   * \name Segmentation
+   *****************************************************************************/
+  /**@{*/
+  /*! \brief Skip mode for the segment
+   *
+   * A syntax element of the segmentation mode. In skip_block mode, all mvs are
+   * set 0 and all txfms are skipped.
+   */
+  int seg_skip_block;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Superblock
+   ****************************************************************************/
+  /**@{*/
+  //! Information on a whole superblock level.
+  // TODO(chiyotsai@google.com): Refactor this out of macroblock
+  SuperBlockEnc sb_enc;
+
+  /*! \brief Characteristics of the current superblock.
+   *
+   *  Characteristics like whether the block has high sad, low sad, etc. This is
+   *  only used by av1 realtime mode.
+   */
+  CONTENT_STATE_SB content_state_sb;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Reference Frame Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Sum absolute distortion of the predicted mv for each ref frame.
+   *
+   * This is used to measure how viable a reference frame is.
+   */
+  int pred_mv_sad[REF_FRAMES];
+  //! The minimum of \ref pred_mv_sad.
+  int best_pred_mv_sad;
+
+  /*! \brief Disables certain ref frame pruning based on tpl.
+   *
+   * Determines whether a given ref frame is "good" based on data from the TPL
+   * model. If so, this stops selective_ref frame from pruning the given ref
+   * frame at block level.
+   */
+  uint8_t tpl_keep_ref_frame[REF_FRAMES];
+
+  /*! \brief Warp motion samples buffer.
+   *
+   * Store the motion samples used for warp motion.
+   */
+  WARP_SAMPLE_INFO warp_sample_info[REF_FRAMES];
+
+  /*! \brief Reference frames picked by the square subblocks in a superblock.
+   *
+   * Keeps track of ref frames that are selected by square partition blocks
+   * within a superblock, in MI resolution. They can be used to prune ref frames
+   * for rectangular blocks.
+   */
+  int picked_ref_frames_mask[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  /*! \brief Prune ref frames in real-time mode.
+   *
+   * Determines whether to prune reference frames in real-time mode. For the
+   * most part, this is the same as nonrd_prune_ref_frame_search in
+   * cpi->sf.rt_sf.nonrd_prune_ref_frame_search, but this can be selectively
+   * turned off if the only frame available is GOLDEN_FRAME.
+   */
+  int nonrd_prune_ref_frame_search;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Partition Search
+   ****************************************************************************/
+  /**@{*/
+  //! Stores some partition-search related buffers.
+  PartitionSearchInfo part_search_info;
+
+  /*! \brief Whether to disable some features to force a mode in current block.
+   *
+   * In some cases, our speed features can be overly aggressive and remove all
+   * modes search in the superblock. When this happens, we set
+   * must_find_valid_partition to 1 to reduce the number of speed features, and
+   * recode the superblock again.
+   */
+  int must_find_valid_partition;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Prediction Mode Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Inter skip mode.
+   *
+   * Skip mode tries to use the closest forward and backward references for
+   * inter prediction. Skip here means to skip transmitting the reference
+   * frames, not to be confused with skip_txfm.
+   */
+  int skip_mode;
+
+  /*! \brief Factors used for rd-thresholding.
+   *
+   * Determines a rd threshold to determine whether to continue searching the
+   * current mode. If the current best rd is already <= threshold, then we skip
+   * the current mode.
+   */
+  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+
+  /*! \brief Tracks the winner modes in the current coding block.
+   *
+   * Winner mode is a two-pass strategy to find the best prediction mode. In the
+   * first pass, we search the prediction modes with a limited set of txfm
+   * options, and keep the top modes. These modes are called the winner modes.
+   * In the second pass, we retry the winner modes with more thorough txfm
+   * options.
+   */
+  WinnerModeStats winner_mode_stats[AOMMAX(MAX_WINNER_MODE_COUNT_INTRA,
+                                           MAX_WINNER_MODE_COUNT_INTER)];
+  //! Tracks how many winner modes there are.
+  int winner_mode_count;
+
+  /*! \brief The model used for rd-estimation to avoid txfm
+   *
+   * These are for inter_mode_rd_model_estimation, which is another two pass
+   * approach. In this speed feature, we collect data in the first couple frames
+   * to build an rd model to estimate the rdcost of a prediction model based on
+   * the residue error. Once enough data is collected, this speed feature uses
+   * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass find the best transform for the mode.
+   * Determines if one would go with reduced complexity transform block
+   * search model to select prediction modes, or full complexity model
+   * to select transform kernel.
+   */
+  TXFM_RD_MODEL rd_model;
+
+  /*! \brief Stores the inter mode information needed to build an rd model.
+   *
+   * These are for inter_mode_rd_model_estimation, which is another two pass
+   * approach. In this speed feature, we collect data in the first couple frames
+   * to build an rd model to estimate the rdcost of a prediction model based on
+   * the residue error. Once enough data is collected, this speed feature uses
+   * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass find the best transform for the mode.
+   */
+  // TODO(any): try to consolidate this speed feature with winner mode
+  // processing.
+  struct inter_modes_info *inter_modes_info;
+
+  //! How to blend the compound predictions.
+  uint8_t compound_idx;
+
+  //! A caches of results of compound type search so they can be reused later.
+  COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
+  //! The idx for the latest compound mode in the cache \ref comp_rd_stats.
+  int comp_rd_stats_idx;
+
+  /*! \brief Whether to recompute the luma prediction.
+   *
+   * In interpolation search, we can usually skip recalculating the luma
+   * prediction because it is already calculated by a previous predictor. This
+   * flag signifies that some modes might have been skipped, so we need to
+   * rebuild the prediction.
+   */
+  int recalc_luma_mc_data;
+
+  /*! \brief Data structure to speed up intrabc search.
+   *
+   * Contains the hash table, hash function, and buffer used for intrabc.
+   */
+  IntraBCHashInfo intrabc_hash_info;
+
+  /*! \brief Whether to reuse the mode stored in intermode_cache. */
+  int use_intermode_cache;
+  /*! \brief The mode to reuse during \ref av1_rd_pick_inter_mode. */
+  const MB_MODE_INFO *intermode_cache;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name MV Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Context used to determine the initial step size in motion search.
+   *
+   * This context is defined as the \f$l_\inf\f$ norm of the best ref_mvs for
+   * each frame.
+   */
+  unsigned int max_mv_context[REF_FRAMES];
+
+  /*! \brief Limit for the range of motion vectors.
+   *
+   * These define limits to motion vector components to prevent them from
+   * extending outside the UMV borders
+   */
+  FullMvLimits mv_limits;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Txfm Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Parameters that control how motion search is done.
+   *
+   * Stores various txfm search related parameters such as txfm_type, txfm_size,
+   * trellis eob search, etc.
+   */
+  TxfmSearchParams txfm_search_params;
+
+  /*! \brief Results of the txfm searches that have been done.
+   *
+   * Caches old txfm search results and keeps the current txfm decisions to
+   * facilitate rdopt.
+   */
+  TxfmSearchInfo txfm_search_info;
+
+  /*! \brief Whether there is a strong color activity.
+   *
+   * Used in REALTIME coding mode to enhance the visual quality at the boundary
+   * of moving color objects.
+   */
+  uint8_t color_sensitivity[2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Misc
+   ****************************************************************************/
+  /**@{*/
+  //! Variance of the source frame.
+  unsigned int source_variance;
+  //! SSE of the current predictor.
+  unsigned int pred_sse[REF_FRAMES];
+  //! Prediction for ML based partition.
+  DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]);
+  /**@}*/
+} MACROBLOCK;
+#undef SINGLE_REF_MODES
+
+/*!\cond */
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
   static const char LUT[BLOCK_SIZES_ALL] = {
     0,  // BLOCK_4X4
@@ -523,7 +1211,7 @@
 
 static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
                                      const MB_MODE_INFO *mbmi) {
-  return is_rect_tx_allowed_bsize(mbmi->sb_type) &&
+  return is_rect_tx_allowed_bsize(mbmi->bsize) &&
          !xd->lossless[mbmi->segment_id];
 }
 
@@ -538,36 +1226,38 @@
   return depth;
 }
 
-static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx,
+static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx,
                                 int skip) {
   if (skip)
-    x->blk_skip[blk_idx] |= 1UL << plane;
+    txb_skip[blk_idx] |= 1UL << plane;
   else
-    x->blk_skip[blk_idx] &= ~(1UL << plane);
+    txb_skip[blk_idx] &= ~(1UL << plane);
 #ifndef NDEBUG
   // Set chroma planes to uninitialized states when luma is set to check if
   // it will be set later
   if (plane == 0) {
-    x->blk_skip[blk_idx] |= 1UL << (1 + 4);
-    x->blk_skip[blk_idx] |= 1UL << (2 + 4);
+    txb_skip[blk_idx] |= 1UL << (1 + 4);
+    txb_skip[blk_idx] |= 1UL << (2 + 4);
   }
 
   // Clear the initialization checking bit
-  x->blk_skip[blk_idx] &= ~(1UL << (plane + 4));
+  txb_skip[blk_idx] &= ~(1UL << (plane + 4));
 #endif
 }
 
-static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) {
+static INLINE int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) {
 #ifndef NDEBUG
   // Check if this is initialized
-  assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4))));
+  assert(!(txb_skip[blk_idx] & (1UL << (plane + 4))));
 
   // The magic number is 0x77, this is to test if there is garbage data
-  assert((x->blk_skip[blk_idx] & 0x88) == 0);
+  assert((txb_skip[blk_idx] & 0x88) == 0);
 #endif
-  return (x->blk_skip[blk_idx] >> plane) & 1;
+  return (txb_skip[blk_idx] >> plane) & 1;
 }
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/cnn.c b/av1/encoder/cnn.c
index 5d8a236..6b2a55a 100644
--- a/av1/encoder/cnn.c
+++ b/av1/encoder/cnn.c
@@ -785,7 +785,6 @@
                   const int jj =
                       CLAMPINDEX(w / layer_config->skip_width, in_width);
                   assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
-                  continue;
                   sum += layer_config->weights[off] *
                          input[k][ii * in_stride + jj];
                 }
@@ -833,8 +832,8 @@
                        int in_stride, const CNN_CONFIG *cnn_config,
                        const CNN_THREAD_DATA *thread_data,
                        CNN_MULTI_OUT *output_struct) {
-  TENSOR tensor1[CNN_MAX_BRANCHES] = { 0 };
-  TENSOR tensor2[CNN_MAX_BRANCHES] = { 0 };
+  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
+  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };
 
   float **output[CNN_MAX_BRANCHES];
   const int *out_chs = output_struct->output_channels;
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 42095b7..8e8cca8 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -11,6 +11,7 @@
 
 #include "av1/common/pred_common.h"
 #include "av1/encoder/compound_type.h"
+#include "av1/encoder/encoder_alloc.h"
 #include "av1/encoder/model_rd.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/rdopt_utils.h"
@@ -100,20 +101,20 @@
                                        const AV1_COMP *const cpi) {
   // Enable wedge search if source variance and edge strength are above
   // the thresholds.
-  return x->source_variance >
-             cpi->sf.inter_sf.disable_wedge_search_var_thresh &&
-         x->edge_strength > cpi->sf.inter_sf.disable_wedge_search_edge_thresh;
+  return x->source_variance > cpi->sf.inter_sf.disable_wedge_search_var_thresh;
 }
 
 static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
                                                   const AV1_COMP *const cpi) {
-  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge &&
+  return enable_wedge_search(x, cpi) &&
+         cpi->oxcf.comp_type_cfg.enable_interinter_wedge &&
          !cpi->sf.inter_sf.disable_interinter_wedge;
 }
 
 static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
                                                   const AV1_COMP *const cpi) {
-  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge &&
+  return enable_wedge_search(x, cpi) &&
+         cpi->oxcf.comp_type_cfg.enable_interintra_wedge &&
          !cpi->sf.inter_sf.disable_wedge_interintra_search;
 }
 
@@ -241,7 +242,7 @@
     // sse, rate, dist, rate2, dist2); dist = dist2;
     // rate = rate2;
 
-    rate += x->wedge_idx_cost[bsize][wedge_index];
+    rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
     rd = RDCOST(x->rdmult, rate, dist);
 
     if (rd < best_rd) {
@@ -253,7 +254,8 @@
   }
 
   return best_rd -
-         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+         RDCOST(x->rdmult,
+                x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
 }
 
 // Choose the best wedge index the specified sign
@@ -284,7 +286,7 @@
 
     model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
                                                   &rate, &dist);
-    rate += x->wedge_idx_cost[bsize][wedge_index];
+    rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
     rd = RDCOST(x->rdmult, rate, dist);
 
     if (rd < best_rd) {
@@ -294,7 +296,8 @@
     }
   }
   return best_rd -
-         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+         RDCOST(x->rdmult,
+                x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
 }
 
 static int64_t pick_interinter_wedge(
@@ -488,12 +491,12 @@
                            max_txsize_rect_lookup[bs], FTXS_NONE, skip_trellis);
   x->rd_model = FULL_TXFM_RD;
   if (rd != INT64_MAX) {
-    const int skip_ctx = av1_get_skip_context(xd);
-    if (rd_stats->skip) {
-      const int s1 = x->skip_cost[skip_ctx][1];
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
+    if (rd_stats->skip_txfm) {
+      const int s1 = x->mode_costs.skip_txfm_cost[skip_ctx][1];
       rd_stats->rate = s1;
     } else {
-      const int s0 = x->skip_cost[skip_ctx][0];
+      const int s0 = x->mode_costs.skip_txfm_cost[skip_ctx][0];
       rd_stats->rate += s0;
     }
   }
@@ -529,7 +532,7 @@
     int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
     const int rate_overhead =
         interintra_mode_cost[mode] +
-        x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index];
+        x->mode_costs.wedge_idx_cost[bsize][mbmi->interintra_wedge_index];
     const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0);
     if (total_rd < best_total_rd) {
       best_total_rd = total_rd;
@@ -541,28 +544,217 @@
   return best_interintra_rd_wedge;
 }
 
+static int handle_smooth_inter_intra_mode(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    MB_MODE_INFO *mbmi, int64_t ref_best_rd, int *rate_mv,
+    INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd,
+    int *best_mode_rate, const BUFFER_SET *orig_dst, uint8_t *tmp_buf,
+    uint8_t *intrapred, HandleInterModeArgs *args) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *const interintra_mode_cost =
+      mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+
+  mbmi->use_wedge_interintra = 0;
+
+  if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 ||
+      *best_interintra_mode == INTERINTRA_MODES) {
+    int64_t best_interintra_rd = INT64_MAX;
+    for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+         ++cur_mode) {
+      if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
+           cpi->sf.intra_sf.disable_smooth_intra) &&
+          cur_mode == II_SMOOTH_PRED)
+        continue;
+      compute_best_interintra_mode(
+          cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf,
+          best_interintra_mode, &best_interintra_rd, cur_mode, bsize);
+    }
+    args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+  }
+  assert(IMPLIES(!cpi->oxcf.comp_type_cfg.enable_smooth_interintra ||
+                     cpi->sf.inter_sf.disable_smooth_interintra,
+                 *best_interintra_mode != II_SMOOTH_PRED));
+  // Recompute prediction if required
+  bool interintra_mode_reuse = cpi->sf.inter_sf.reuse_inter_intra_mode ||
+                               *best_interintra_mode != INTERINTRA_MODES;
+  if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) {
+    mbmi->interintra_mode = *best_interintra_mode;
+    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                              intrapred, bw);
+    av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+  }
+
+  // Compute rd cost for best smooth_interintra
+  RD_STATS rd_stats;
+  const int is_wedge_used = av1_is_wedge_used(bsize);
+  const int rmode =
+      interintra_mode_cost[*best_interintra_mode] +
+      (is_wedge_used ? mode_costs->wedge_interintra_cost[bsize][0] : 0);
+  const int total_mode_rate = rmode + *rate_mv;
+  const int64_t rd_thresh = compute_rd_thresh(x, total_mode_rate, ref_best_rd);
+  int64_t rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats);
+  if (rd != INT64_MAX) {
+    rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist);
+  } else {
+    return IGNORE_MODE;
+  }
+  *best_rd = rd;
+  *best_mode_rate = rmode;
+  // Return early if best rd not good enough
+  if (ref_best_rd < INT64_MAX &&
+      (*best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE >
+          ref_best_rd) {
+    return IGNORE_MODE;
+  }
+  return 0;
+}
+
+static int handle_wedge_inter_intra_mode(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    MB_MODE_INFO *mbmi, int *rate_mv, INTERINTRA_MODE *best_interintra_mode,
+    int64_t *best_rd, const BUFFER_SET *orig_dst, uint8_t *tmp_buf_,
+    uint8_t *tmp_buf, uint8_t *intrapred_, uint8_t *intrapred,
+    HandleInterModeArgs *args, int *tmp_rate_mv, int *rate_overhead,
+    int_mv *tmp_mv, int64_t best_rd_no_wedge) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *const interintra_mode_cost =
+      mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+  const int try_smooth_interintra =
+      cpi->oxcf.comp_type_cfg.enable_smooth_interintra &&
+      !cpi->sf.inter_sf.disable_smooth_interintra;
+
+  mbmi->use_wedge_interintra = 1;
+
+  if (!cpi->sf.inter_sf.fast_interintra_wedge_search) {
+    // Exhaustive search of all wedge and mode combinations.
+    int best_mode = 0;
+    int best_wedge_index = 0;
+    *best_rd = compute_best_wedge_interintra(
+        cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, tmp_buf_,
+        &best_mode, &best_wedge_index, bsize);
+    mbmi->interintra_mode = best_mode;
+    mbmi->interintra_wedge_index = best_wedge_index;
+    if (best_mode != INTERINTRA_MODES - 1) {
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+    }
+  } else if (!try_smooth_interintra) {
+    if (*best_interintra_mode == INTERINTRA_MODES) {
+      mbmi->interintra_mode = INTERINTRA_MODES - 1;
+      *best_interintra_mode = INTERINTRA_MODES - 1;
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+      // Pick wedge mask based on INTERINTRA_MODES - 1
+      *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+      // Find the best interintra mode for the chosen wedge mask
+      for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+           ++cur_mode) {
+        compute_best_interintra_mode(
+            cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred,
+            tmp_buf, best_interintra_mode, best_rd, cur_mode, bsize);
+      }
+      args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+      mbmi->interintra_mode = *best_interintra_mode;
+
+      // Recompute prediction if required
+      if (*best_interintra_mode != INTERINTRA_MODES - 1) {
+        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                  intrapred, bw);
+      }
+    } else {
+      // Pick wedge mask for the best interintra mode (reused)
+      mbmi->interintra_mode = *best_interintra_mode;
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+      *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+    }
+  } else {
+    // Pick wedge mask for the best interintra mode from smooth_interintra
+    *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+  }
+
+  *rate_overhead =
+      interintra_mode_cost[mbmi->interintra_mode] +
+      mode_costs->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] +
+      mode_costs->wedge_interintra_cost[bsize][1];
+  *best_rd += RDCOST(x->rdmult, *rate_overhead + *rate_mv, 0);
+
+  int64_t rd = INT64_MAX;
+  const int_mv mv0 = mbmi->mv[0];
+  // Refine motion vector for NEWMV case.
+  if (have_newmv_in_inter_mode(mbmi->mode)) {
+    int rate_sum, skip_txfm_sb;
+    int64_t dist_sum, skip_sse_sb;
+    // get negative of mask
+    const uint8_t *mask =
+        av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize);
+    av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv->as_mv, intrapred,
+                                      mask, bw, tmp_rate_mv, 0);
+    if (mbmi->mv[0].as_int != tmp_mv->as_int) {
+      mbmi->mv[0].as_int = tmp_mv->as_int;
+      // Set ref_frame[1] to NONE_FRAME temporarily so that the intra
+      // predictor is not calculated again in av1_enc_build_inter_predictor().
+      mbmi->ref_frame[1] = NONE_FRAME;
+      const int mi_row = xd->mi_row;
+      const int mi_col = xd->mi_col;
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      mbmi->ref_frame[1] = INTRA_FRAME;
+      av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf,
+                             xd->plane[AOM_PLANE_Y].dst.stride, intrapred, bw);
+      model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+          cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb,
+          &skip_sse_sb, NULL, NULL, NULL);
+      rd =
+          RDCOST(x->rdmult, *tmp_rate_mv + *rate_overhead + rate_sum, dist_sum);
+    }
+  }
+  if (rd >= *best_rd) {
+    tmp_mv->as_int = mv0.as_int;
+    *tmp_rate_mv = *rate_mv;
+    av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+  }
+  // Evaluate closer to true rd
+  RD_STATS rd_stats;
+  const int64_t mode_rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv, 0);
+  const int64_t tmp_rd_thresh = best_rd_no_wedge - mode_rd;
+  rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+  if (rd != INT64_MAX) {
+    rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv + rd_stats.rate,
+                rd_stats.dist);
+  } else {
+    if (*best_rd == INT64_MAX) return IGNORE_MODE;
+  }
+  *best_rd = rd;
+  return 0;
+}
+
 int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                 BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
                                 HandleInterModeArgs *args, int64_t ref_best_rd,
                                 int *rate_mv, int *tmp_rate2,
                                 const BUFFER_SET *orig_dst) {
-  const int try_smooth_interintra = cpi->oxcf.enable_smooth_interintra &&
-                                    !cpi->sf.inter_sf.disable_smooth_interintra;
+  const int try_smooth_interintra =
+      cpi->oxcf.comp_type_cfg.enable_smooth_interintra &&
+      !cpi->sf.inter_sf.disable_smooth_interintra;
+
   const int is_wedge_used = av1_is_wedge_used(bsize);
   const int try_wedge_interintra =
       is_wedge_used && enable_wedge_interintra_search(x, cpi);
-  if (!try_smooth_interintra && !try_wedge_interintra) return -1;
 
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  int64_t rd = INT64_MAX;
   const int bw = block_size_wide[bsize];
   DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
   DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
   uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
   uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
-  const int *const interintra_mode_cost =
-      x->interintra_mode_cost[size_group_lookup[bsize]];
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
 
@@ -584,189 +776,50 @@
   int64_t best_interintra_rd_nowedge = INT64_MAX;
   int best_mode_rate = INT_MAX;
   if (try_smooth_interintra) {
-    mbmi->use_wedge_interintra = 0;
-    int interintra_mode_reuse = 1;
-    if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 ||
-        best_interintra_mode == INTERINTRA_MODES) {
-      interintra_mode_reuse = 0;
-      int64_t best_interintra_rd = INT64_MAX;
-      for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
-           ++cur_mode) {
-        if ((!cpi->oxcf.enable_smooth_intra ||
-             cpi->sf.intra_sf.disable_smooth_intra) &&
-            cur_mode == II_SMOOTH_PRED)
-          continue;
-        compute_best_interintra_mode(cpi, mbmi, xd, x, interintra_mode_cost,
-                                     orig_dst, intrapred, tmp_buf,
-                                     &best_interintra_mode, &best_interintra_rd,
-                                     cur_mode, bsize);
-      }
-      args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
-    }
-    assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra ||
-                       cpi->sf.inter_sf.disable_smooth_interintra,
-                   best_interintra_mode != II_SMOOTH_PRED));
-    // Recompute prediction if required
-    if (interintra_mode_reuse || best_interintra_mode != INTERINTRA_MODES - 1) {
-      mbmi->interintra_mode = best_interintra_mode;
-      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                intrapred, bw);
-      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-    }
-
-    // Compute rd cost for best smooth_interintra
-    RD_STATS rd_stats;
-    const int rmode = interintra_mode_cost[best_interintra_mode] +
-                      (is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0);
-    const int total_mode_rate = rmode + *rate_mv;
-    const int64_t rd_thresh =
-        compute_rd_thresh(x, total_mode_rate, ref_best_rd);
-    rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats);
-    if (rd != INT64_MAX) {
-      rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist);
-    } else {
-      return -1;
-    }
-    best_interintra_rd_nowedge = rd;
-    best_mode_rate = rmode;
-    // Return early if best_interintra_rd_nowedge not good enough
-    if (ref_best_rd < INT64_MAX &&
-        (best_interintra_rd_nowedge >> INTER_INTRA_RD_THRESH_SHIFT) *
-                INTER_INTRA_RD_THRESH_SCALE >
-            ref_best_rd) {
-      return -1;
+    int ret = handle_smooth_inter_intra_mode(
+        cpi, x, bsize, mbmi, ref_best_rd, rate_mv, &best_interintra_mode,
+        &best_interintra_rd_nowedge, &best_mode_rate, orig_dst, tmp_buf,
+        intrapred, args);
+    if (ret == IGNORE_MODE) {
+      return IGNORE_MODE;
     }
   }
 
   // Compute wedge interintra
   int64_t best_interintra_rd_wedge = INT64_MAX;
+  const int_mv mv0 = mbmi->mv[0];
+  int_mv tmp_mv = mv0;
+  int tmp_rate_mv = 0;
+  int rate_overhead = 0;
   if (try_wedge_interintra) {
-    mbmi->use_wedge_interintra = 1;
-    if (!cpi->sf.inter_sf.fast_interintra_wedge_search) {
-      // Exhaustive search of all wedge and mode combinations.
-      int best_mode = 0;
-      int best_wedge_index = 0;
-      best_interintra_rd_wedge = compute_best_wedge_interintra(
-          cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_,
-          tmp_buf_, &best_mode, &best_wedge_index, bsize);
-      mbmi->interintra_mode = best_mode;
-      mbmi->interintra_wedge_index = best_wedge_index;
-      if (best_mode != INTERINTRA_MODES - 1) {
-        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                  intrapred, bw);
-      }
-    } else if (!try_smooth_interintra) {
-      if (best_interintra_mode == INTERINTRA_MODES) {
-        mbmi->interintra_mode = INTERINTRA_MODES - 1;
-        best_interintra_mode = INTERINTRA_MODES - 1;
-        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                  intrapred, bw);
-        // Pick wedge mask based on INTERINTRA_MODES - 1
-        best_interintra_rd_wedge =
-            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-        // Find the best interintra mode for the chosen wedge mask
-        for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
-             ++cur_mode) {
-          compute_best_interintra_mode(
-              cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred,
-              tmp_buf, &best_interintra_mode, &best_interintra_rd_wedge,
-              cur_mode, bsize);
-        }
-        args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
-        mbmi->interintra_mode = best_interintra_mode;
-
-        // Recompute prediction if required
-        if (best_interintra_mode != INTERINTRA_MODES - 1) {
-          av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                    intrapred, bw);
-        }
-      } else {
-        // Pick wedge mask for the best interintra mode (reused)
-        mbmi->interintra_mode = best_interintra_mode;
-        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                  intrapred, bw);
-        best_interintra_rd_wedge =
-            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-      }
-    } else {
-      // Pick wedge mask for the best interintra mode from smooth_interintra
-      best_interintra_rd_wedge =
-          pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-    }
-
-    const int rate_overhead =
-        interintra_mode_cost[mbmi->interintra_mode] +
-        x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] +
-        x->wedge_interintra_cost[bsize][1];
-    best_interintra_rd_wedge += RDCOST(x->rdmult, rate_overhead + *rate_mv, 0);
-
-    const int_mv mv0 = mbmi->mv[0];
-    int_mv tmp_mv = mv0;
-    rd = INT64_MAX;
-    int tmp_rate_mv = 0;
-    // Refine motion vector for NEWMV case.
-    if (have_newmv_in_inter_mode(mbmi->mode)) {
-      int rate_sum, skip_txfm_sb;
-      int64_t dist_sum, skip_sse_sb;
-      // get negative of mask
-      const uint8_t *mask =
-          av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize);
-      av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, intrapred,
-                                        mask, bw, &tmp_rate_mv, 0);
-      if (mbmi->mv[0].as_int != tmp_mv.as_int) {
-        mbmi->mv[0].as_int = tmp_mv.as_int;
-        // Set ref_frame[1] to NONE_FRAME temporarily so that the intra
-        // predictor is not calculated again in av1_enc_build_inter_predictor().
-        mbmi->ref_frame[1] = NONE_FRAME;
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                      AOM_PLANE_Y, AOM_PLANE_Y);
-        mbmi->ref_frame[1] = INTRA_FRAME;
-        av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf,
-                               xd->plane[AOM_PLANE_Y].dst.stride, intrapred,
-                               bw);
-        model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
-            cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb,
-            &skip_sse_sb, NULL, NULL, NULL);
-        rd =
-            RDCOST(x->rdmult, tmp_rate_mv + rate_overhead + rate_sum, dist_sum);
-      }
-    }
-    if (rd >= best_interintra_rd_wedge) {
-      tmp_mv.as_int = mv0.as_int;
-      tmp_rate_mv = *rate_mv;
-      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-    }
-    // Evaluate closer to true rd
-    RD_STATS rd_stats;
-    const int64_t mode_rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv, 0);
-    const int64_t tmp_rd_thresh = best_interintra_rd_nowedge - mode_rd;
-    rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
-    if (rd != INT64_MAX) {
-      rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv + rd_stats.rate,
-                  rd_stats.dist);
-    } else {
-      if (best_interintra_rd_nowedge == INT64_MAX) return -1;
-    }
-    best_interintra_rd_wedge = rd;
-    if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
-      mbmi->mv[0].as_int = tmp_mv.as_int;
-      *tmp_rate2 += tmp_rate_mv - *rate_mv;
-      *rate_mv = tmp_rate_mv;
-      best_mode_rate = rate_overhead;
-    } else {
-      mbmi->use_wedge_interintra = 0;
-      mbmi->interintra_mode = best_interintra_mode;
-      mbmi->mv[0].as_int = mv0.as_int;
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                    AOM_PLANE_Y, AOM_PLANE_Y);
+    int ret = handle_wedge_inter_intra_mode(
+        cpi, x, bsize, mbmi, rate_mv, &best_interintra_mode,
+        &best_interintra_rd_wedge, orig_dst, tmp_buf_, tmp_buf, intrapred_,
+        intrapred, args, &tmp_rate_mv, &rate_overhead, &tmp_mv,
+        best_interintra_rd_nowedge);
+    if (ret == IGNORE_MODE) {
+      return IGNORE_MODE;
     }
   }
 
   if (best_interintra_rd_nowedge == INT64_MAX &&
       best_interintra_rd_wedge == INT64_MAX) {
-    return -1;
+    return IGNORE_MODE;
   }
-
+  if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+    mbmi->mv[0].as_int = tmp_mv.as_int;
+    *tmp_rate2 += tmp_rate_mv - *rate_mv;
+    *rate_mv = tmp_rate_mv;
+    best_mode_rate = rate_overhead;
+  } else if (try_smooth_interintra && try_wedge_interintra) {
+    // If smooth was best, but we over-wrote the values when evaluating the
+    // wedge mode, we need to recompute the smooth values.
+    mbmi->use_wedge_interintra = 0;
+    mbmi->interintra_mode = best_interintra_mode;
+    mbmi->mv[0].as_int = mv0.as_int;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                  AOM_PLANE_Y, AOM_PLANE_Y);
+  }
   *tmp_rate2 += best_mode_rate;
 
   if (num_planes > 1) {
@@ -791,10 +844,12 @@
 }
 
 // Computes the valid compound_types to be evaluated
-static INLINE int compute_valid_comp_types(
-    MACROBLOCK *x, const AV1_COMP *const cpi, int *try_average_and_distwtd_comp,
-    BLOCK_SIZE bsize, int masked_compound_used, int mode_search_mask,
-    COMPOUND_TYPE *valid_comp_types) {
+static INLINE int compute_valid_comp_types(MACROBLOCK *x,
+                                           const AV1_COMP *const cpi,
+                                           BLOCK_SIZE bsize,
+                                           int masked_compound_used,
+                                           int mode_search_mask,
+                                           COMPOUND_TYPE *valid_comp_types) {
   const AV1_COMMON *cm = &cpi->common;
   int valid_type_count = 0;
   int comp_type, valid_check;
@@ -805,15 +860,13 @@
       ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
        cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 &&
        cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
-  *try_average_and_distwtd_comp = try_average_comp && try_distwtd_comp;
 
   // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases
   for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
        comp_type++) {
     valid_check =
         (comp_type == COMPOUND_AVERAGE) ? try_average_comp : try_distwtd_comp;
-    if (!*try_average_and_distwtd_comp && valid_check &&
-        is_interinter_compound_used(comp_type, bsize))
+    if (valid_check && is_interinter_compound_used(comp_type, bsize))
       valid_comp_types[valid_type_count++] = comp_type;
   }
   // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases
@@ -821,7 +874,7 @@
     // enable_masked_type[0] corresponds to COMPOUND_WEDGE
     // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD
     enable_masked_type[0] = enable_wedge_interinter_search(x, cpi);
-    enable_masked_type[1] = cpi->oxcf.enable_diff_wtd_comp;
+    enable_masked_type[1] = cpi->oxcf.comp_type_cfg.enable_diff_wtd_comp;
     for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD;
          comp_type++) {
       if ((mode_search_mask & (1 << comp_type)) &&
@@ -834,11 +887,9 @@
 }
 
 // Calculates the cost for compound type mask
-static INLINE void calc_masked_type_cost(MACROBLOCK *x, BLOCK_SIZE bsize,
-                                         int comp_group_idx_ctx,
-                                         int comp_index_ctx,
-                                         int masked_compound_used,
-                                         int *masked_type_cost) {
+static INLINE void calc_masked_type_cost(
+    const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx,
+    int comp_index_ctx, int masked_compound_used, int *masked_type_cost) {
   av1_zero_array(masked_type_cost, COMPOUND_TYPES);
   // Account for group index cost when wedge and/or diffwtd prediction are
   // enabled
@@ -846,18 +897,21 @@
     // Compound group index of average and distwtd is 0
     // Compound group index of wedge and diffwtd is 1
     masked_type_cost[COMPOUND_AVERAGE] +=
-        x->comp_group_idx_cost[comp_group_idx_ctx][0];
+        mode_costs->comp_group_idx_cost[comp_group_idx_ctx][0];
     masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE];
     masked_type_cost[COMPOUND_WEDGE] +=
-        x->comp_group_idx_cost[comp_group_idx_ctx][1];
+        mode_costs->comp_group_idx_cost[comp_group_idx_ctx][1];
     masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE];
   }
 
   // Compute the cost to signal compound index/type
-  masked_type_cost[COMPOUND_AVERAGE] += x->comp_idx_cost[comp_index_ctx][1];
-  masked_type_cost[COMPOUND_DISTWTD] += x->comp_idx_cost[comp_index_ctx][0];
-  masked_type_cost[COMPOUND_WEDGE] += x->compound_type_cost[bsize][0];
-  masked_type_cost[COMPOUND_DIFFWTD] += x->compound_type_cost[bsize][1];
+  masked_type_cost[COMPOUND_AVERAGE] +=
+      mode_costs->comp_idx_cost[comp_index_ctx][1];
+  masked_type_cost[COMPOUND_DISTWTD] +=
+      mode_costs->comp_idx_cost[comp_index_ctx][0];
+  masked_type_cost[COMPOUND_WEDGE] += mode_costs->compound_type_cost[bsize][0];
+  masked_type_cost[COMPOUND_DIFFWTD] +=
+      mode_costs->compound_type_cost[bsize][1];
 }
 
 // Updates mbmi structure with the relevant compound type info
@@ -904,39 +958,11 @@
 
 // Updates best_mv for masked compound types
 static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi,
-                                       int_mv *best_mv, int_mv *cur_mv,
-                                       const COMPOUND_TYPE cur_type,
-                                       int *best_tmp_rate_mv, int tmp_rate_mv,
-                                       const SPEED_FEATURES *const sf) {
-  if (cur_type == COMPOUND_WEDGE ||
-      (sf->inter_sf.enable_interinter_diffwtd_newmv_search &&
-       cur_type == COMPOUND_DIFFWTD)) {
-    *best_tmp_rate_mv = tmp_rate_mv;
-    best_mv[0].as_int = mbmi->mv[0].as_int;
-    best_mv[1].as_int = mbmi->mv[1].as_int;
-  } else {
-    best_mv[0].as_int = cur_mv[0].as_int;
-    best_mv[1].as_int = cur_mv[1].as_int;
-  }
-}
-
-// Choose the better of the two COMPOUND_AVERAGE,
-// COMPOUND_DISTWTD based on modeled cost
-static int find_best_avg_distwtd_comp_type(MACROBLOCK *x, int *comp_model_rate,
-                                           int64_t *comp_model_dist,
-                                           int rate_mv, int64_t *best_rd) {
-  int64_t est_rd[2];
-  est_rd[COMPOUND_AVERAGE] =
-      RDCOST(x->rdmult, comp_model_rate[COMPOUND_AVERAGE] + rate_mv,
-             comp_model_dist[COMPOUND_AVERAGE]);
-  est_rd[COMPOUND_DISTWTD] =
-      RDCOST(x->rdmult, comp_model_rate[COMPOUND_DISTWTD] + rate_mv,
-             comp_model_dist[COMPOUND_DISTWTD]);
-  int best_type = (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD])
-                      ? COMPOUND_AVERAGE
-                      : COMPOUND_DISTWTD;
-  *best_rd = est_rd[best_type];
-  return best_type;
+                                       int_mv *best_mv, int *best_tmp_rate_mv,
+                                       int tmp_rate_mv) {
+  *best_tmp_rate_mv = tmp_rate_mv;
+  best_mv[0].as_int = mbmi->mv[0].as_int;
+  best_mv[1].as_int = mbmi->mv[1].as_int;
 }
 
 static INLINE void save_comp_rd_search_stat(
@@ -969,14 +995,15 @@
 }
 
 static INLINE int get_interinter_compound_mask_rate(
-    const MACROBLOCK *const x, const MB_MODE_INFO *const mbmi) {
+    const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) {
   const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
   // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
   if (compound_type == COMPOUND_WEDGE) {
-    return av1_is_wedge_used(mbmi->sb_type)
+    return av1_is_wedge_used(mbmi->bsize)
                ? av1_cost_literal(1) +
-                     x->wedge_idx_cost[mbmi->sb_type]
-                                      [mbmi->interinter_comp.wedge_index]
+                     mode_costs
+                         ->wedge_idx_cost[mbmi->bsize]
+                                         [mbmi->interinter_comp.wedge_index]
                : 0;
   } else {
     assert(compound_type == COMPOUND_DIFFWTD);
@@ -1049,7 +1076,7 @@
   uint64_t cur_sse = UINT64_MAX;
   best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
       cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
-  *rs2 += get_interinter_compound_mask_rate(x, mbmi);
+  *rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
   best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
   assert(cur_sse != UINT64_MAX);
   int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
@@ -1128,7 +1155,7 @@
         av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, tmp_preds0,
                                                  strides, tmp_preds1, strides);
       }
-      av1_release_compound_type_rd_buffers(&tmp_buf);
+      release_compound_type_rd_buffers(&tmp_buf);
     } else {
       *out_rate_mv = rate_mv;
       av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
@@ -1197,7 +1224,8 @@
 static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
 
 int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                         BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask,
+                         HandleInterModeArgs *args, BLOCK_SIZE bsize,
+                         int_mv *cur_mv, int mode_search_mask,
                          int masked_compound_used, const BUFFER_SET *orig_dst,
                          const BUFFER_SET *tmp_dst,
                          const CompoundTypeRdBuffers *buffers, int *rate_mv,
@@ -1208,6 +1236,7 @@
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   const PREDICTION_MODE this_mode = mbmi->mode;
+  int ref_frame = av1_ref_frame_type(mbmi->ref_frame);
   const int bw = block_size_wide[bsize];
   int rs2;
   int_mv best_mv[2];
@@ -1222,8 +1251,6 @@
   uint8_t *preds1[1] = { buffers->pred1 };
   int strides[1] = { bw };
   int tmp_rate_mv;
-  const int num_pix = 1 << num_pels_log2_lookup[bsize];
-  const int mask_len = 2 * num_pix * sizeof(uint8_t);
   COMPOUND_TYPE cur_type;
   // Local array to store the mask cost for different compound types
   int masked_type_cost[COMPOUND_TYPES];
@@ -1244,8 +1271,6 @@
   best_mv[0].as_int = cur_mv[0].as_int;
   best_mv[1].as_int = cur_mv[1].as_int;
   *rd = INT64_MAX;
-  int rate_sum, tmp_skip_txfm_sb;
-  int64_t dist_sum, tmp_skip_sse_sb;
 
   // Local array to store the valid compound types to be evaluated in the core
   // loop
@@ -1253,24 +1278,22 @@
     COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD
   };
   int valid_type_count = 0;
-  int try_average_and_distwtd_comp = 0;
   // compute_valid_comp_types() returns the number of valid compound types to be
   // evaluated and populates the same in the local array valid_comp_types[].
   // It also sets the flag 'try_average_and_distwtd_comp'
   valid_type_count = compute_valid_comp_types(
-      x, cpi, &try_average_and_distwtd_comp, bsize, masked_compound_used,
-      mode_search_mask, valid_comp_types);
+      x, cpi, bsize, masked_compound_used, mode_search_mask, valid_comp_types);
 
   // The following context indices are independent of compound type
   const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
   const int comp_index_ctx = get_comp_index_context(cm, xd);
 
   // Populates masked_type_cost local array for the 4 compound types
-  calc_masked_type_cost(x, bsize, comp_group_idx_ctx, comp_index_ctx,
-                        masked_compound_used, masked_type_cost);
+  calc_masked_type_cost(&x->mode_costs, bsize, comp_group_idx_ctx,
+                        comp_index_ctx, masked_compound_used, masked_type_cost);
 
   int64_t comp_model_rd_cur = INT64_MAX;
-  int64_t best_rd_cur = INT64_MAX;
+  int64_t best_rd_cur = ref_best_rd;
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
 
@@ -1281,98 +1304,6 @@
                                          comp_rate, comp_dist, comp_rs2,
                                          rate_mv, rd, match_index);
   }
-  // Special handling if both compound_average and compound_distwtd
-  // are to be searched. In this case, first estimate between the two
-  // modes and then call estimate_yrd_for_sb() only for the better of
-  // the two.
-  if (try_average_and_distwtd_comp) {
-    int est_rate[2];
-    int64_t est_dist[2], est_rd;
-    COMPOUND_TYPE best_type;
-    // Since modelled rate and dist are separately stored,
-    // compute better of COMPOUND_AVERAGE and COMPOUND_DISTWTD
-    // using the stored stats.
-    if ((comp_model_rate[COMPOUND_AVERAGE] != INT_MAX) &&
-        comp_model_rate[COMPOUND_DISTWTD] != INT_MAX) {
-      // Choose the better of the COMPOUND_AVERAGE,
-      // COMPOUND_DISTWTD on modeled cost.
-      best_type = find_best_avg_distwtd_comp_type(
-          x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd);
-      update_mbmi_for_compound_type(mbmi, best_type);
-      if (comp_rate[best_type] != INT_MAX)
-        best_rd_cur = RDCOST(
-            x->rdmult,
-            masked_type_cost[best_type] + *rate_mv + comp_rate[best_type],
-            comp_dist[best_type]);
-      comp_model_rd_cur = est_rd;
-      // Update stats for best compound type
-      if (best_rd_cur < *rd) {
-        update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
-                         comp_model_rd_cur, masked_type_cost[best_type]);
-      }
-      restore_dst_buf(xd, *tmp_dst, 1);
-    } else {
-      int64_t sse_y[COMPOUND_DISTWTD + 1];
-      // Calculate model_rd for COMPOUND_AVERAGE and COMPOUND_DISTWTD
-      for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
-           comp_type++) {
-        update_mbmi_for_compound_type(mbmi, comp_type);
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                      AOM_PLANE_Y, AOM_PLANE_Y);
-        model_rd_sb_fn[MODELRD_CURVFIT](
-            cpi, bsize, x, xd, 0, 0, &est_rate[comp_type], &est_dist[comp_type],
-            NULL, NULL, NULL, NULL, NULL);
-        est_rate[comp_type] += masked_type_cost[comp_type];
-        comp_model_rate[comp_type] = est_rate[comp_type];
-        comp_model_dist[comp_type] = est_dist[comp_type];
-        sse_y[comp_type] = x->pred_sse[xd->mi[0]->ref_frame[0]];
-        if (comp_type == COMPOUND_AVERAGE) {
-          *is_luma_interp_done = 1;
-          restore_dst_buf(xd, *tmp_dst, 1);
-        }
-      }
-      // Choose the better of the two based on modeled cost and call
-      // estimate_yrd_for_sb() for that one.
-      best_type = find_best_avg_distwtd_comp_type(
-          x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd);
-      update_mbmi_for_compound_type(mbmi, best_type);
-      if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *orig_dst, 1);
-      rs2 = masked_type_cost[best_type];
-      RD_STATS est_rd_stats;
-      const int64_t mode_rd = RDCOST(x->rdmult, rs2 + *rate_mv, 0);
-      const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
-      int64_t est_rd_ = INT64_MAX;
-      int eval_txfm = 1;
-      // Check if the mode is good enough based on skip rd
-      if (cpi->sf.inter_sf.txfm_rd_gate_level) {
-        int64_t skip_rd =
-            RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y[best_type] << 4));
-        eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
-                                    cpi->sf.inter_sf.txfm_rd_gate_level, 1);
-      }
-      // Evaluate further if skip rd is low enough
-      if (eval_txfm) {
-        est_rd_ =
-            estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &est_rd_stats);
-      }
-
-      if (est_rd_ != INT64_MAX) {
-        best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
-                             est_rd_stats.dist);
-        // Backup rate and distortion for future reuse
-        backup_stats(best_type, comp_rate, comp_dist, comp_model_rate,
-                     comp_model_dist, est_rate[best_type], est_dist[best_type],
-                     &est_rd_stats, comp_rs2, rs2);
-        comp_model_rd_cur = est_rd;
-      }
-      if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
-      // Update stats for best compound type
-      if (best_rd_cur < *rd) {
-        update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
-                         comp_model_rd_cur, rs2);
-      }
-    }
-  }
 
   // If COMPOUND_AVERAGE is not valid, use the spare buffer
   if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
@@ -1380,71 +1311,212 @@
   // Loop over valid compound types
   for (int i = 0; i < valid_type_count; i++) {
     cur_type = valid_comp_types[i];
+
+    if (args->cmp_mode[ref_frame] == COMPOUND_AVERAGE) {
+      if (cur_type == COMPOUND_WEDGE) continue;
+    }
+
     comp_model_rd_cur = INT64_MAX;
     tmp_rate_mv = *rate_mv;
     best_rd_cur = INT64_MAX;
+    ref_best_rd = AOMMIN(ref_best_rd, *rd);
+    update_mbmi_for_compound_type(mbmi, cur_type);
+    rs2 = masked_type_cost[cur_type];
+
+    int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+    if (mode_rd >= ref_best_rd) continue;
 
     // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD
     if (cur_type < COMPOUND_WEDGE) {
-      update_mbmi_for_compound_type(mbmi, cur_type);
-      rs2 = masked_type_cost[cur_type];
-      const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
-      if (mode_rd < ref_best_rd) {
-        // Reuse data if matching record is found
-        if (comp_rate[cur_type] == INT_MAX) {
-          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                        AOM_PLANE_Y, AOM_PLANE_Y);
-          if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
-
-          // Compute RD cost for the current type
-          RD_STATS est_rd_stats;
-          const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
-          int64_t est_rd = INT64_MAX;
-          int eval_txfm = 1;
-          // Check if the mode is good enough based on skip rd
-          if (cpi->sf.inter_sf.txfm_rd_gate_level) {
-            int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
-            int64_t skip_rd = RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y << 4));
-            eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
-                                        cpi->sf.inter_sf.txfm_rd_gate_level, 1);
-          }
-          // Evaluate further if skip rd is low enough
-          if (eval_txfm) {
-            est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh,
-                                         &est_rd_stats);
-          }
-
-          if (est_rd != INT64_MAX) {
-            best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
-                                 est_rd_stats.dist);
-            model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
-                cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
-                &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-            comp_model_rd_cur =
-                RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
-
-            // Backup rate and distortion for future reuse
-            backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate,
-                         comp_model_dist, rate_sum, dist_sum, &est_rd_stats,
-                         comp_rs2, rs2);
-          }
-        } else {
-          // Calculate RD cost based on stored stats
-          assert(comp_dist[cur_type] != INT64_MAX);
-          best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type],
-                               comp_dist[cur_type]);
-          // Recalculate model rdcost with the updated rate
-          comp_model_rd_cur =
-              RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type],
-                     comp_model_dist[cur_type]);
-        }
+      tmp_rate_mv = *rate_mv;
+      if (have_newmv_in_inter_mode(this_mode)) {
+        InterPredParams inter_pred_params;
+        av1_dist_wtd_comp_weight_assign(
+            &cpi->common, mbmi, 0, &inter_pred_params.conv_params.fwd_offset,
+            &inter_pred_params.conv_params.bck_offset,
+            &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1);
+        int mask_value = inter_pred_params.conv_params.fwd_offset * 4;
+        memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask));
+        tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                            bsize, this_mode);
       }
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+
+      RD_STATS est_rd_stats;
+      estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+
+      best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+                           est_rd_stats.dist);
       // use spare buffer for following compound type try
       if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+    } else if (cur_type == COMPOUND_WEDGE) {
+      int best_mask_index = 0;
+      int best_wedge_sign = 0;
+      int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] };
+      int best_rs2 = 0;
+      int best_rate_mv = *rate_mv;
+      const int wedge_mask_size = get_wedge_types_lookup(bsize);
+      int need_mask_search = args->wedge_index == -1;
+
+      if (need_mask_search && !have_newmv_in_inter_mode(this_mode)) {
+        // short cut repeated single reference block build
+        av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0,
+                                                         preds0, strides);
+        av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1,
+                                                         preds1, strides);
+      }
+
+      for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search;
+           ++wedge_mask) {
+        for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) {
+          tmp_rate_mv = *rate_mv;
+          mbmi->interinter_comp.wedge_index = wedge_mask;
+          mbmi->interinter_comp.wedge_sign = wedge_sign;
+          rs2 = masked_type_cost[cur_type];
+          rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+          mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+          if (mode_rd >= ref_best_rd / 2) continue;
+
+          if (have_newmv_in_inter_mode(this_mode)) {
+            tmp_rate_mv = av1_interinter_compound_motion_search(
+                cpi, x, cur_mv, bsize, this_mode);
+            av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst,
+                                          bsize, AOM_PLANE_Y, AOM_PLANE_Y);
+          } else {
+            av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                     strides, preds1, strides);
+          }
+
+          RD_STATS est_rd_stats;
+          int64_t this_rd_cur = estimate_yrd_for_sb(
+              cpi, bsize, x, AOMMIN(best_rd_cur, ref_best_rd), &est_rd_stats);
+          if (this_rd_cur < INT64_MAX) {
+            this_rd_cur =
+                RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+                       est_rd_stats.dist);
+          }
+          if (this_rd_cur < best_rd_cur) {
+            best_mask_index = wedge_mask;
+            best_wedge_sign = wedge_sign;
+            best_rd_cur = this_rd_cur;
+            tmp_mv[0] = mbmi->mv[0];
+            tmp_mv[1] = mbmi->mv[1];
+            best_rate_mv = tmp_rate_mv;
+            best_rs2 = rs2;
+          }
+        }
+      }
+
+      if (need_mask_search) {
+        if (this_mode == NEW_NEWMV) {
+          args->wedge_index = best_mask_index;
+          args->wedge_sign = best_wedge_sign;
+        }
+      } else {
+        mbmi->interinter_comp.wedge_index = args->wedge_index;
+        mbmi->interinter_comp.wedge_sign = args->wedge_sign;
+        rs2 = masked_type_cost[cur_type];
+        rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+        if (have_newmv_in_inter_mode(this_mode)) {
+          tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                              bsize, this_mode);
+        }
+
+        best_mask_index = args->wedge_index;
+        best_wedge_sign = args->wedge_sign;
+        tmp_mv[0] = mbmi->mv[0];
+        tmp_mv[1] = mbmi->mv[1];
+        best_rate_mv = tmp_rate_mv;
+        best_rs2 = masked_type_cost[cur_type];
+        best_rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        RD_STATS est_rd_stats;
+        estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+        best_rd_cur =
+            RDCOST(x->rdmult, best_rs2 + tmp_rate_mv + est_rd_stats.rate,
+                   est_rd_stats.dist);
+      }
+
+      mbmi->interinter_comp.wedge_index = best_mask_index;
+      mbmi->interinter_comp.wedge_sign = best_wedge_sign;
+      mbmi->mv[0] = tmp_mv[0];
+      mbmi->mv[1] = tmp_mv[1];
+      tmp_rate_mv = best_rate_mv;
+      rs2 = best_rs2;
+    } else if (cur_type == COMPOUND_DIFFWTD) {
+      int_mv tmp_mv[2];
+      int best_mask_index = 0;
+      rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+      int need_mask_search = args->diffwtd_index == -1;
+
+      for (int mask_index = 0; mask_index < 2 && need_mask_search;
+           ++mask_index) {
+        tmp_rate_mv = *rate_mv;
+        mbmi->interinter_comp.mask_type = mask_index;
+        if (have_newmv_in_inter_mode(this_mode)) {
+          // hard coded number for diff wtd
+          int mask_value = mask_index == 0 ? 38 : 26;
+          memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask));
+          tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                              bsize, this_mode);
+        }
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        RD_STATS est_rd_stats;
+        int64_t this_rd_cur =
+            estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+        if (this_rd_cur < INT64_MAX) {
+          this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+                               est_rd_stats.dist);
+        }
+
+        if (this_rd_cur < best_rd_cur) {
+          best_rd_cur = this_rd_cur;
+          best_mask_index = mbmi->interinter_comp.mask_type;
+          tmp_mv[0] = mbmi->mv[0];
+          tmp_mv[1] = mbmi->mv[1];
+        }
+      }
+
+      if (need_mask_search) {
+        if (this_mode == NEW_NEWMV) args->diffwtd_index = best_mask_index;
+      } else {
+        mbmi->interinter_comp.mask_type = args->diffwtd_index;
+        rs2 = masked_type_cost[cur_type];
+        rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+        int mask_value = mbmi->interinter_comp.mask_type == 0 ? 38 : 26;
+        memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask));
+
+        if (have_newmv_in_inter_mode(this_mode)) {
+          tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                              bsize, this_mode);
+        }
+        best_mask_index = mbmi->interinter_comp.mask_type;
+        tmp_mv[0] = mbmi->mv[0];
+        tmp_mv[1] = mbmi->mv[1];
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        RD_STATS est_rd_stats;
+        int64_t this_rd_cur =
+            estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+        if (this_rd_cur < INT64_MAX) {
+          best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+                               est_rd_stats.dist);
+        }
+      }
+
+      mbmi->interinter_comp.mask_type = best_mask_index;
+      mbmi->mv[0] = tmp_mv[0];
+      mbmi->mv[1] = tmp_mv[1];
     } else {
       // Handle masked compound types
-      update_mbmi_for_compound_type(mbmi, cur_type);
-      rs2 = masked_type_cost[cur_type];
       // Factors to control gating of compound type selection based on best
       // approximate rd so far
       const int max_comp_type_rd_threshold_mul =
@@ -1469,37 +1541,35 @@
             ref_skip_rd);
       }
     }
+
     // Update stats for best compound type
     if (best_rd_cur < *rd) {
       update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
                        comp_model_rd_cur, rs2);
-      if (masked_compound_used && cur_type >= COMPOUND_WEDGE) {
-        memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
-        if (have_newmv_in_inter_mode(this_mode))
-          update_mask_best_mv(mbmi, best_mv, cur_mv, cur_type,
-                              &best_tmp_rate_mv, tmp_rate_mv, &cpi->sf);
-      }
+      if (have_newmv_in_inter_mode(this_mode))
+        update_mask_best_mv(mbmi, best_mv, &best_tmp_rate_mv, tmp_rate_mv);
     }
     // reset to original mvs for next iteration
     mbmi->mv[0].as_int = cur_mv[0].as_int;
     mbmi->mv[1].as_int = cur_mv[1].as_int;
   }
-  if (mbmi->interinter_comp.type != best_type_stats.best_compound_data.type) {
-    mbmi->comp_group_idx =
-        (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
-    mbmi->compound_idx =
-        !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD);
-    mbmi->interinter_comp = best_type_stats.best_compound_data;
-    memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
-  }
+
+  mbmi->comp_group_idx =
+      (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
+  mbmi->compound_idx =
+      !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD);
+  mbmi->interinter_comp = best_type_stats.best_compound_data;
+
   if (have_newmv_in_inter_mode(this_mode)) {
     mbmi->mv[0].as_int = best_mv[0].as_int;
     mbmi->mv[1].as_int = best_mv[1].as_int;
-    if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
-      rd_stats->rate += best_tmp_rate_mv - *rate_mv;
-      *rate_mv = best_tmp_rate_mv;
-    }
+    rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+    *rate_mv = best_tmp_rate_mv;
   }
+
+  if (this_mode == NEW_NEWMV)
+    args->cmp_mode[ref_frame] = mbmi->interinter_comp.type;
+
   restore_dst_buf(xd, *orig_dst, 1);
   if (!match_found)
     save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate,
diff --git a/av1/encoder/compound_type.h b/av1/encoder/compound_type.h
index f2bd857..a028a35 100644
--- a/av1/encoder/compound_type.h
+++ b/av1/encoder/compound_type.h
@@ -26,6 +26,9 @@
   int best_compmode_interinter_cost;
 } BEST_COMP_TYPE_STATS;
 
+#define IGNORE_MODE -1
+// Searches for the best inter-intra mode. Returns IGNORE_MODE if no good mode
+// is found, 0 otherwise.
 int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                 BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
                                 HandleInterModeArgs *args, int64_t ref_best_rd,
@@ -33,7 +36,8 @@
                                 const BUFFER_SET *orig_dst);
 
 int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                         BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask,
+                         HandleInterModeArgs *args, BLOCK_SIZE bsize,
+                         int_mv *cur_mv, int mode_search_mask,
                          int masked_compound_used, const BUFFER_SET *orig_dst,
                          const BUFFER_SET *tmp_dst,
                          const CompoundTypeRdBuffers *buffers, int *rate_mv,
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 9b5b1cb..6554be6 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -11,234 +11,12 @@
 
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
 
 static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
   BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
 };
 
-typedef struct {
-  tran_low_t *coeff_buf[MAX_MB_PLANE];
-  tran_low_t *qcoeff_buf[MAX_MB_PLANE];
-  tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
-} PC_TREE_SHARED_BUFFERS;
-
-static AOM_INLINE void alloc_mode_context(AV1_COMMON *cm, int num_pix,
-                                          PICK_MODE_CONTEXT *ctx,
-                                          PC_TREE_SHARED_BUFFERS *shared_bufs) {
-  const int num_planes = av1_num_planes(cm);
-  int i;
-  const int num_blk = num_pix / 16;
-  ctx->num_4x4_blk = num_blk;
-
-  CHECK_MEM_ERROR(cm, ctx->blk_skip,
-                  aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
-  CHECK_MEM_ERROR(cm, ctx->tx_type_map,
-                  aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
-  for (i = 0; i < num_planes; ++i) {
-    ctx->coeff[i] = shared_bufs->coeff_buf[i];
-    ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
-    ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i];
-    CHECK_MEM_ERROR(cm, ctx->eobs[i],
-                    aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
-    CHECK_MEM_ERROR(
-        cm, ctx->txb_entropy_ctx[i],
-        aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
-  }
-
-  if (num_pix <= MAX_PALETTE_SQUARE) {
-    for (i = 0; i < 2; ++i) {
-      CHECK_MEM_ERROR(
-          cm, ctx->color_index_map[i],
-          aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
-    }
-  }
-}
-
-static AOM_INLINE void free_mode_context(PICK_MODE_CONTEXT *ctx,
-                                         const int num_planes) {
-  int i;
-  aom_free(ctx->blk_skip);
-  ctx->blk_skip = 0;
-  aom_free(ctx->tx_type_map);
-  ctx->tx_type_map = 0;
-  for (i = 0; i < num_planes; ++i) {
-    ctx->coeff[i] = 0;
-    ctx->qcoeff[i] = 0;
-    ctx->dqcoeff[i] = 0;
-    aom_free(ctx->eobs[i]);
-    ctx->eobs[i] = 0;
-    aom_free(ctx->txb_entropy_ctx[i]);
-    ctx->txb_entropy_ctx[i] = 0;
-  }
-
-  for (i = 0; i < 2; ++i) {
-    aom_free(ctx->color_index_map[i]);
-    ctx->color_index_map[i] = 0;
-  }
-}
-
-static AOM_INLINE void alloc_tree_contexts(
-    AV1_COMMON *cm, PC_TREE *tree, int num_pix, int is_leaf,
-    PC_TREE_SHARED_BUFFERS *shared_bufs) {
-  alloc_mode_context(cm, num_pix, &tree->none, shared_bufs);
-
-  if (is_leaf) return;
-
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 2, &tree->vertical[0], shared_bufs);
-
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 2, &tree->vertical[1], shared_bufs);
-
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2], shared_bufs);
-
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2], shared_bufs);
-
-  alloc_mode_context(cm, num_pix / 4, &tree->verticala[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticala[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 2, &tree->verticala[2], shared_bufs);
-
-  alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2], shared_bufs);
-
-  for (int i = 0; i < 4; ++i) {
-    alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i], shared_bufs);
-    alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i], shared_bufs);
-  }
-}
-
-static AOM_INLINE void free_tree_contexts(PC_TREE *tree, const int num_planes) {
-  int i;
-  for (i = 0; i < 3; i++) {
-    free_mode_context(&tree->horizontala[i], num_planes);
-    free_mode_context(&tree->horizontalb[i], num_planes);
-    free_mode_context(&tree->verticala[i], num_planes);
-    free_mode_context(&tree->verticalb[i], num_planes);
-  }
-  for (i = 0; i < 4; ++i) {
-    free_mode_context(&tree->horizontal4[i], num_planes);
-    free_mode_context(&tree->vertical4[i], num_planes);
-  }
-  free_mode_context(&tree->none, num_planes);
-  free_mode_context(&tree->horizontal[0], num_planes);
-  free_mode_context(&tree->horizontal[1], num_planes);
-  free_mode_context(&tree->vertical[0], num_planes);
-  free_mode_context(&tree->vertical[1], num_planes);
-}
-
-// This function will compute the number of pc_tree nodes to be allocated
-// or freed as per the super block size of BLOCK_128X128 or BLOCK_64X64
-static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
-                                        int stat_generation_stage) {
-  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
-  const int tree_nodes =
-      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
-  return tree_nodes;
-}
-
-// This function sets up a tree of contexts such that at each square
-// partition level. There are contexts for none, horizontal, vertical, and
-// split.  Along with a block_size value and a selected block_size which
-// represents the state of our search.
-void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
-  AV1_COMMON *const cm = &cpi->common;
-  int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
-  const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
-  const int tree_nodes =
-      get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
-  int pc_tree_index = 0;
-  PC_TREE *this_pc;
-  PC_TREE_SHARED_BUFFERS shared_bufs;
-  int square_index = 1;
-  int nodes;
-
-  aom_free(td->pc_tree);
-  CHECK_MEM_ERROR(cm, td->pc_tree,
-                  aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
-  this_pc = &td->pc_tree[0];
-
-  for (i = 0; i < 3; i++) {
-    const int max_num_pix = MAX_SB_SIZE * MAX_SB_SIZE;
-    CHECK_MEM_ERROR(cm, td->tree_coeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
-    CHECK_MEM_ERROR(cm, td->tree_qcoeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
-    CHECK_MEM_ERROR(cm, td->tree_dqcoeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
-    shared_bufs.coeff_buf[i] = td->tree_coeff_buf[i];
-    shared_bufs.qcoeff_buf[i] = td->tree_qcoeff_buf[i];
-    shared_bufs.dqcoeff_buf[i] = td->tree_dqcoeff_buf[i];
-  }
-
-  if (!stat_generation_stage) {
-    const int leaf_factor = is_sb_size_128 ? 4 : 1;
-    const int leaf_nodes = 256 * leaf_factor;
-
-    // Sets up all the leaf nodes in the tree.
-    for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
-      PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-      tree->block_size = square[0];
-      alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
-    }
-
-    // Each node has 4 leaf nodes, fill each block_size level of the tree
-    // from leafs to the root.
-    for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
-      for (i = 0; i < nodes; ++i) {
-        PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-        alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0,
-                            &shared_bufs);
-        tree->block_size = square[square_index];
-        for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
-        ++pc_tree_index;
-      }
-      ++square_index;
-    }
-  } else {
-    // Allocation for firstpass/LAP stage
-    // TODO(Mufaddal): refactor square_index to use a common block_size macro
-    // from firstpass.c
-    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-    square_index = 2;
-    alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 1, &shared_bufs);
-    tree->block_size = square[square_index];
-  }
-
-  // Set up the root node for the applicable superblock size
-  td->pc_root = &td->pc_tree[tree_nodes - 1];
-#if CONFIG_INTERNAL_STATS
-  td->pc_root->none.best_mode_index = THR_INVALID;
-#endif  // CONFIG_INTERNAL_STATS
-}
-
-void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td,
-                      const int num_planes, BLOCK_SIZE sb_size) {
-  int stat_generation_stage = is_stat_generation_stage(cpi);
-  if (td->pc_tree != NULL) {
-    const int is_sb_size_128 = sb_size == BLOCK_128X128;
-    const int tree_nodes =
-        get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
-    for (int i = 0; i < tree_nodes; ++i) {
-      free_tree_contexts(&td->pc_tree[i], num_planes);
-    }
-    for (int i = 0; i < 3; ++i) {
-      aom_free(td->tree_coeff_buf[i]);
-      aom_free(td->tree_qcoeff_buf[i]);
-      aom_free(td->tree_dqcoeff_buf[i]);
-      td->tree_coeff_buf[i] = NULL;
-      td->tree_qcoeff_buf[i] = NULL;
-      td->tree_dqcoeff_buf[i] = NULL;
-    }
-    aom_free(td->pc_tree);
-    td->pc_tree = NULL;
-  }
-}
-
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx) {
   dst_ctx->mic = src_ctx->mic;
@@ -261,8 +39,239 @@
 
   dst_ctx->rd_stats = src_ctx->rd_stats;
   dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+}
 
-  memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES);
+void av1_setup_shared_coeff_buffer(AV1_COMMON *cm,
+                                   PC_TREE_SHARED_BUFFERS *shared_bufs) {
+  for (int i = 0; i < 3; i++) {
+    const int max_num_pix = MAX_SB_SIZE * MAX_SB_SIZE;
+    CHECK_MEM_ERROR(cm, shared_bufs->coeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    CHECK_MEM_ERROR(cm, shared_bufs->qcoeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    CHECK_MEM_ERROR(cm, shared_bufs->dqcoeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+  }
+}
 
-  dst_ctx->partition = src_ctx->partition;
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs) {
+  for (int i = 0; i < 3; i++) {
+    aom_free(shared_bufs->coeff_buf[i]);
+    aom_free(shared_bufs->qcoeff_buf[i]);
+    aom_free(shared_bufs->dqcoeff_buf[i]);
+    shared_bufs->coeff_buf[i] = NULL;
+    shared_bufs->qcoeff_buf[i] = NULL;
+    shared_bufs->dqcoeff_buf[i] = NULL;
+  }
+}
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const AV1_COMMON *cm, BLOCK_SIZE bsize,
+                                 PC_TREE_SHARED_BUFFERS *shared_bufs) {
+  PICK_MODE_CONTEXT *ctx = NULL;
+  struct aom_internal_error_info error;
+
+  AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx)));
+  ctx->rd_mode_is_ready = 0;
+
+  const int num_planes = av1_num_planes(cm);
+  const int num_pix = block_size_wide[bsize] * block_size_high[bsize];
+  const int num_blk = num_pix / 16;
+
+  AOM_CHECK_MEM_ERROR(&error, ctx->blk_skip,
+                      aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
+  AOM_CHECK_MEM_ERROR(&error, ctx->tx_type_map,
+                      aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
+  ctx->num_4x4_blk = num_blk;
+
+  for (int i = 0; i < num_planes; ++i) {
+    ctx->coeff[i] = shared_bufs->coeff_buf[i];
+    ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
+    ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i];
+    AOM_CHECK_MEM_ERROR(&error, ctx->eobs[i],
+                        aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+    AOM_CHECK_MEM_ERROR(
+        &error, ctx->txb_entropy_ctx[i],
+        aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+  }
+
+  if (num_pix <= MAX_PALETTE_SQUARE) {
+    for (int i = 0; i < 2; ++i) {
+      AOM_CHECK_MEM_ERROR(
+          &error, ctx->color_index_map[i],
+          aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+    }
+  }
+
+  av1_invalid_rd_stats(&ctx->rd_stats);
+
+  return ctx;
+}
+
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes) {
+  if (ctx == NULL) return;
+
+  aom_free(ctx->blk_skip);
+  ctx->blk_skip = NULL;
+  aom_free(ctx->tx_type_map);
+  for (int i = 0; i < num_planes; ++i) {
+    ctx->coeff[i] = NULL;
+    ctx->qcoeff[i] = NULL;
+    ctx->dqcoeff[i] = NULL;
+    aom_free(ctx->eobs[i]);
+    ctx->eobs[i] = NULL;
+    aom_free(ctx->txb_entropy_ctx[i]);
+    ctx->txb_entropy_ctx[i] = NULL;
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    aom_free(ctx->color_index_map[i]);
+    ctx->color_index_map[i] = NULL;
+  }
+
+  aom_free(ctx);
+}
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) {
+  PC_TREE *pc_tree = NULL;
+  struct aom_internal_error_info error;
+
+  AOM_CHECK_MEM_ERROR(&error, pc_tree, aom_calloc(1, sizeof(*pc_tree)));
+
+  pc_tree->partitioning = PARTITION_NONE;
+  pc_tree->block_size = bsize;
+  pc_tree->index = 0;
+
+  pc_tree->none = NULL;
+  for (int i = 0; i < 2; ++i) {
+    pc_tree->horizontal[i] = NULL;
+    pc_tree->vertical[i] = NULL;
+  }
+  for (int i = 0; i < 3; ++i) {
+    pc_tree->horizontala[i] = NULL;
+    pc_tree->horizontalb[i] = NULL;
+    pc_tree->verticala[i] = NULL;
+    pc_tree->verticalb[i] = NULL;
+  }
+  for (int i = 0; i < 4; ++i) {
+    pc_tree->horizontal4[i] = NULL;
+    pc_tree->vertical4[i] = NULL;
+    pc_tree->split[i] = NULL;
+  }
+
+  return pc_tree;
+}
+
+#define FREE_PMC_NODE(CTX)         \
+  do {                             \
+    av1_free_pmc(CTX, num_planes); \
+    CTX = NULL;                    \
+  } while (0)
+
+void av1_free_pc_tree_recursive(PC_TREE *pc_tree, int num_planes, int keep_best,
+                                int keep_none) {
+  if (pc_tree == NULL) return;
+
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+
+  if (!keep_none && (!keep_best || (partition != PARTITION_NONE)))
+    FREE_PMC_NODE(pc_tree->none);
+
+  for (int i = 0; i < 2; ++i) {
+    if (!keep_best || (partition != PARTITION_HORZ))
+      FREE_PMC_NODE(pc_tree->horizontal[i]);
+    if (!keep_best || (partition != PARTITION_VERT))
+      FREE_PMC_NODE(pc_tree->vertical[i]);
+  }
+  for (int i = 0; i < 3; ++i) {
+    if (!keep_best || (partition != PARTITION_HORZ_A))
+      FREE_PMC_NODE(pc_tree->horizontala[i]);
+    if (!keep_best || (partition != PARTITION_HORZ_B))
+      FREE_PMC_NODE(pc_tree->horizontalb[i]);
+    if (!keep_best || (partition != PARTITION_VERT_A))
+      FREE_PMC_NODE(pc_tree->verticala[i]);
+    if (!keep_best || (partition != PARTITION_VERT_B))
+      FREE_PMC_NODE(pc_tree->verticalb[i]);
+  }
+  for (int i = 0; i < 4; ++i) {
+    if (!keep_best || (partition != PARTITION_HORZ_4))
+      FREE_PMC_NODE(pc_tree->horizontal4[i]);
+    if (!keep_best || (partition != PARTITION_VERT_4))
+      FREE_PMC_NODE(pc_tree->vertical4[i]);
+  }
+
+  if (!keep_best || (partition != PARTITION_SPLIT)) {
+    for (int i = 0; i < 4; ++i) {
+      if (pc_tree->split[i] != NULL) {
+        av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0);
+        pc_tree->split[i] = NULL;
+      }
+    }
+  }
+
+  if (!keep_best && !keep_none) aom_free(pc_tree);
+}
+
+static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
+                                        int stat_generation_stage) {
+  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+  return tree_nodes;
+}
+
+void av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  int sms_tree_index = 0;
+  SIMPLE_MOTION_DATA_TREE *this_sms;
+  int square_index = 1;
+  int nodes;
+
+  aom_free(td->sms_tree);
+  CHECK_MEM_ERROR(cm, td->sms_tree,
+                  aom_calloc(tree_nodes, sizeof(*td->sms_tree)));
+  this_sms = &td->sms_tree[0];
+
+  if (!stat_generation_stage) {
+    const int leaf_factor = is_sb_size_128 ? 4 : 1;
+    const int leaf_nodes = 256 * leaf_factor;
+
+    // Sets up all the leaf nodes in the tree.
+    for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+      SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+      tree->block_size = square[0];
+    }
+
+    // Each node has 4 leaf nodes, fill each block_size level of the tree
+    // from leafs to the root.
+    for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+      for (int i = 0; i < nodes; ++i) {
+        SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+        tree->block_size = square[square_index];
+        for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+        ++sms_tree_index;
+      }
+      ++square_index;
+    }
+  } else {
+    // Allocation for firstpass/LAP stage
+    // TODO(Mufaddal): refactor square_index to use a common block_size macro
+    // from firstpass.c
+    SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+    square_index = 2;
+    tree->block_size = square[square_index];
+  }
+
+  // Set up the root node for the largest superblock size
+  td->sms_root = &td->sms_tree[tree_nodes - 1];
+}
+
+void av1_free_sms_tree(ThreadData *td) {
+  if (td->sms_tree != NULL) {
+    aom_free(td->sms_tree);
+    td->sms_tree = NULL;
+  }
 }
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index a399794..f243233 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -25,8 +25,14 @@
 struct AV1Common;
 struct ThreadData;
 
-// Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
+  tran_low_t *coeff_buf[MAX_MB_PLANE];
+  tran_low_t *qcoeff_buf[MAX_MB_PLANE];
+  tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
+} PC_TREE_SHARED_BUFFERS;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct PICK_MODE_CONTEXT {
   MB_MODE_INFO mic;
   MB_MODE_INFO_EXT_FRAME mbmi_ext_best;
   uint8_t *color_index_map[2];
@@ -54,27 +60,28 @@
 
   int rd_mode_is_ready;  // Flag to indicate whether rd pick mode decision has
                          // been made.
-
-  // motion vector cache for adaptive motion search control in partition
-  // search loop
-  MV pred_mv[REF_FRAMES];
-  PARTITION_TYPE partition;
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
   PARTITION_TYPE partitioning;
   BLOCK_SIZE block_size;
-  PICK_MODE_CONTEXT none;
-  PICK_MODE_CONTEXT horizontal[2];
-  PICK_MODE_CONTEXT vertical[2];
-  PICK_MODE_CONTEXT horizontala[3];
-  PICK_MODE_CONTEXT horizontalb[3];
-  PICK_MODE_CONTEXT verticala[3];
-  PICK_MODE_CONTEXT verticalb[3];
-  PICK_MODE_CONTEXT horizontal4[4];
-  PICK_MODE_CONTEXT vertical4[4];
+  PICK_MODE_CONTEXT *none;
+  PICK_MODE_CONTEXT *horizontal[2];
+  PICK_MODE_CONTEXT *vertical[2];
+  PICK_MODE_CONTEXT *horizontala[3];
+  PICK_MODE_CONTEXT *horizontalb[3];
+  PICK_MODE_CONTEXT *verticala[3];
+  PICK_MODE_CONTEXT *verticalb[3];
+  PICK_MODE_CONTEXT *horizontal4[4];
+  PICK_MODE_CONTEXT *vertical4[4];
   struct PC_TREE *split[4];
   int index;
+} PC_TREE;
+
+typedef struct SIMPLE_MOTION_DATA_TREE {
+  BLOCK_SIZE block_size;
+  PARTITION_TYPE partitioning;
+  struct SIMPLE_MOTION_DATA_TREE *split[4];
 
   // Simple motion search_features
   FULLPEL_MV start_mvs[REF_FRAMES];
@@ -82,14 +89,25 @@
   unsigned int sms_rect_feat[8];
   int sms_none_valid;
   int sms_rect_valid;
-} PC_TREE;
+} SIMPLE_MOTION_DATA_TREE;
 
-void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
-void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td,
-                      const int num_planes, BLOCK_SIZE sb_size);
+void av1_setup_shared_coeff_buffer(AV1_COMMON *cm,
+                                   PC_TREE_SHARED_BUFFERS *shared_bufs);
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs);
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize);
+void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best,
+                                int keep_none);
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const AV1_COMMON *cm, BLOCK_SIZE bsize,
+                                 PC_TREE_SHARED_BUFFERS *shared_bufs);
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes);
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx);
 
+void av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_sms_tree(struct ThreadData *td);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/corner_match.c b/av1/encoder/corner_match.c
index 12f633b..c2f1b2e 100644
--- a/av1/encoder/corner_match.c
+++ b/av1/encoder/corner_match.c
@@ -141,20 +141,20 @@
   }
 }
 
-int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
-                                 int num_frm_corners, unsigned char *ref,
+int av1_determine_correspondence(unsigned char *src, int *src_corners,
+                                 int num_src_corners, unsigned char *ref,
                                  int *ref_corners, int num_ref_corners,
-                                 int width, int height, int frm_stride,
+                                 int width, int height, int src_stride,
                                  int ref_stride, int *correspondence_pts) {
   // TODO(sarahparker) Improve this to include 2-way match
   int i, j;
   Correspondence *correspondences = (Correspondence *)correspondence_pts;
   int num_correspondences = 0;
-  for (i = 0; i < num_frm_corners; ++i) {
+  for (i = 0; i < num_src_corners; ++i) {
     double best_match_ncc = 0.0;
     double template_norm;
     int best_match_j = -1;
-    if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width,
+    if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width,
                            height))
       continue;
     for (j = 0; j < num_ref_corners; ++j) {
@@ -162,12 +162,12 @@
       if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
                              height))
         continue;
-      if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1],
+      if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1],
                                 ref_corners[2 * j], ref_corners[2 * j + 1],
                                 width, height))
         continue;
       match_ncc = av1_compute_cross_correlation(
-          frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref,
+          src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref,
           ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
       if (match_ncc > best_match_ncc) {
         best_match_ncc = match_ncc;
@@ -177,18 +177,18 @@
     // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
     // but need to account for the normalization in
     // av1_compute_cross_correlation.
-    template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
-                                     frm_corners[2 * i + 1]);
+    template_norm = compute_variance(src, src_stride, src_corners[2 * i],
+                                     src_corners[2 * i + 1]);
     if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
-      correspondences[num_correspondences].x = frm_corners[2 * i];
-      correspondences[num_correspondences].y = frm_corners[2 * i + 1];
+      correspondences[num_correspondences].x = src_corners[2 * i];
+      correspondences[num_correspondences].y = src_corners[2 * i + 1];
       correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
       correspondences[num_correspondences].ry =
           ref_corners[2 * best_match_j + 1];
       num_correspondences++;
     }
   }
-  improve_correspondence(frm, ref, width, height, frm_stride, ref_stride,
+  improve_correspondence(src, ref, width, height, src_stride, ref_stride,
                          correspondences, num_correspondences);
   return num_correspondences;
 }
diff --git a/av1/encoder/corner_match.h b/av1/encoder/corner_match.h
index 3cf6de1..45c90f3 100644
--- a/av1/encoder/corner_match.h
+++ b/av1/encoder/corner_match.h
@@ -24,10 +24,10 @@
   int rx, ry;
 } Correspondence;
 
-int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
-                                 int num_frm_corners, unsigned char *ref,
+int av1_determine_correspondence(unsigned char *src, int *src_corners,
+                                 int num_src_corners, unsigned char *ref,
                                  int *ref_corners, int num_ref_corners,
-                                 int width, int height, int frm_stride,
+                                 int width, int height, int src_stride,
                                  int ref_stride, int *correspondence_pts);
 
 #endif  // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/av1/encoder/enc_enums.h b/av1/encoder/enc_enums.h
index 5a06514..319e5d0 100644
--- a/av1/encoder/enc_enums.h
+++ b/av1/encoder/enc_enums.h
@@ -68,132 +68,132 @@
   THR_COMP_NEAREST_NEARESTLG,
   THR_COMP_NEAREST_NEARESTBA,
 
-  THR_COMP_NEAR_NEARLA,
-  THR_COMP_NEW_NEARESTLA,
-  THR_COMP_NEAREST_NEWLA,
-  THR_COMP_NEW_NEARLA,
-  THR_COMP_NEAR_NEWLA,
-  THR_COMP_NEW_NEWLA,
-  THR_COMP_GLOBAL_GLOBALLA,
-
-  THR_COMP_NEAR_NEARL2A,
-  THR_COMP_NEW_NEARESTL2A,
-  THR_COMP_NEAREST_NEWL2A,
-  THR_COMP_NEW_NEARL2A,
-  THR_COMP_NEAR_NEWL2A,
-  THR_COMP_NEW_NEWL2A,
-  THR_COMP_GLOBAL_GLOBALL2A,
-
-  THR_COMP_NEAR_NEARL3A,
-  THR_COMP_NEW_NEARESTL3A,
-  THR_COMP_NEAREST_NEWL3A,
-  THR_COMP_NEW_NEARL3A,
-  THR_COMP_NEAR_NEWL3A,
-  THR_COMP_NEW_NEWL3A,
-  THR_COMP_GLOBAL_GLOBALL3A,
-
-  THR_COMP_NEAR_NEARGA,
-  THR_COMP_NEW_NEARESTGA,
-  THR_COMP_NEAREST_NEWGA,
-  THR_COMP_NEW_NEARGA,
-  THR_COMP_NEAR_NEWGA,
-  THR_COMP_NEW_NEWGA,
-  THR_COMP_GLOBAL_GLOBALGA,
-
   THR_COMP_NEAR_NEARLB,
+  THR_COMP_NEW_NEWLB,
   THR_COMP_NEW_NEARESTLB,
   THR_COMP_NEAREST_NEWLB,
   THR_COMP_NEW_NEARLB,
   THR_COMP_NEAR_NEWLB,
-  THR_COMP_NEW_NEWLB,
   THR_COMP_GLOBAL_GLOBALLB,
 
+  THR_COMP_NEAR_NEARLA,
+  THR_COMP_NEW_NEWLA,
+  THR_COMP_NEW_NEARESTLA,
+  THR_COMP_NEAREST_NEWLA,
+  THR_COMP_NEW_NEARLA,
+  THR_COMP_NEAR_NEWLA,
+  THR_COMP_GLOBAL_GLOBALLA,
+
+  THR_COMP_NEAR_NEARL2A,
+  THR_COMP_NEW_NEWL2A,
+  THR_COMP_NEW_NEARESTL2A,
+  THR_COMP_NEAREST_NEWL2A,
+  THR_COMP_NEW_NEARL2A,
+  THR_COMP_NEAR_NEWL2A,
+  THR_COMP_GLOBAL_GLOBALL2A,
+
+  THR_COMP_NEAR_NEARL3A,
+  THR_COMP_NEW_NEWL3A,
+  THR_COMP_NEW_NEARESTL3A,
+  THR_COMP_NEAREST_NEWL3A,
+  THR_COMP_NEW_NEARL3A,
+  THR_COMP_NEAR_NEWL3A,
+  THR_COMP_GLOBAL_GLOBALL3A,
+
+  THR_COMP_NEAR_NEARGA,
+  THR_COMP_NEW_NEWGA,
+  THR_COMP_NEW_NEARESTGA,
+  THR_COMP_NEAREST_NEWGA,
+  THR_COMP_NEW_NEARGA,
+  THR_COMP_NEAR_NEWGA,
+  THR_COMP_GLOBAL_GLOBALGA,
+
   THR_COMP_NEAR_NEARL2B,
+  THR_COMP_NEW_NEWL2B,
   THR_COMP_NEW_NEARESTL2B,
   THR_COMP_NEAREST_NEWL2B,
   THR_COMP_NEW_NEARL2B,
   THR_COMP_NEAR_NEWL2B,
-  THR_COMP_NEW_NEWL2B,
   THR_COMP_GLOBAL_GLOBALL2B,
 
   THR_COMP_NEAR_NEARL3B,
+  THR_COMP_NEW_NEWL3B,
   THR_COMP_NEW_NEARESTL3B,
   THR_COMP_NEAREST_NEWL3B,
   THR_COMP_NEW_NEARL3B,
   THR_COMP_NEAR_NEWL3B,
-  THR_COMP_NEW_NEWL3B,
   THR_COMP_GLOBAL_GLOBALL3B,
 
   THR_COMP_NEAR_NEARGB,
+  THR_COMP_NEW_NEWGB,
   THR_COMP_NEW_NEARESTGB,
   THR_COMP_NEAREST_NEWGB,
   THR_COMP_NEW_NEARGB,
   THR_COMP_NEAR_NEWGB,
-  THR_COMP_NEW_NEWGB,
   THR_COMP_GLOBAL_GLOBALGB,
 
   THR_COMP_NEAR_NEARLA2,
+  THR_COMP_NEW_NEWLA2,
   THR_COMP_NEW_NEARESTLA2,
   THR_COMP_NEAREST_NEWLA2,
   THR_COMP_NEW_NEARLA2,
   THR_COMP_NEAR_NEWLA2,
-  THR_COMP_NEW_NEWLA2,
   THR_COMP_GLOBAL_GLOBALLA2,
 
   THR_COMP_NEAR_NEARL2A2,
+  THR_COMP_NEW_NEWL2A2,
   THR_COMP_NEW_NEARESTL2A2,
   THR_COMP_NEAREST_NEWL2A2,
   THR_COMP_NEW_NEARL2A2,
   THR_COMP_NEAR_NEWL2A2,
-  THR_COMP_NEW_NEWL2A2,
   THR_COMP_GLOBAL_GLOBALL2A2,
 
   THR_COMP_NEAR_NEARL3A2,
+  THR_COMP_NEW_NEWL3A2,
   THR_COMP_NEW_NEARESTL3A2,
   THR_COMP_NEAREST_NEWL3A2,
   THR_COMP_NEW_NEARL3A2,
   THR_COMP_NEAR_NEWL3A2,
-  THR_COMP_NEW_NEWL3A2,
   THR_COMP_GLOBAL_GLOBALL3A2,
 
   THR_COMP_NEAR_NEARGA2,
+  THR_COMP_NEW_NEWGA2,
   THR_COMP_NEW_NEARESTGA2,
   THR_COMP_NEAREST_NEWGA2,
   THR_COMP_NEW_NEARGA2,
   THR_COMP_NEAR_NEWGA2,
-  THR_COMP_NEW_NEWGA2,
   THR_COMP_GLOBAL_GLOBALGA2,
 
   THR_COMP_NEAR_NEARLL2,
+  THR_COMP_NEW_NEWLL2,
   THR_COMP_NEW_NEARESTLL2,
   THR_COMP_NEAREST_NEWLL2,
   THR_COMP_NEW_NEARLL2,
   THR_COMP_NEAR_NEWLL2,
-  THR_COMP_NEW_NEWLL2,
   THR_COMP_GLOBAL_GLOBALLL2,
 
   THR_COMP_NEAR_NEARLL3,
+  THR_COMP_NEW_NEWLL3,
   THR_COMP_NEW_NEARESTLL3,
   THR_COMP_NEAREST_NEWLL3,
   THR_COMP_NEW_NEARLL3,
   THR_COMP_NEAR_NEWLL3,
-  THR_COMP_NEW_NEWLL3,
   THR_COMP_GLOBAL_GLOBALLL3,
 
   THR_COMP_NEAR_NEARLG,
+  THR_COMP_NEW_NEWLG,
   THR_COMP_NEW_NEARESTLG,
   THR_COMP_NEAREST_NEWLG,
   THR_COMP_NEW_NEARLG,
   THR_COMP_NEAR_NEWLG,
-  THR_COMP_NEW_NEWLG,
   THR_COMP_GLOBAL_GLOBALLG,
 
   THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEWBA,
   THR_COMP_NEW_NEARESTBA,
   THR_COMP_NEAREST_NEWBA,
   THR_COMP_NEW_NEARBA,
   THR_COMP_NEAR_NEWBA,
-  THR_COMP_NEW_NEWBA,
   THR_COMP_GLOBAL_GLOBALBA,
 
   THR_DC,
@@ -248,6 +248,17 @@
   MAX_REFS
 } UENUM1BYTE(THR_MODES_SUB8X8);
 
+enum {
+  FULL_TXFM_RD,
+  LOW_TXFM_RD,
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+  USE_FULL_RD = 0,
+  USE_FAST_RD,
+  USE_LARGESTALL,
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 326ecc0..975f3bd 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -11,6 +11,7 @@
 
 #include <stdint.h>
 
+#include "av1/common/blockd.h"
 #include "config/aom_config.h"
 #include "config/aom_scale_rtcd.h"
 
@@ -34,81 +35,85 @@
 #include "av1/encoder/temporal_filter.h"
 #include "av1/encoder/tpl_model.h"
 
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
 #define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
 
-void av1_configure_buffer_updates(AV1_COMP *const cpi,
-                                  EncodeFrameParams *const frame_params,
-                                  const FRAME_UPDATE_TYPE type,
-                                  int force_refresh_all) {
+static INLINE void set_refresh_frame_flags(
+    RefreshFrameFlagsInfo *const refresh_frame_flags, bool refresh_gf,
+    bool refresh_bwdref, bool refresh_arf) {
+  refresh_frame_flags->golden_frame = refresh_gf;
+  refresh_frame_flags->bwd_ref_frame = refresh_bwdref;
+  refresh_frame_flags->alt_ref_frame = refresh_arf;
+}
+
+void av1_configure_buffer_updates(
+    AV1_COMP *const cpi, RefreshFrameFlagsInfo *const refresh_frame_flags,
+    const FRAME_UPDATE_TYPE type, const FRAME_TYPE frame_type,
+    int force_refresh_all) {
   // NOTE(weitinglin): Should we define another function to take care of
   // cpi->rc.is_$Source_Type to make this function as it is in the comment?
 
-  const ExternalFlags *const ext_flags = &cpi->ext_flags;
+  const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &cpi->ext_flags.refresh_frame;
   cpi->rc.is_src_frame_alt_ref = 0;
 
   switch (type) {
     case KF_UPDATE:
-      frame_params->refresh_golden_frame = 1;
-      frame_params->refresh_bwd_ref_frame = 1;
-      frame_params->refresh_alt_ref_frame = 1;
+      set_refresh_frame_flags(refresh_frame_flags, true, true, true);
       break;
 
     case LF_UPDATE:
-      frame_params->refresh_golden_frame = 0;
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 0;
+      set_refresh_frame_flags(refresh_frame_flags, false, false, false);
       break;
 
     case GF_UPDATE:
-      frame_params->refresh_golden_frame = 1;
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 0;
+      set_refresh_frame_flags(refresh_frame_flags, true, false, false);
       break;
 
     case OVERLAY_UPDATE:
-      frame_params->refresh_golden_frame = 1;
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 0;
-
+      if (frame_type == KEY_FRAME && cpi->rc.frames_to_key == 0) {
+        set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+      } else {
+        set_refresh_frame_flags(refresh_frame_flags, true, false, false);
+      }
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
     case ARF_UPDATE:
-      frame_params->refresh_golden_frame = 0;
       // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 1;
+      if (frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
+        // TODO(bohanli): consider moving this to force_refresh_all?
+        // This is Keyframe as arf
+        set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+      } else {
+        set_refresh_frame_flags(refresh_frame_flags, false, false, true);
+      }
       break;
 
     case INTNL_OVERLAY_UPDATE:
-      frame_params->refresh_golden_frame = 0;
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 0;
-
+      set_refresh_frame_flags(refresh_frame_flags, false, false, false);
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
     case INTNL_ARF_UPDATE:
-      frame_params->refresh_golden_frame = 0;
-      frame_params->refresh_bwd_ref_frame = 1;
-      frame_params->refresh_alt_ref_frame = 0;
+      set_refresh_frame_flags(refresh_frame_flags, false, true, false);
       break;
 
     default: assert(0); break;
   }
 
-  if (ext_flags->refresh_frame_flags_pending &&
-      (!is_stat_generation_stage(cpi))) {
-    frame_params->refresh_golden_frame = ext_flags->refresh_golden_frame;
-    frame_params->refresh_alt_ref_frame = ext_flags->refresh_alt_ref_frame;
-    frame_params->refresh_bwd_ref_frame = ext_flags->refresh_bwd_ref_frame;
-  }
+  if (ext_refresh_frame_flags->update_pending &&
+      (!is_stat_generation_stage(cpi)))
+    set_refresh_frame_flags(refresh_frame_flags,
+                            ext_refresh_frame_flags->golden_frame,
+                            ext_refresh_frame_flags->bwd_ref_frame,
+                            ext_refresh_frame_flags->alt_ref_frame);
 
-  if (force_refresh_all) {
-    frame_params->refresh_golden_frame = 1;
-    frame_params->refresh_bwd_ref_frame = 1;
-    frame_params->refresh_alt_ref_frame = 1;
-  }
+  if (force_refresh_all)
+    set_refresh_frame_flags(refresh_frame_flags, true, true, true);
 }
 
 static void set_additional_frame_flags(const AV1_COMMON *const cm,
@@ -126,28 +131,24 @@
 
 static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
   if (cpi->common.show_frame) {
-    if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
-        cpi->common.current_frame.frame_type == KEY_FRAME) {
-      // If this is a show_existing_frame with a source other than altref,
-      // or if it is not a displayed forward keyframe, the keyframe update
-      // counters were incremented when it was originally encoded.
-      cpi->rc.frames_since_key++;
-      cpi->rc.frames_to_key--;
-    }
+    cpi->rc.frames_since_key++;
+    cpi->rc.frames_to_key--;
   }
 }
 
-static INLINE int is_frame_droppable(const SVC *const svc,
-                                     const ExternalFlags *const ext_flags) {
+static INLINE int is_frame_droppable(
+    const SVC *const svc,
+    const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
   // Droppable frame is only used by external refresh flags. VoD setting won't
   // trigger its use case.
   if (svc->external_ref_frame_config)
     return svc->non_reference_frame;
-  else if (ext_flags->refresh_frame_flags_pending)
-    return !(ext_flags->refresh_alt_ref_frame ||
-             ext_flags->refresh_alt2_ref_frame ||
-             ext_flags->refresh_bwd_ref_frame ||
-             ext_flags->refresh_golden_frame || ext_flags->refresh_last_frame);
+  else if (ext_refresh_frame_flags->update_pending)
+    return !(ext_refresh_frame_flags->alt_ref_frame ||
+             ext_refresh_frame_flags->alt2_ref_frame ||
+             ext_refresh_frame_flags->bwd_ref_frame ||
+             ext_refresh_frame_flags->golden_frame ||
+             ext_refresh_frame_flags->last_frame);
   else
     return 0;
 }
@@ -158,7 +159,7 @@
   // We should fix the cpi->common.show_frame flag
   // instead of checking the other condition to update the counter properly.
   if (cpi->common.show_frame ||
-      is_frame_droppable(&cpi->svc, &cpi->ext_flags)) {
+      is_frame_droppable(&cpi->svc, &cpi->ext_flags.refresh_frame)) {
     // Decrement count down till next gf
     if (cpi->rc.frames_till_gf_update_due > 0)
       cpi->rc.frames_till_gf_update_due--;
@@ -166,14 +167,8 @@
 }
 
 static INLINE void update_gf_group_index(AV1_COMP *cpi) {
-  // Increment the gf group index ready for the next frame. If this is
-  // a show_existing_frame with a source other than altref, or if it is not
-  // a displayed forward keyframe, the index was incremented when it was
-  // originally encoded.
-  if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
-      cpi->common.current_frame.frame_type == KEY_FRAME) {
-    ++cpi->gf_group.index;
-  }
+  // Increment the gf group index ready for the next frame.
+  ++cpi->gf_group.index;
 }
 
 static void update_rc_counts(AV1_COMP *cpi) {
@@ -236,7 +231,7 @@
 
   const int intra_only = frame_params->frame_type == KEY_FRAME ||
                          frame_params->frame_type == INTRA_ONLY_FRAME;
-  if (intra_only || frame_params->error_resilient_mode || cpi->use_svc ||
+  if (intra_only || frame_params->error_resilient_mode ||
       cpi->ext_flags.use_primary_ref_none) {
     return PRIMARY_REF_NONE;
   }
@@ -247,6 +242,8 @@
   // frame bit allocation.
   if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME);
 
+  if (cpi->use_svc) return av1_svc_primary_ref_frame(cpi);
+
   // Find the most recent reference frame with the same reference type as the
   // current frame
   const int current_ref_type = get_current_frame_ref_type(cpi, frame_params);
@@ -298,18 +295,6 @@
   }
 }
 
-static int get_order_offset(const GF_GROUP *const gf_group,
-                            const EncodeFrameParams *const frame_params) {
-  // shown frame by definition has order offset 0
-  // show_existing_frame ignores order_offset and simply takes the order_hint
-  // from the reference frame being shown.
-  if (frame_params->show_frame || frame_params->show_existing_frame) return 0;
-
-  const int arf_offset =
-      AOMMIN((MAX_GF_INTERVAL - 1), gf_group->arf_src_offset[gf_group->index]);
-  return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset);
-}
-
 static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) {
   TimeStamps *time_stamps = &cpi->time_stamps;
   int64_t this_duration;
@@ -324,14 +309,14 @@
     return;
   }
 
-  if (ts_start == time_stamps->first_ever) {
+  if (ts_start == time_stamps->first_ts_start) {
     this_duration = ts_end - ts_start;
     step = 1;
   } else {
     int64_t last_duration =
-        time_stamps->prev_end_seen - time_stamps->prev_start_seen;
+        time_stamps->prev_ts_end - time_stamps->prev_ts_start;
 
-    this_duration = ts_end - time_stamps->prev_end_seen;
+    this_duration = ts_end - time_stamps->prev_ts_end;
 
     // do a step update if the duration changes by 10%
     if (last_duration)
@@ -346,7 +331,7 @@
       // frame rate. If we haven't seen 1 second yet, then average
       // over the whole interval seen.
       const double interval =
-          AOMMIN((double)(ts_end - time_stamps->first_ever), 10000000.0);
+          AOMMIN((double)(ts_end - time_stamps->first_ts_start), 10000000.0);
       double avg_duration = 10000000.0 / cpi->framerate;
       avg_duration *= (interval - avg_duration + this_duration);
       avg_duration /= interval;
@@ -354,66 +339,8 @@
       av1_new_framerate(cpi, 10000000.0 / avg_duration);
     }
   }
-  time_stamps->prev_start_seen = ts_start;
-  time_stamps->prev_end_seen = ts_end;
-}
-
-// If this is an alt-ref, returns the offset of the source frame used
-// as the arf midpoint. Otherwise, returns 0.
-static int get_arf_src_index(GF_GROUP *gf_group, int pass) {
-  int arf_src_index = 0;
-  if (pass != 1) arf_src_index = gf_group->arf_src_offset[gf_group->index];
-  return arf_src_index;
-}
-
-// Called if this frame is an ARF or ARF2. Also handles forward-keyframes
-// For an ARF set arf2=0, for ARF2 set arf2=1
-// temporal_filtered is set to 1 if we temporally filter the ARF frame, so that
-// the correct post-filter buffer can be used.
-static struct lookahead_entry *setup_arf_frame(
-    AV1_COMP *const cpi, const int arf_src_index, int *code_arf,
-    EncodeFrameParams *const frame_params, int *show_existing_alt_ref) {
-  AV1_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
-#if !CONFIG_REALTIME_ONLY
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-#endif
-
-  assert(arf_src_index <= rc->frames_to_key);
-  *code_arf = 0;
-
-  struct lookahead_entry *source =
-      av1_lookahead_peek(cpi->lookahead, arf_src_index, cpi->compressor_stage);
-
-  if (source != NULL) {
-    cm->showable_frame = 1;
-
-    // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
-    if (arf_src_index == rc->frames_to_key) {
-      // Skip temporal filtering and mark as intra_only if we have a fwd_kf
-      cpi->no_show_kf = 1;
-    } else {
-#if !CONFIG_REALTIME_ONLY
-      if (oxcf->arnr_max_frames > 0) {
-        // Produce the filtered ARF frame.
-        cm->current_frame.frame_type = INTER_FRAME;
-        FRAME_UPDATE_TYPE frame_update_type =
-            get_frame_update_type(&cpi->gf_group);
-        av1_configure_buffer_updates(cpi, frame_params, frame_update_type, 0);
-        *code_arf =
-            av1_temporal_filter(cpi, arf_src_index, show_existing_alt_ref);
-        if (*code_arf) {
-          aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
-        }
-      }
-#else
-      (void)show_existing_alt_ref;
-#endif
-    }
-    frame_params->show_frame = 0;
-  }
-  rc->source_alt_ref_pending = 0;
-  return source;
+  time_stamps->prev_ts_start = ts_start;
+  time_stamps->prev_ts_end = ts_end;
 }
 
 // Determine whether there is a forced keyframe pending in the lookahead buffer
@@ -441,29 +368,44 @@
 // temporal_filtered, flush, and frame_update_type are outputs.
 // Return the frame source, or NULL if we couldn't find one
 static struct lookahead_entry *choose_frame_source(
-    AV1_COMP *const cpi, int *const code_arf, int *const flush,
-    struct lookahead_entry **last_source, EncodeFrameParams *const frame_params,
-    int *show_existing_alt_ref) {
+    AV1_COMP *const cpi, int *const flush, struct lookahead_entry **last_source,
+    EncodeFrameParams *const frame_params) {
   AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
   struct lookahead_entry *source = NULL;
-  *code_arf = 0;
 
-  // Should we encode an alt-ref frame.
-  int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass);
+  // Source index in lookahead buffer.
+  int src_index = gf_group->arf_src_offset[gf_group->index];
+
   // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q
-  if (arf_src_index &&
-      (is_forced_keyframe_pending(cpi->lookahead, arf_src_index,
+  if (src_index &&
+      (is_forced_keyframe_pending(cpi->lookahead, src_index,
                                   cpi->compressor_stage) != -1) &&
-      cpi->oxcf.rc_mode != AOM_Q) {
-    arf_src_index = 0;
+      cpi->oxcf.rc_cfg.mode != AOM_Q) {
+    src_index = 0;
     *flush = 1;
   }
 
-  if (arf_src_index)
-    source = setup_arf_frame(cpi, arf_src_index, code_arf, frame_params,
-                             show_existing_alt_ref);
-
-  if (!source) {
+  // If the current frame is arf, then we should not pop from the lookahead
+  // buffer. If the current frame is not arf, then pop it. This assumes the
+  // first frame in the GF group is not arf. May need to change if it is not
+  // true.
+  int pop_lookahead = (src_index == 0);
+  // If this is a key frame and keyframe filtering is enabled with overlay,
+  // then do not pop.
+  if (pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 &&
+      gf_group->update_type[gf_group->index] == ARF_UPDATE &&
+      !is_stat_generation_stage(cpi) && cpi->lookahead) {
+    if (cpi->lookahead->read_ctxs[cpi->compressor_stage].sz &&
+        (*flush ||
+         cpi->lookahead->read_ctxs[cpi->compressor_stage].sz ==
+             cpi->lookahead->read_ctxs[cpi->compressor_stage].pop_sz)) {
+      pop_lookahead = 0;
+    }
+  }
+  frame_params->show_frame = pop_lookahead;
+  if (pop_lookahead) {
+    // show frame, pop from buffer
     // Get last frame source.
     if (cm->current_frame.frame_number > 0) {
       *last_source =
@@ -471,8 +413,17 @@
     }
     // Read in the source frame.
     source = av1_lookahead_pop(cpi->lookahead, *flush, cpi->compressor_stage);
-    if (source == NULL) return NULL;
-    frame_params->show_frame = 1;
+  } else {
+    // no show frames are arf frames
+    source =
+        av1_lookahead_peek(cpi->lookahead, src_index, cpi->compressor_stage);
+    // When src_index == rc->frames_to_key, it indicates a fwd_kf
+    if (src_index == cpi->rc.frames_to_key && src_index != 0) {
+      cpi->no_show_fwd_kf = 1;
+    }
+    if (source != NULL) {
+      cm->showable_frame = 1;
+    }
   }
   return source;
 }
@@ -489,10 +440,10 @@
   if (lookahead_src == NULL) return 1;
 
   const int is_error_resilient =
-      cpi->oxcf.error_resilient_mode ||
+      cpi->oxcf.tool_cfg.error_resilient_mode ||
       (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
-  const int is_s_frame =
-      cpi->oxcf.s_frame_mode || (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+  const int is_s_frame = cpi->oxcf.kf_cfg.enable_sframe ||
+                         (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
   const int is_key_frame =
       (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY);
   return !(is_error_resilient || is_s_frame) || is_key_frame;
@@ -500,8 +451,11 @@
 
 // Update frame_flags to tell the encoder's caller what sort of frame was
 // encoded.
-static void update_frame_flags(AV1_COMP *cpi, unsigned int *frame_flags) {
-  if (encode_show_existing_frame(&cpi->common)) {
+static void update_frame_flags(
+    const AV1_COMMON *const cm,
+    const RefreshFrameFlagsInfo *const refresh_frame_flags,
+    unsigned int *frame_flags) {
+  if (encode_show_existing_frame(cm)) {
     *frame_flags &= ~FRAMEFLAGS_GOLDEN;
     *frame_flags &= ~FRAMEFLAGS_BWDREF;
     *frame_flags &= ~FRAMEFLAGS_ALTREF;
@@ -509,25 +463,25 @@
     return;
   }
 
-  if (cpi->refresh_golden_frame == 1) {
+  if (refresh_frame_flags->golden_frame) {
     *frame_flags |= FRAMEFLAGS_GOLDEN;
   } else {
     *frame_flags &= ~FRAMEFLAGS_GOLDEN;
   }
 
-  if (cpi->refresh_alt_ref_frame == 1) {
+  if (refresh_frame_flags->alt_ref_frame) {
     *frame_flags |= FRAMEFLAGS_ALTREF;
   } else {
     *frame_flags &= ~FRAMEFLAGS_ALTREF;
   }
 
-  if (cpi->refresh_bwd_ref_frame == 1) {
+  if (refresh_frame_flags->bwd_ref_frame) {
     *frame_flags |= FRAMEFLAGS_BWDREF;
   } else {
     *frame_flags &= ~FRAMEFLAGS_BWDREF;
   }
 
-  if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+  if (cm->current_frame.frame_type == KEY_FRAME) {
     *frame_flags |= FRAMEFLAGS_KEY;
   } else {
     *frame_flags &= ~FRAMEFLAGS_KEY;
@@ -587,11 +541,12 @@
 #endif  // DUMP_REF_FRAME_IMAGES == 1
 
 int av1_get_refresh_ref_frame_map(int refresh_frame_flags) {
-  int ref_map_index = INVALID_IDX;
+  int ref_map_index;
 
   for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index)
     if ((refresh_frame_flags >> ref_map_index) & 1) break;
 
+  if (ref_map_index == REF_FRAMES) ref_map_index = INVALID_IDX;
   return ref_map_index;
 }
 
@@ -632,7 +587,8 @@
 // Update reference frame stack info.
 void av1_update_ref_frame_map(AV1_COMP *cpi,
                               FRAME_UPDATE_TYPE frame_update_type,
-                              int show_existing_frame, int ref_map_index,
+                              FRAME_TYPE frame_type, int show_existing_frame,
+                              int ref_map_index,
                               RefBufferStack *ref_buffer_stack) {
   AV1_COMMON *const cm = &cpi->common;
   // TODO(jingning): Consider the S-frame same as key frame for the
@@ -640,7 +596,7 @@
   // expressed than converting the frame update type.
   if (frame_is_sframe(cm)) frame_update_type = KEY_FRAME;
 
-  if (is_frame_droppable(&cpi->svc, &cpi->ext_flags)) return;
+  if (is_frame_droppable(&cpi->svc, &cpi->ext_flags.refresh_frame)) return;
 
   switch (frame_update_type) {
     case KEY_FRAME:
@@ -672,15 +628,42 @@
       break;
     case ARF_UPDATE:
     case INTNL_ARF_UPDATE:
-      update_arf_stack(ref_map_index, ref_buffer_stack);
+      if (frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
+        stack_reset(ref_buffer_stack->lst_stack,
+                    &ref_buffer_stack->lst_stack_size);
+        stack_reset(ref_buffer_stack->gld_stack,
+                    &ref_buffer_stack->gld_stack_size);
+        stack_reset(ref_buffer_stack->arf_stack,
+                    &ref_buffer_stack->arf_stack_size);
+      } else {
+        update_arf_stack(ref_map_index, ref_buffer_stack);
+      }
       stack_push(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size,
                  ref_map_index);
       break;
     case OVERLAY_UPDATE:
-      ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
-                                &ref_buffer_stack->arf_stack_size);
-      stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size,
-                 ref_map_index);
+      if (frame_type == KEY_FRAME) {
+        ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
+                                  &ref_buffer_stack->arf_stack_size);
+        stack_reset(ref_buffer_stack->lst_stack,
+                    &ref_buffer_stack->lst_stack_size);
+        stack_reset(ref_buffer_stack->gld_stack,
+                    &ref_buffer_stack->gld_stack_size);
+        stack_reset(ref_buffer_stack->arf_stack,
+                    &ref_buffer_stack->arf_stack_size);
+        stack_push(ref_buffer_stack->gld_stack,
+                   &ref_buffer_stack->gld_stack_size, ref_map_index);
+      } else {
+        if (ref_map_index != INVALID_IDX) {
+          update_arf_stack(ref_map_index, ref_buffer_stack);
+          stack_push(ref_buffer_stack->lst_stack,
+                     &ref_buffer_stack->lst_stack_size, ref_map_index);
+        }
+        ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
+                                  &ref_buffer_stack->arf_stack_size);
+        stack_push(ref_buffer_stack->gld_stack,
+                   &ref_buffer_stack->gld_stack_size, ref_map_index);
+      }
       break;
     case INTNL_OVERLAY_UPDATE:
       ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
@@ -727,10 +710,12 @@
                                 FRAME_UPDATE_TYPE frame_update_type,
                                 const RefBufferStack *const ref_buffer_stack) {
   const AV1_COMMON *const cm = &cpi->common;
-  const ExternalFlags *const ext_flags = &cpi->ext_flags;
+  const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &cpi->ext_flags.refresh_frame;
+
   const SVC *const svc = &cpi->svc;
   // Switch frames and shown key-frames overwrite all reference slots
-  if ((frame_params->frame_type == KEY_FRAME && frame_params->show_frame) ||
+  if ((frame_params->frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) ||
       frame_params->frame_type == S_FRAME)
     return 0xFF;
 
@@ -742,11 +727,11 @@
     return 0;
   }
 
-  if (is_frame_droppable(svc, ext_flags)) return 0;
+  if (is_frame_droppable(svc, ext_refresh_frame_flags)) return 0;
 
   int refresh_mask = 0;
 
-  if (ext_flags->refresh_frame_flags_pending) {
+  if (ext_refresh_frame_flags->update_pending) {
     if (svc->external_ref_frame_config) {
       for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
         int ref_frame_map_idx = svc->ref_idx[i];
@@ -759,28 +744,33 @@
     // order to preserve the behaviour of the flag overrides.
     int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
     if (ref_frame_map_idx != INVALID_IDX)
-      refresh_mask |= ext_flags->refresh_last_frame << ref_frame_map_idx;
+      refresh_mask |= ext_refresh_frame_flags->last_frame << ref_frame_map_idx;
 
     ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME);
     if (ref_frame_map_idx != INVALID_IDX)
-      refresh_mask |= ext_flags->refresh_bwd_ref_frame << ref_frame_map_idx;
+      refresh_mask |= ext_refresh_frame_flags->bwd_ref_frame
+                      << ref_frame_map_idx;
 
     ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME);
     if (ref_frame_map_idx != INVALID_IDX)
-      refresh_mask |= ext_flags->refresh_alt2_ref_frame << ref_frame_map_idx;
+      refresh_mask |= ext_refresh_frame_flags->alt2_ref_frame
+                      << ref_frame_map_idx;
 
     if (frame_update_type == OVERLAY_UPDATE) {
       ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
       if (ref_frame_map_idx != INVALID_IDX)
-        refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx;
+        refresh_mask |= ext_refresh_frame_flags->golden_frame
+                        << ref_frame_map_idx;
     } else {
       ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
       if (ref_frame_map_idx != INVALID_IDX)
-        refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx;
+        refresh_mask |= ext_refresh_frame_flags->golden_frame
+                        << ref_frame_map_idx;
 
       ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
       if (ref_frame_map_idx != INVALID_IDX)
-        refresh_mask |= ext_flags->refresh_alt_ref_frame << ref_frame_map_idx;
+        refresh_mask |= ext_refresh_frame_flags->alt_ref_frame
+                        << ref_frame_map_idx;
     }
     return refresh_mask;
   }
@@ -844,7 +834,9 @@
                      ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
       }
       break;
-    case OVERLAY_UPDATE: break;
+    case OVERLAY_UPDATE:
+      if (free_fb_index != INVALID_IDX) refresh_mask = 1 << free_fb_index;
+      break;
     case INTNL_OVERLAY_UPDATE: break;
     default: assert(0); break;
   }
@@ -867,69 +859,116 @@
   set_mi_offsets(&cm->mi_params, xd, 0, 0);
 }
 
-// Apply temporal filtering to key frames and encode the filtered frame.
-// If the current frame is not key frame, this function is identical to
-// av1_encode().
+// Apply temporal filtering to source frames and encode the filtered frame.
+// If the current frame does not require filtering, this function is identical
+// to av1_encode() except that tpl is not performed.
 static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
                               EncodeFrameInput *const frame_input,
                               EncodeFrameParams *const frame_params,
                               EncodeFrameResults *const frame_results) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) start_timing(cpi, denoise_and_encode_time);
+#endif
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
 
   // Decide whether to apply temporal filtering to the source frame.
-  int apply_filtering =
-      frame_params->frame_type == KEY_FRAME &&
-      oxcf->enable_keyframe_filtering && !is_stat_generation_stage(cpi) &&
-      !frame_params->show_existing_frame &&
-      cpi->rc.frames_to_key > TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME &&
-      !is_lossless_requested(oxcf) && oxcf->arnr_max_frames > 0;
-  if (apply_filtering) {
-    const double y_noise_level = av1_estimate_noise_from_single_plane(
-        frame_input->source, 0, cm->seq_params.bit_depth);
-    apply_filtering = y_noise_level > 0;
+  int apply_filtering = 0;
+  int arf_src_index = -1;
+  if (frame_params->frame_type == KEY_FRAME) {
+    // Decide whether it is allowed to perform key frame filtering
+    int allow_kf_filtering =
+        oxcf->kf_cfg.enable_keyframe_filtering &&
+        !is_stat_generation_stage(cpi) && !frame_params->show_existing_frame &&
+        cpi->rc.frames_to_key > cpi->oxcf.algo_cfg.arnr_max_frames &&
+        !is_lossless_requested(&oxcf->rc_cfg) &&
+        oxcf->algo_cfg.arnr_max_frames > 0;
+    if (allow_kf_filtering) {
+      const double y_noise_level = av1_estimate_noise_from_single_plane(
+          frame_input->source, 0, cm->seq_params.bit_depth);
+      apply_filtering = y_noise_level > 0;
+    } else {
+      apply_filtering = 0;
+    }
+    // If we are doing kf filtering, set up a few things.
+    if (apply_filtering) {
+      av1_setup_past_independence(cm);
+      if (!frame_params->show_frame && cpi->no_show_fwd_kf) {
+        // fwd kf
+        arf_src_index = -1 * gf_group->arf_src_offset[gf_group->index];
+      } else if (!frame_params->show_frame) {
+        arf_src_index = 0;
+      } else {
+        arf_src_index = -1;
+      }
+    }
+  } else if (get_frame_update_type(&cpi->gf_group) == ARF_UPDATE ||
+             get_frame_update_type(&cpi->gf_group) == INTNL_ARF_UPDATE) {
+    // ARF
+    apply_filtering = oxcf->algo_cfg.arnr_max_frames > 0;
+    if (apply_filtering) {
+      arf_src_index = gf_group->arf_src_offset[gf_group->index];
+    }
   }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time);
+#endif
   // Save the pointer to the original source image.
-  YV12_BUFFER_CONFIG *source_kf_buffer = frame_input->source;
-
-  // Apply filtering to key frame.
+  YV12_BUFFER_CONFIG *source_buffer = frame_input->source;
+  // apply filtering to frame
   if (apply_filtering) {
-    // Initialization for frame motion estimation.
-    MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-    av1_init_mi_buffers(&cm->mi_params);
-    setup_mi(cpi, frame_input->source);
-    av1_init_macroblockd(cm, xd, NULL);
-    memset(
-        cpi->mbmi_ext_info.frame_base, 0,
-        cpi->mbmi_ext_info.alloc_size * sizeof(*cpi->mbmi_ext_info.frame_base));
-
-    av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
-    av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
-    av1_set_rd_speed_thresholds(cpi);
-    av1_setup_frame_buf_refs(cm);
-    av1_setup_frame_sign_bias(cm);
-    av1_frame_init_quantizer(cpi);
-    av1_setup_past_independence(cm);
-
-    if (!frame_params->show_frame) {
-      int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass);
-      av1_temporal_filter(cpi, -1 * arf_src_index, NULL);
-    } else {
-      av1_temporal_filter(cpi, -1, NULL);
+    int show_existing_alt_ref = 0;
+    // TODO(bohanli): figure out why we need frame_type in cm here.
+    cm->current_frame.frame_type = frame_params->frame_type;
+    const int code_arf =
+        av1_temporal_filter(cpi, arf_src_index, &show_existing_alt_ref);
+    if (code_arf) {
+      aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
+      frame_input->source = &cpi->alt_ref_buffer;
+      aom_copy_metadata_to_frame_buffer(frame_input->source,
+                                        source_buffer->metadata);
     }
-    aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
-    // Use the filtered frame for encoding.
-    frame_input->source = &cpi->alt_ref_buffer;
-    // Copy metadata info to alt-ref buffer.
-    aom_remove_metadata_from_frame_buffer(frame_input->source);
-    aom_copy_metadata_to_frame_buffer(frame_input->source,
-                                      source_kf_buffer->metadata);
+    // Currently INTNL_ARF_UPDATE only do show_existing.
+    if (get_frame_update_type(&cpi->gf_group) == ARF_UPDATE &&
+        !cpi->no_show_fwd_kf) {
+      cpi->show_existing_alt_ref = show_existing_alt_ref;
+    }
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) end_timing(cpi, apply_filtering_time);
+#endif
 
-    if (oxcf->enable_tpl_model && oxcf->lag_in_frames > 0 &&
-        frame_params->show_frame) {
+  // perform tpl after filtering
+  int allow_tpl = oxcf->gf_cfg.lag_in_frames > 1 &&
+                  !is_stat_generation_stage(cpi) &&
+                  oxcf->algo_cfg.enable_tpl_model;
+  if (frame_params->frame_type == KEY_FRAME) {
+    // Don't do tpl for fwd key frames
+    allow_tpl = allow_tpl && !cpi->sf.tpl_sf.disable_filtered_key_tpl &&
+                !cpi->no_show_fwd_kf;
+  } else {
+    // Do tpl after ARF is filtered, or if no ARF, at the second frame of GF
+    // group.
+    // TODO(bohanli): if no ARF, just do it at the first frame.
+    int gf_index = gf_group->index;
+    allow_tpl = allow_tpl && (gf_group->update_type[gf_index] == ARF_UPDATE ||
+                              gf_group->update_type[gf_index] == GF_UPDATE);
+    if (allow_tpl) {
+      // Need to set the size for TPL for ARF
+      // TODO(bohanli): Why is this? what part of it is necessary?
+      av1_set_frame_size(cpi, cm->superres_upscaled_width,
+                         cm->superres_upscaled_height);
+    }
+  }
+
+  if (allow_tpl == 0) {
+    // Avoid the use of unintended TPL stats from previous GOP's results.
+    if (gf_group->index == 0) av1_init_tpl_stats(&cpi->tpl_data);
+  } else {
+    if (!cpi->tpl_data.skip_tpl_setup_stats)
       av1_tpl_setup_stats(cpi, 0, frame_params, frame_input);
-    }
   }
 
   if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
@@ -938,11 +977,15 @@
   }
 
   // Set frame_input source to true source for psnr calculation.
-  if (apply_filtering) {
-    cpi->source = source_kf_buffer;
-    cpi->unscaled_source = source_kf_buffer;
+  if (apply_filtering && is_psnr_calc_enabled(cpi)) {
+    cpi->source =
+        av1_scale_if_required(cm, source_buffer, &cpi->scaled_source,
+                              cm->features.interp_filter, 0, false, true);
+    cpi->unscaled_source = source_buffer;
   }
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) end_timing(cpi, denoise_and_encode_time);
+#endif
   return AOM_CODEC_OK;
 }
 #endif  // !CONFIG_REALTIME_ONLY
@@ -996,11 +1039,20 @@
   if (gld_stack_size) {
     remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] = gld_stack[0];
 
+    // If there are more frames in the golden stack, assign them to BWDREF,
+    // ALTREF2, or LAST3.
     if (gld_stack_size > 1) {
-      if (arf_stack_size <= 1)
-        remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = gld_stack[1];
-      else
+      if (arf_stack_size <= 2) {
+        if (arf_stack_size <= 1) {
+          remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = gld_stack[1];
+          if (gld_stack_size > 2)
+            remapped_ref_idx[ALTREF2_FRAME - LAST_FRAME] = gld_stack[2];
+        } else {
+          remapped_ref_idx[ALTREF2_FRAME - LAST_FRAME] = gld_stack[1];
+        }
+      } else {
         remapped_ref_idx[LAST3_FRAME - LAST_FRAME] = gld_stack[1];
+      }
     }
   }
 
@@ -1022,10 +1074,13 @@
           find_unused_ref_frame(remapped_ref_idx, lst_stack, lst_stack_size);
     }
 
-    if (ref_map_index != INVALID_IDX)
+    if (ref_map_index != INVALID_IDX) {
       remapped_ref_idx[idx] = ref_map_index;
-    else
+    } else if (!gld_stack_size && arf_stack_size) {
+      remapped_ref_idx[idx] = ref_buffer_stack->arf_stack[0];
+    } else {
       remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0];
+    }
   }
 }
 
@@ -1034,10 +1089,11 @@
                         int64_t *const time_stamp, int64_t *const time_end,
                         const aom_rational64_t *const timestamp_ratio,
                         int flush) {
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
   GF_GROUP *gf_group = &cpi->gf_group;
   ExternalFlags *const ext_flags = &cpi->ext_flags;
+  GFConfig *const gf_cfg = &oxcf->gf_cfg;
 
   EncodeFrameInput frame_input;
   EncodeFrameParams frame_params;
@@ -1046,23 +1102,60 @@
   memset(&frame_params, 0, sizeof(frame_params));
   memset(&frame_results, 0, sizeof(frame_results));
 
-  // TODO(sarahparker) finish bit allocation for one pass pyramid
-  if (has_no_stats_stage(cpi) && oxcf->rc_mode != AOM_Q) {
-    cpi->oxcf.gf_max_pyr_height =
-        AOMMIN(cpi->oxcf.gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS);
-    cpi->oxcf.gf_min_pyr_height =
-        AOMMIN(cpi->oxcf.gf_min_pyr_height, cpi->oxcf.gf_max_pyr_height);
+  // Check if we need to stuff more src frames
+  if (flush == 0) {
+    int srcbuf_size =
+        av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
+    int pop_size = av1_lookahead_pop_sz(cpi->lookahead, cpi->compressor_stage);
+
+    // Continue buffering look ahead buffer.
+    if (srcbuf_size < pop_size) return -1;
   }
 
+  if (!av1_lookahead_peek(cpi->lookahead, 0, cpi->compressor_stage)) {
+#if !CONFIG_REALTIME_ONLY
+    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+      av1_end_first_pass(cpi); /* get last stats packet */
+      cpi->twopass.first_pass_done = 1;
+    }
+#endif
+    return -1;
+  }
+
+  // TODO(sarahparker) finish bit allocation for one pass pyramid
+  if (has_no_stats_stage(cpi)) {
+    gf_cfg->gf_max_pyr_height =
+        AOMMIN(gf_cfg->gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS);
+    gf_cfg->gf_min_pyr_height =
+        AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height);
+  }
+
+  cpi->tpl_data.skip_tpl_setup_stats = 0;
+#if !CONFIG_REALTIME_ONLY
+  const int use_one_pass_rt_params = has_no_stats_stage(cpi) &&
+                                     oxcf->mode == REALTIME &&
+                                     gf_cfg->lag_in_frames == 0;
+  if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_get_second_pass_params_time);
+#endif
+    av1_get_second_pass_params(cpi, &frame_params, &frame_input, *frame_flags);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_get_second_pass_params_time);
+#endif
+  }
+#endif
+
   if (!is_stat_generation_stage(cpi)) {
     // If this is a forward keyframe, mark as a show_existing_frame
-    if (cpi->oxcf.fwd_kf_enabled && (gf_group->index == gf_group->size) &&
-        gf_group->update_type[1] == ARF_UPDATE && cpi->rc.frames_to_key == 0) {
+    // TODO(bohanli): find a consistent condition for fwd keyframes
+    if (oxcf->kf_cfg.fwd_kf_enabled &&
+        gf_group->update_type[gf_group->index] == OVERLAY_UPDATE &&
+        cpi->rc.frames_to_key == 0) {
       frame_params.show_existing_frame = 1;
     } else {
       frame_params.show_existing_frame =
-          ((oxcf->enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames ||
-            cpi->show_existing_alt_ref) &&
+          (cpi->show_existing_alt_ref &&
            gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) ||
           gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
     }
@@ -1076,18 +1169,13 @@
     frame_params.show_existing_frame = 0;
   }
 
-  int code_arf = 0;
   struct lookahead_entry *source = NULL;
   struct lookahead_entry *last_source = NULL;
   if (frame_params.show_existing_frame) {
     source = av1_lookahead_pop(cpi->lookahead, flush, cpi->compressor_stage);
     frame_params.show_frame = 1;
   } else {
-    int show_existing_alt_ref = 0;
-    source = choose_frame_source(cpi, &code_arf, &flush, &last_source,
-                                 &frame_params, &show_existing_alt_ref);
-    if (gf_group->update_type[gf_group->index] == ARF_UPDATE)
-      cpi->show_existing_alt_ref = show_existing_alt_ref;
+    source = choose_frame_source(cpi, &flush, &last_source, &frame_params);
   }
 
   if (source == NULL) {  // If no source was found, we can't encode a frame.
@@ -1099,8 +1187,8 @@
 #endif
     return -1;
   }
-
-  frame_input.source = code_arf ? &cpi->alt_ref_buffer : &source->img;
+  // Source may be changed if temporal filtered later.
+  frame_input.source = &source->img;
   frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
   frame_input.ts_duration = source->ts_end - source->ts_start;
   // Save unfiltered source. It is used in av1_get_second_pass_params().
@@ -1108,14 +1196,13 @@
 
   *time_stamp = source->ts_start;
   *time_end = source->ts_end;
-  if (source->ts_start < cpi->time_stamps.first_ever) {
-    cpi->time_stamps.first_ever = source->ts_start;
-    cpi->time_stamps.prev_end_seen = source->ts_start;
+  if (source->ts_start < cpi->time_stamps.first_ts_start) {
+    cpi->time_stamps.first_ts_start = source->ts_start;
+    cpi->time_stamps.prev_ts_end = source->ts_start;
   }
 
   av1_apply_encoding_flags(cpi, source->flags);
-  if (!frame_params.show_existing_frame)
-    *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+  *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
   // Shown frames and arf-overlay frames need frame-rate considering
   if (frame_params.show_frame)
@@ -1138,13 +1225,18 @@
 
 #if CONFIG_REALTIME_ONLY
   av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags);
+  if (cpi->oxcf.speed >= 5 && cm->number_spatial_layers == 1 &&
+      cm->number_temporal_layers == 1)
+    av1_set_reference_structure_one_pass_rt(cpi, gf_group->index == 0);
 #else
-  if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME &&
-      oxcf->lag_in_frames == 0)
+  if (use_one_pass_rt_params) {
     av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags);
-  else if (!is_stat_generation_stage(cpi))
-    av1_get_second_pass_params(cpi, &frame_params, &frame_input, *frame_flags);
+    if (cpi->oxcf.speed >= 5 && cm->number_spatial_layers == 1 &&
+        cm->number_temporal_layers == 1)
+      av1_set_reference_structure_one_pass_rt(cpi, gf_group->index == 0);
+  }
 #endif
+
   FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group);
 
   if (frame_params.show_existing_frame &&
@@ -1159,16 +1251,16 @@
   // TODO(david.turner@argondesign.com): Change all the encode strategy to
   // modify frame_params instead of cm or cpi.
 
-  // Per-frame encode speed.  In theory this can vary, but things may have been
-  // written assuming speed-level will not change within a sequence, so this
-  // parameter should be used with caution.
+  // Per-frame encode speed.  In theory this can vary, but things may have
+  // been written assuming speed-level will not change within a sequence, so
+  // this parameter should be used with caution.
   frame_params.speed = oxcf->speed;
 
   // Work out some encoding parameters specific to the pass:
-  if (has_no_stats_stage(cpi) && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+  if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
     av1_cyclic_refresh_update_parameters(cpi);
   } else if (is_stat_generation_stage(cpi)) {
-    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf);
+    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg);
     const int kf_requested = (cm->current_frame.frame_number == 0 ||
                               (*frame_flags & FRAMEFLAGS_KEY));
     if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
@@ -1196,14 +1288,15 @@
        frame_params.frame_type == S_FRAME) &&
       !frame_params.show_existing_frame;
 
-  av1_configure_buffer_updates(cpi, &frame_params, frame_update_type,
+  av1_configure_buffer_updates(cpi, &frame_params.refresh_frame,
+                               frame_update_type, frame_params.frame_type,
                                force_refresh_all);
 
   if (!is_stat_generation_stage(cpi)) {
     const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME];
     const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME];
 
-    if (!ext_flags->refresh_frame_flags_pending) {
+    if (!ext_flags->refresh_frame.update_pending) {
       av1_get_ref_frames(cpi, &cpi->ref_buffer_stack);
     } else if (cpi->svc.external_ref_frame_config) {
       for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
@@ -1215,13 +1308,14 @@
       ref_frames[i] = get_ref_frame_buf(cm, ref_frame_priority_order[i]);
       ref_frame_buf[i] = ref_frames[i] != NULL ? &ref_frames[i]->buf : NULL;
     }
+
     // Work out which reference frame slots may be used.
     frame_params.ref_frame_flags = get_ref_frame_flags(
         &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags);
 
     frame_params.primary_ref_frame =
         choose_primary_ref_frame(cpi, &frame_params);
-    frame_params.order_offset = get_order_offset(&cpi->gf_group, &frame_params);
+    frame_params.order_offset = gf_group->arf_src_offset[gf_group->index];
 
     frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
         cpi, &frame_params, frame_update_type, &cpi->ref_buffer_stack);
@@ -1236,29 +1330,19 @@
 
   // The way frame_params->remapped_ref_idx is setup is a placeholder.
   // Currently, reference buffer assignment is done by update_ref_frame_map()
-  // which is called by high-level strategy AFTER encoding a frame.  It modifies
-  // cm->remapped_ref_idx.  If you want to use an alternative method to
-  // determine reference buffer assignment, just put your assignments into
+  // which is called by high-level strategy AFTER encoding a frame.  It
+  // modifies cm->remapped_ref_idx.  If you want to use an alternative method
+  // to determine reference buffer assignment, just put your assignments into
   // frame_params->remapped_ref_idx here and they will be used when encoding
   // this frame.  If frame_params->remapped_ref_idx is setup independently of
   // cm->remapped_ref_idx then update_ref_frame_map() will have no effect.
   memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
          REF_FRAMES * sizeof(*cm->remapped_ref_idx));
 
-  cpi->td.mb.e_mbd.delta_qindex = 0;
+  cpi->td.mb.delta_qindex = 0;
 
   if (!frame_params.show_existing_frame) {
-    cm->quant_params.using_qmatrix = cpi->oxcf.using_qm;
-#if !CONFIG_REALTIME_ONLY
-    if (oxcf->lag_in_frames > 0 && !is_stat_generation_stage(cpi)) {
-      if (cpi->gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
-        av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0);
-        av1_set_frame_size(cpi, cm->width, cm->height);
-        av1_tpl_setup_stats(cpi, 0, &frame_params, &frame_input);
-        assert(cpi->num_gf_group_show_frames == 1);
-      }
-    }
-#endif
+    cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm;
   }
 
 #if CONFIG_REALTIME_ONLY
@@ -1267,23 +1351,28 @@
     return AOM_CODEC_ERROR;
   }
 #else
-  if (denoise_and_encode(cpi, dest, &frame_input, &frame_params,
-                         &frame_results) != AOM_CODEC_OK) {
+  if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME &&
+      gf_cfg->lag_in_frames == 0) {
+    if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else if (denoise_and_encode(cpi, dest, &frame_input, &frame_params,
+                                &frame_results) != AOM_CODEC_OK) {
     return AOM_CODEC_ERROR;
   }
 #endif  // CONFIG_REALTIME_ONLY
-  if (!is_stat_generation_stage(cpi))
-    cpi->num_gf_group_show_frames += frame_params.show_frame;
 
   if (!is_stat_generation_stage(cpi)) {
     // First pass doesn't modify reference buffer assignment or produce frame
     // flags
-    update_frame_flags(cpi, frame_flags);
-    if (!ext_flags->refresh_frame_flags_pending) {
+    update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags);
+    if (!ext_flags->refresh_frame.update_pending) {
       int ref_map_index =
           av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags);
-      av1_update_ref_frame_map(cpi, frame_update_type, cm->show_existing_frame,
-                               ref_map_index, &cpi->ref_buffer_stack);
+      av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
+                               cm->show_existing_frame, ref_map_index,
+                               &cpi->ref_buffer_stack);
     }
   }
 
@@ -1297,10 +1386,18 @@
             cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
             cm->cum_txcoeff_cost_timer);
 #endif
-    av1_twopass_postencode_update(cpi);
+    if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
   }
 #endif  // !CONFIG_REALTIME_ONLY
 
+#if CONFIG_TUNE_VMAF
+  if (!is_stat_generation_stage(cpi) &&
+      (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+       oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+    av1_update_vmaf_curve(cpi);
+  }
+#endif
+
   if (!is_stat_generation_stage(cpi)) {
     update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type);
     set_additional_frame_flags(cm, frame_flags);
@@ -1312,7 +1409,7 @@
 
   // Leave a signal for a higher level caller about if this frame is droppable
   if (*size > 0) {
-    cpi->droppable = is_frame_droppable(&cpi->svc, ext_flags);
+    cpi->droppable = is_frame_droppable(&cpi->svc, &ext_flags->refresh_frame);
   }
 
   if (cpi->use_svc) av1_save_layer_context(cpi);
diff --git a/av1/encoder/encode_strategy.h b/av1/encoder/encode_strategy.h
index b05224b..351e8a1 100644
--- a/av1/encoder/encode_strategy.h
+++ b/av1/encoder/encode_strategy.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares frame encoding functions.
+ */
 #ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
 #define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
 
@@ -23,22 +26,45 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/firstpass.h"
 
-// This function will implement high-level encode strategy, choosing frame type,
-// frame placement, etc.  It populates an EncodeFrameParams struct with the
-// results of these decisions and then calls av1_encode()
+/*!\brief Implement high-level encode strategy
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function will implement high-level encode strategy, choosing frame type,
+ * frame placement, etc. It populates an EncodeFrameParams struct with the
+ * results of these decisions and then encodes the frame. The caller should use
+ * the output parameters *time_stamp and *time_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in]    cpi         Top-level encoder structure
+ * \param[in]    size        Bitstream size
+ * \param[in]    dest        Bitstream output
+ * \param[in]    frame_flags Flags to decide how to encoding the frame
+ * \param[out]   time_stamp  Time stamp of the frame
+ * \param[out]   time_end    Time end
+ * \param[in]    timestamp_ratio Time base
+ * \param[in]    flush       Decide to encode one frame or the rest of frames
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
                         uint8_t *const dest, unsigned int *frame_flags,
                         int64_t *const time_stamp, int64_t *const time_end,
                         const aom_rational64_t *const timestamp_ratio,
                         int flush);
 
+/*!\cond */
 // Set individual buffer update flags based on frame reference type.
 // force_refresh_all is used when we have a KEY_FRAME or S_FRAME.  It forces all
 // refresh_*_frame flags to be set, because we refresh all buffers in this case.
-void av1_configure_buffer_updates(AV1_COMP *const cpi,
-                                  EncodeFrameParams *const frame_params,
-                                  const FRAME_UPDATE_TYPE type,
-                                  int force_refresh_all);
+void av1_configure_buffer_updates(
+    AV1_COMP *const cpi, RefreshFrameFlagsInfo *const refresh_frame_flags,
+    const FRAME_UPDATE_TYPE type, const FRAME_TYPE frame_type,
+    int force_refresh_all);
 
 int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
                                 const EncodeFrameParams *const frame_params,
@@ -49,7 +75,8 @@
 
 void av1_update_ref_frame_map(AV1_COMP *cpi,
                               FRAME_UPDATE_TYPE frame_update_type,
-                              int show_existing_frame, int ref_map_index,
+                              FRAME_TYPE frame_type, int show_existing_frame,
+                              int ref_map_index,
                               RefBufferStack *ref_buffer_stack);
 
 void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack);
@@ -57,6 +84,7 @@
 int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
                                const int up_to_index,
                                const COMPRESSOR_STAGE compressor_stage);
+/*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 53b47d4..8707594 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -47,9 +47,9 @@
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
-#include "av1/encoder/corner_detect.h"
-#include "av1/encoder/global_motion.h"
+#include "av1/encoder/global_motion_facade.h"
 #include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encodetxb.h"
@@ -61,6 +61,7 @@
 #if !CONFIG_REALTIME_ONLY
 #include "av1/encoder/partition_model_weights.h"
 #endif
+#include "av1/encoder/partition_search.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
@@ -73,11 +74,7 @@
 #include "av1/encoder/tune_vmaf.h"
 #endif
 
-static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
-                                         TileDataEnc *tile_data, ThreadData *td,
-                                         TOKENEXTRA **t, RUN_TYPE dry_run,
-                                         BLOCK_SIZE bsize, int *rate);
-
+/*!\cond */
 // This is used as a reference when computing the source variance for the
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
@@ -146,43 +143,7 @@
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
   128 * 16, 128 * 16
 };
-
-typedef struct {
-  ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
-  ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
-  PARTITION_CONTEXT sa[MAX_MIB_SIZE];
-  PARTITION_CONTEXT sl[MAX_MIB_SIZE];
-  TXFM_CONTEXT *p_ta;
-  TXFM_CONTEXT *p_tl;
-  TXFM_CONTEXT ta[MAX_MIB_SIZE];
-  TXFM_CONTEXT tl[MAX_MIB_SIZE];
-} RD_SEARCH_MACROBLOCK_CONTEXT;
-
-enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
-
-enum {
-  SB_SINGLE_PASS,  // Single pass encoding: all ctxs get updated normally
-  SB_DRY_PASS,     // First pass of multi-pass: does not update the ctxs
-  SB_WET_PASS      // Second pass of multi-pass: finalize and update the ctx
-} UENUM1BYTE(SB_MULTI_PASS_MODE);
-
-// This struct is used to store the statistics used by sb-level multi-pass
-// encoding. Currently, this is only used to make a copy of the state before we
-// perform the first pass
-typedef struct SB_FIRST_PASS_STATS {
-  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  RD_COUNTS rd_count;
-
-  int split_count;
-  FRAME_COUNTS fc;
-  InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
-  int current_qindex;
-
-#if CONFIG_INTERNAL_STATS
-  unsigned int mode_chosen_counts[MAX_MODES];
-#endif  // CONFIG_INTERNAL_STATS
-} SB_FIRST_PASS_STATS;
+/*!\endcond */
 
 unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
                                            const struct buf_2d *ref,
@@ -238,436 +199,6 @@
     return BLOCK_8X8;
 }
 
-static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonQuantParams *quant_params = &cm->quant_params;
-  return av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex +
-                                      quant_params->y_dc_delta_q);
-}
-
-static AOM_INLINE void set_ssim_rdmult(const AV1_COMP *const cpi,
-                                       MACROBLOCK *const x,
-                                       const BLOCK_SIZE bsize, const int mi_row,
-                                       const int mi_col, int *const rdmult) {
-  const AV1_COMMON *const cm = &cpi->common;
-
-  const int bsize_base = BLOCK_16X16;
-  const int num_mi_w = mi_size_wide[bsize_base];
-  const int num_mi_h = mi_size_high[bsize_base];
-  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
-  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
-  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
-  int row, col;
-  double num_of_mi = 0.0;
-  double geom_mean_of_scale = 0.0;
-
-  assert(cpi->oxcf.tuning == AOM_TUNE_SSIM);
-
-  aom_clear_system_state();
-  for (row = mi_row / num_mi_w;
-       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
-    for (col = mi_col / num_mi_h;
-         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
-      const int index = row * num_cols + col;
-      geom_mean_of_scale += log(cpi->ssim_rdmult_scaling_factors[index]);
-      num_of_mi += 1.0;
-    }
-  }
-  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
-
-  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
-  *rdmult = AOMMAX(*rdmult, 0);
-  set_error_per_bit(x, *rdmult);
-  aom_clear_system_state();
-}
-
-static int get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                               const BLOCK_SIZE bsize, const int mi_row,
-                               const int mi_col, int orig_rdmult) {
-  const AV1_COMMON *const cm = &cpi->common;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int tpl_idx = cpi->gf_group.index;
-  const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int deltaq_rdmult = set_deltaq_rdmult(cpi, xd);
-  if (tpl_frame->is_valid == 0) return deltaq_rdmult;
-  if (!is_frame_tpl_eligible((AV1_COMP *)cpi)) return deltaq_rdmult;
-  if (tpl_idx >= MAX_LAG_BUFFERS) return deltaq_rdmult;
-  if (cpi->superres_mode != SUPERRES_NONE) return deltaq_rdmult;
-  if (cpi->oxcf.aq_mode != NO_AQ) return deltaq_rdmult;
-
-  const int bsize_base = BLOCK_16X16;
-  const int num_mi_w = mi_size_wide[bsize_base];
-  const int num_mi_h = mi_size_high[bsize_base];
-  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
-  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
-  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
-  int row, col;
-  double base_block_count = 0.0;
-  double geom_mean_of_scale = 0.0;
-  aom_clear_system_state();
-  for (row = mi_row / num_mi_w;
-       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
-    for (col = mi_col / num_mi_h;
-         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
-      const int index = row * num_cols + col;
-      geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]);
-      base_block_count += 1.0;
-    }
-  }
-  geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count);
-  int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
-  rdmult = AOMMAX(rdmult, 0);
-  set_error_per_bit(x, rdmult);
-  aom_clear_system_state();
-  if (bsize == cm->seq_params.sb_size) {
-    const int rdmult_sb = set_deltaq_rdmult(cpi, xd);
-    assert(rdmult_sb == rdmult);
-    (void)rdmult_sb;
-  }
-  return rdmult;
-}
-
-static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              int8_t segment_id) {
-  const AV1_COMMON *const cm = &cpi->common;
-  av1_init_plane_quantizers(cpi, x, segment_id);
-  aom_clear_system_state();
-  const int segment_qindex =
-      av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
-  return av1_compute_rd_mult(cpi,
-                             segment_qindex + cm->quant_params.y_dc_delta_q);
-}
-
-static AOM_INLINE void setup_block_rdmult(const AV1_COMP *const cpi,
-                                          MACROBLOCK *const x, int mi_row,
-                                          int mi_col, BLOCK_SIZE bsize,
-                                          AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
-  x->rdmult = cpi->rd.RDMULT;
-
-  if (aq_mode != NO_AQ) {
-    assert(mbmi != NULL);
-    if (aq_mode == VARIANCE_AQ) {
-      if (cpi->vaq_refresh) {
-        const int energy = bsize <= BLOCK_16X16
-                               ? x->mb_energy
-                               : av1_log_block_var(cpi, x, bsize);
-        mbmi->segment_id = energy;
-      }
-      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
-    } else if (aq_mode == COMPLEXITY_AQ) {
-      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
-    } else if (aq_mode == CYCLIC_REFRESH_AQ) {
-      // If segment is boosted, use rdmult for that segment.
-      if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
-        x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
-    }
-  }
-
-  const AV1_COMMON *const cm = &cpi->common;
-  if (cm->delta_q_info.delta_q_present_flag &&
-      !cpi->sf.rt_sf.use_nonrd_pick_mode) {
-    x->rdmult = get_hier_tpl_rdmult(cpi, x, bsize, mi_row, mi_col, x->rdmult);
-  }
-
-  if (cpi->oxcf.tuning == AOM_TUNE_SSIM) {
-    set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
-  }
-#if CONFIG_TUNE_VMAF
-  if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-      cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
-    av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
-  }
-#endif
-}
-
-static AOM_INLINE void set_offsets_without_segment_id(
-    const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x,
-    int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  assert(bsize < BLOCK_SIZES_ALL);
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-
-  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
-                        mi_row, mi_col);
-
-  set_entropy_context(xd, mi_row, mi_col, num_planes);
-  xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-
-  // Set up destination pointers.
-  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
-                       num_planes);
-
-  // Set up limit values for MV components.
-  // Mv beyond the range do not produce new/different prediction block.
-  av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
-                    mi_width, cpi->oxcf.border_in_pixels);
-
-  set_plane_n4(xd, mi_width, mi_height, num_planes);
-
-  // Set up distance of MB to edge of frame in 1/8th pel units.
-  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
-  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
-                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
-
-  // Set up source buffers.
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-
-  // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
-  xd->tile = *tile;
-}
-
-static AOM_INLINE void set_offsets(const AV1_COMP *const cpi,
-                                   const TileInfo *const tile,
-                                   MACROBLOCK *const x, int mi_row, int mi_col,
-                                   BLOCK_SIZE bsize) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const struct segmentation *const seg = &cm->seg;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
-
-  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
-
-  // Setup segment ID.
-  mbmi = xd->mi[0];
-  mbmi->segment_id = 0;
-  if (seg->enabled) {
-    if (seg->enabled && !cpi->vaq_refresh) {
-      const uint8_t *const map =
-          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
-      mbmi->segment_id =
-          map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0;
-    }
-    av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
-  }
-}
-
-static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
-                                                const MACROBLOCKD *xd,
-                                                const MB_MODE_INFO *mbmi) {
-  int dir;
-  for (dir = 0; dir < 2; ++dir) {
-    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
-    ++counts->switchable_interp[ctx][filter];
-  }
-}
-
-static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
-                                              const MB_MODE_INFO *mbmi) {
-  int dir;
-  for (dir = 0; dir < 2; ++dir) {
-    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
-    update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
-               SWITCHABLE_FILTERS);
-  }
-}
-
-static AOM_INLINE void update_global_motion_used(PREDICTION_MODE mode,
-                                                 BLOCK_SIZE bsize,
-                                                 const MB_MODE_INFO *mbmi,
-                                                 RD_COUNTS *rdc) {
-  if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) {
-    const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize];
-    int ref;
-    for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-      rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s;
-    }
-  }
-}
-
-static AOM_INLINE void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
-                                     const TX_MODE tx_mode) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  if (xd->lossless[mbmi->segment_id]) {
-    mbmi->tx_size = TX_4X4;
-  } else if (tx_mode != TX_MODE_SELECT) {
-    mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode);
-  } else {
-    BLOCK_SIZE bsize = mbmi->sb_type;
-    TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
-    mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size);
-  }
-  if (is_inter_block(mbmi)) {
-    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-  }
-  const int stride = xd->tx_type_map_stride;
-  const int bw = mi_size_wide[mbmi->sb_type];
-  for (int row = 0; row < mi_size_high[mbmi->sb_type]; ++row) {
-    memset(xd->tx_type_map + row * stride, DCT_DCT,
-           bw * sizeof(xd->tx_type_map[0]));
-  }
-  av1_zero(x->blk_skip);
-  x->force_skip = 0;
-}
-
-// This function will copy the best reference mode information from
-// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT.
-static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
-    MB_MODE_INFO_EXT *mbmi_ext,
-    const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) {
-  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
-         sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
-  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
-         sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
-  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
-  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
-  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
-         sizeof(mbmi_ext->global_mvs));
-}
-
-static AOM_INLINE void update_state(const AV1_COMP *const cpi, ThreadData *td,
-                                    const PICK_MODE_CONTEXT *const ctx,
-                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                    RUN_TYPE dry_run) {
-  int i, x_idx, y;
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int num_planes = av1_num_planes(cm);
-  RD_COUNTS *const rdc = &td->rd_counts;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
-  const MB_MODE_INFO *const mi = &ctx->mic;
-  MB_MODE_INFO *const mi_addr = xd->mi[0];
-  const struct segmentation *const seg = &cm->seg;
-  const int bw = mi_size_wide[mi->sb_type];
-  const int bh = mi_size_high[mi->sb_type];
-  const int mis = mi_params->mi_stride;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-
-  assert(mi->sb_type == bsize);
-
-  *mi_addr = *mi;
-  copy_mbmi_ext_frame_to_mbmi_ext(x->mbmi_ext, &ctx->mbmi_ext_best,
-                                  av1_ref_frame_type(ctx->mic.ref_frame));
-
-  memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-
-  x->force_skip = ctx->rd_stats.skip;
-
-  xd->tx_type_map = ctx->tx_type_map;
-  xd->tx_type_map_stride = mi_size_wide[bsize];
-  // If not dry_run, copy the transform type data into the frame level buffer.
-  // Encoder will fetch tx types when writing bitstream.
-  if (!dry_run) {
-    const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
-    uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx;
-    const int mi_stride = mi_params->mi_stride;
-    for (int blk_row = 0; blk_row < bh; ++blk_row) {
-      av1_copy_array(tx_type_map + blk_row * mi_stride,
-                     xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw);
-    }
-    xd->tx_type_map = tx_type_map;
-    xd->tx_type_map_stride = mi_stride;
-  }
-
-  // If segmentation in use
-  if (seg->enabled) {
-    // For in frame complexity AQ copy the segment id from the segment map.
-    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
-      const uint8_t *const map =
-          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
-      mi_addr->segment_id =
-          map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
-      reset_tx_size(x, mi_addr, x->tx_mode_search_type);
-    }
-    // Else for cyclic refresh mode update the segment map, set the segment id
-    // and then update the quantizer.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
-                                        ctx->rd_stats.rate, ctx->rd_stats.dist,
-                                        x->force_skip);
-    }
-    if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
-      mi_addr->uv_mode = UV_DC_PRED;
-  }
-
-  for (i = 0; i < num_planes; ++i) {
-    p[i].coeff = ctx->coeff[i];
-    p[i].qcoeff = ctx->qcoeff[i];
-    pd[i].dqcoeff = ctx->dqcoeff[i];
-    p[i].eobs = ctx->eobs[i];
-    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-  }
-  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-  // Restore the coding context of the MB to that that was in place
-  // when the mode was picked for it
-  for (y = 0; y < mi_height; y++) {
-    for (x_idx = 0; x_idx < mi_width; x_idx++) {
-      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
-          (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
-        xd->mi[x_idx + y * mis] = mi_addr;
-      }
-    }
-  }
-
-  if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
-
-  if (dry_run) return;
-
-#if CONFIG_INTERNAL_STATS
-  {
-    unsigned int *const mode_chosen_counts =
-        (unsigned int *)cpi->mode_chosen_counts;  // Cast const away.
-    if (frame_is_intra_only(cm)) {
-      static const int kf_mode_index[] = {
-        THR_DC /*DC_PRED*/,
-        THR_V_PRED /*V_PRED*/,
-        THR_H_PRED /*H_PRED*/,
-        THR_D45_PRED /*D45_PRED*/,
-        THR_D135_PRED /*D135_PRED*/,
-        THR_D113_PRED /*D113_PRED*/,
-        THR_D157_PRED /*D157_PRED*/,
-        THR_D203_PRED /*D203_PRED*/,
-        THR_D67_PRED /*D67_PRED*/,
-        THR_SMOOTH,   /*SMOOTH_PRED*/
-        THR_SMOOTH_V, /*SMOOTH_V_PRED*/
-        THR_SMOOTH_H, /*SMOOTH_H_PRED*/
-        THR_PAETH /*PAETH_PRED*/,
-      };
-      ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
-    } else {
-      // Note how often each mode chosen as best
-      ++mode_chosen_counts[ctx->best_mode_index];
-    }
-  }
-#endif
-  if (!frame_is_intra_only(cm)) {
-    if (is_inter_block(mi_addr)) {
-      // TODO(sarahparker): global motion stats need to be handled per-tile
-      // to be compatible with tile-based threading.
-      update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
-    }
-
-    if (cm->features.interp_filter == SWITCHABLE &&
-        mi_addr->motion_mode != WARPED_CAUSAL &&
-        !is_nontrans_global_motion(xd, xd->mi[0])) {
-      update_filter_type_count(td->counts, xd, mi_addr);
-    }
-
-    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
-    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
-    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-  }
-
-  const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
-  if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
-    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
-}
-
 void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col, const int num_planes,
                           BLOCK_SIZE bsize) {
@@ -685,3433 +216,23 @@
   }
 }
 
-static EdgeInfo edge_info(const struct buf_2d *ref, const BLOCK_SIZE bsize,
-                          const bool high_bd, const int bd) {
-  const int width = block_size_wide[bsize];
-  const int height = block_size_high[bsize];
-  // Implementation requires width to be a multiple of 8. It also requires
-  // height to be a multiple of 4, but this is always the case.
-  assert(height % 4 == 0);
-  if (width % 8 != 0) {
-    EdgeInfo ei = { .magnitude = 0, .x = 0, .y = 0 };
-    return ei;
-  }
-  return av1_edge_exists(ref->buf, ref->stride, width, height, high_bd, bd);
-}
-
-static int use_pb_simple_motion_pred_sse(const AV1_COMP *const cpi) {
-  // TODO(debargha, yuec): Not in use, need to implement a speed feature
-  // utilizing this data point, and replace '0' by the corresponding speed
-  // feature flag.
-  return 0 && !frame_is_intra_only(&cpi->common);
-}
-
-static void hybrid_intra_mode_search(AV1_COMP *cpi, MACROBLOCK *const x,
-                                     RD_STATS *rd_cost, BLOCK_SIZE bsize,
-                                     PICK_MODE_CONTEXT *ctx) {
-  // TODO(jianj): Investigate the failure of ScalabilityTest in AOM_Q mode,
-  // which sets base_qindex to 0 on keyframe.
-  if (cpi->oxcf.rc_mode != AOM_CBR || !cpi->sf.rt_sf.hybrid_intra_pickmode ||
-      bsize < BLOCK_16X16)
-    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
-  else
-    av1_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
-}
-
-static AOM_INLINE void pick_sb_modes(AV1_COMP *const cpi,
-                                     TileDataEnc *tile_data,
-                                     MACROBLOCK *const x, int mi_row,
-                                     int mi_col, RD_STATS *rd_cost,
-                                     PARTITION_TYPE partition, BLOCK_SIZE bsize,
-                                     PICK_MODE_CONTEXT *ctx, RD_STATS best_rd,
-                                     int pick_mode_type) {
-  if (best_rd.rdcost < 0) {
-    ctx->rd_stats.rdcost = INT64_MAX;
-    ctx->rd_stats.skip = 0;
-    av1_invalid_rd_stats(rd_cost);
-    return;
-  }
-
-  set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
-
-  if (ctx->rd_mode_is_ready) {
-    assert(ctx->mic.sb_type == bsize);
-    assert(ctx->mic.partition == partition);
-    rd_cost->rate = ctx->rd_stats.rate;
-    rd_cost->dist = ctx->rd_stats.dist;
-    rd_cost->rdcost = ctx->rd_stats.rdcost;
-    return;
-  }
-
-  AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
-  const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
-  int i;
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, rd_pick_sb_modes_time);
-#endif
-
-  aom_clear_system_state();
-
-  mbmi = xd->mi[0];
-  mbmi->sb_type = bsize;
-  mbmi->partition = partition;
-
-#if CONFIG_RD_DEBUG
-  mbmi->mi_row = mi_row;
-  mbmi->mi_col = mi_col;
-#endif
-
-  xd->tx_type_map = x->tx_type_map;
-  xd->tx_type_map_stride = mi_size_wide[bsize];
-
-  for (i = 0; i < num_planes; ++i) {
-    p[i].coeff = ctx->coeff[i];
-    p[i].qcoeff = ctx->qcoeff[i];
-    pd[i].dqcoeff = ctx->dqcoeff[i];
-    p[i].eobs = ctx->eobs[i];
-    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-  }
-
-  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-
-  ctx->skippable = 0;
-  // Set to zero to make sure we do not use the previous encoded frame stats
-  mbmi->skip = 0;
-  // Reset skip mode flag.
-  mbmi->skip_mode = 0;
-
-  if (is_cur_buf_hbd(xd)) {
-    x->source_variance = av1_high_get_sby_perpixel_variance(
-        cpi, &x->plane[0].src, bsize, xd->bd);
-  } else {
-    x->source_variance =
-        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-  }
-  if (use_pb_simple_motion_pred_sse(cpi)) {
-    const FULLPEL_MV start_mv = kZeroFullMv;
-    unsigned int var = 0;
-    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0,
-                              &x->simple_motion_pred_sse, &var);
-  }
-
-  // If the threshold for disabling wedge search is zero, it means the feature
-  // should not be used. Use a value that will always succeed in the check.
-  if (cpi->sf.inter_sf.disable_wedge_search_edge_thresh == 0) {
-    x->edge_strength = UINT16_MAX;
-    x->edge_strength_x = UINT16_MAX;
-    x->edge_strength_y = UINT16_MAX;
-  } else {
-    EdgeInfo ei =
-        edge_info(&x->plane[0].src, bsize, is_cur_buf_hbd(xd), xd->bd);
-    x->edge_strength = ei.magnitude;
-    x->edge_strength_x = ei.x;
-    x->edge_strength_y = ei.y;
-  }
-
-  // Initialize default mode evaluation params
-  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
-
-  // Save rdmult before it might be changed, so it can be restored later.
-  const int orig_rdmult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
-  // Set error per bit for current rdmult
-  set_error_per_bit(x, x->rdmult);
-  av1_rd_cost_update(x->rdmult, &best_rd);
-
-  // Find best coding mode & reconstruct the MB so it is available
-  // as a predictor for MBs that follow in the SB
-  if (frame_is_intra_only(cm)) {
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
-#endif
-    switch (pick_mode_type) {
-      case PICK_MODE_RD:
-        av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost);
-        break;
-      case PICK_MODE_NONRD:
-        hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
-        break;
-      default: assert(0 && "Unknown pick mode type.");
-    }
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
-#endif
-  } else {
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
-#endif
-    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
-                                         rd_cost, bsize, ctx, best_rd.rdcost);
-    } else {
-      // TODO(kyslov): do the same for pick_inter_mode_sb_seg_skip
-      switch (pick_mode_type) {
-        case PICK_MODE_RD:
-          av1_rd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx,
-                                    best_rd.rdcost);
-          break;
-        case PICK_MODE_NONRD:
-          av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx,
-                                       best_rd.rdcost);
-          break;
-        default: assert(0 && "Unknown pick mode type.");
-      }
-    }
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
-#endif
-  }
-
-  // Examine the resulting rate and for AQ mode 2 make a segment choice.
-  if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ &&
-      bsize >= BLOCK_16X16) {
-    av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
-  }
-
-  x->rdmult = orig_rdmult;
-
-  // TODO(jingning) The rate-distortion optimization flow needs to be
-  // refactored to provide proper exit/return handle.
-  if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
-
-  ctx->rd_stats.rate = rd_cost->rate;
-  ctx->rd_stats.dist = rd_cost->dist;
-  ctx->rd_stats.rdcost = rd_cost->rdcost;
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, rd_pick_sb_modes_time);
-#endif
-}
-
-static AOM_INLINE void update_inter_mode_stats(FRAME_CONTEXT *fc,
-                                               FRAME_COUNTS *counts,
-                                               PREDICTION_MODE mode,
-                                               int16_t mode_context) {
-  (void)counts;
-
-  int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
-  if (mode == NEWMV) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->newmv_mode[mode_ctx][0];
-#endif
-    update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
-    return;
-  }
-
-#if CONFIG_ENTROPY_STATS
-  ++counts->newmv_mode[mode_ctx][1];
-#endif
-  update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
-
-  mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
-  if (mode == GLOBALMV) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->zeromv_mode[mode_ctx][0];
-#endif
-    update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
-    return;
-  }
-
-#if CONFIG_ENTROPY_STATS
-  ++counts->zeromv_mode[mode_ctx][1];
-#endif
-  update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
-
-  mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-#if CONFIG_ENTROPY_STATS
-  ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
-#endif
-  update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
-}
-
-static AOM_INLINE void update_palette_cdf(MACROBLOCKD *xd,
-                                          const MB_MODE_INFO *const mbmi,
-                                          FRAME_COUNTS *counts) {
-  FRAME_CONTEXT *fc = xd->tile_ctx;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-
-  (void)counts;
-
-  if (mbmi->mode == DC_PRED) {
-    const int n = pmi->palette_size[0];
-    const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
-
-#if CONFIG_ENTROPY_STATS
-    ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
-#endif
-    update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
-               n > 0, 2);
-    if (n > 0) {
-#if CONFIG_ENTROPY_STATS
-      ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
-#endif
-      update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
-                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
-    }
-  }
-
-  if (mbmi->uv_mode == UV_DC_PRED) {
-    const int n = pmi->palette_size[1];
-    const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
-
-#if CONFIG_ENTROPY_STATS
-    ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
-#endif
-    update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
-
-    if (n > 0) {
-#if CONFIG_ENTROPY_STATS
-      ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
-#endif
-      update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
-                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
-    }
-  }
-}
-
-static AOM_INLINE void sum_intra_stats(const AV1_COMMON *const cm,
-                                       FRAME_COUNTS *counts, MACROBLOCKD *xd,
-                                       const MB_MODE_INFO *const mbmi,
-                                       const MB_MODE_INFO *above_mi,
-                                       const MB_MODE_INFO *left_mi,
-                                       const int intraonly) {
-  FRAME_CONTEXT *fc = xd->tile_ctx;
-  const PREDICTION_MODE y_mode = mbmi->mode;
-  (void)counts;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-
-  if (intraonly) {
-#if CONFIG_ENTROPY_STATS
-    const PREDICTION_MODE above = av1_above_block_mode(above_mi);
-    const PREDICTION_MODE left = av1_left_block_mode(left_mi);
-    const int above_ctx = intra_mode_context[above];
-    const int left_ctx = intra_mode_context[left];
-    ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
-#endif  // CONFIG_ENTROPY_STATS
-    update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
-  } else {
-#if CONFIG_ENTROPY_STATS
-    ++counts->y_mode[size_group_lookup[bsize]][y_mode];
-#endif  // CONFIG_ENTROPY_STATS
-    update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
-  }
-
-  if (av1_filter_intra_allowed(cm, mbmi)) {
-    const int use_filter_intra_mode =
-        mbmi->filter_intra_mode_info.use_filter_intra;
-#if CONFIG_ENTROPY_STATS
-    ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode];
-    if (use_filter_intra_mode) {
-      ++counts
-            ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
-    }
-#endif  // CONFIG_ENTROPY_STATS
-    update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode, 2);
-    if (use_filter_intra_mode) {
-      update_cdf(fc->filter_intra_mode_cdf,
-                 mbmi->filter_intra_mode_info.filter_intra_mode,
-                 FILTER_INTRA_MODES);
-    }
-  }
-  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->angle_delta[mbmi->mode - V_PRED]
-                         [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
-#endif
-    update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
-               mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
-               2 * MAX_ANGLE_DELTA + 1);
-  }
-
-  if (!xd->is_chroma_ref) return;
-
-  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
-  const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
-#if CONFIG_ENTROPY_STATS
-  ++counts->uv_mode[cfl_allowed][y_mode][uv_mode];
-#endif  // CONFIG_ENTROPY_STATS
-  update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
-             UV_INTRA_MODES - !cfl_allowed);
-  if (uv_mode == UV_CFL_PRED) {
-    const int8_t joint_sign = mbmi->cfl_alpha_signs;
-    const uint8_t idx = mbmi->cfl_alpha_idx;
-
-#if CONFIG_ENTROPY_STATS
-    ++counts->cfl_sign[joint_sign];
-#endif
-    update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
-    if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
-      aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
-
-#if CONFIG_ENTROPY_STATS
-      ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
-#endif
-      update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
-    }
-    if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
-      aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
-
-#if CONFIG_ENTROPY_STATS
-      ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
-#endif
-      update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
-    }
-  }
-  if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
-      av1_use_angle_delta(bsize)) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->angle_delta[uv_mode - UV_V_PRED]
-                         [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
-#endif
-    update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
-               mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
-               2 * MAX_ANGLE_DELTA + 1);
-  }
-  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
-    update_palette_cdf(xd, mbmi, counts);
-  }
-}
-
-static AOM_INLINE void update_stats(const AV1_COMMON *const cm,
-                                    ThreadData *td) {
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  const CurrentFrame *const current_frame = &cm->current_frame;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  FRAME_CONTEXT *fc = xd->tile_ctx;
-  const int seg_ref_active =
-      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
-
-  if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active &&
-      is_comp_ref_allowed(bsize)) {
-    const int skip_mode_ctx = av1_get_skip_mode_context(xd);
-#if CONFIG_ENTROPY_STATS
-    td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
-#endif
-    update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
-  }
-
-  if (!mbmi->skip_mode && !seg_ref_active) {
-    const int skip_ctx = av1_get_skip_context(xd);
-#if CONFIG_ENTROPY_STATS
-    td->counts->skip[skip_ctx][mbmi->skip]++;
-#endif
-    update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2);
-  }
-
-#if CONFIG_ENTROPY_STATS
-  // delta quant applies to both intra and inter
-  const int super_block_upper_left =
-      ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-      ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
-  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
-  if (delta_q_info->delta_q_present_flag &&
-      (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
-      super_block_upper_left) {
-    const int dq =
-        (mbmi->current_qindex - xd->current_qindex) / delta_q_info->delta_q_res;
-    const int absdq = abs(dq);
-    for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
-      td->counts->delta_q[i][1]++;
-    }
-    if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
-    if (delta_q_info->delta_lf_present_flag) {
-      if (delta_q_info->delta_lf_multi) {
-        const int frame_lf_count =
-            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-          const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
-                               delta_q_info->delta_lf_res;
-          const int abs_delta_lf = abs(delta_lf);
-          for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
-            td->counts->delta_lf_multi[lf_id][i][1]++;
-          }
-          if (abs_delta_lf < DELTA_LF_SMALL)
-            td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
-        }
-      } else {
-        const int delta_lf =
-            (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
-            delta_q_info->delta_lf_res;
-        const int abs_delta_lf = abs(delta_lf);
-        for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
-          td->counts->delta_lf[i][1]++;
-        }
-        if (abs_delta_lf < DELTA_LF_SMALL)
-          td->counts->delta_lf[abs_delta_lf][0]++;
-      }
-    }
-  }
-#endif
-
-  if (!is_inter_block(mbmi)) {
-    sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
-                    frame_is_intra_only(cm));
-  }
-
-  if (av1_allow_intrabc(cm)) {
-    update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2);
-#if CONFIG_ENTROPY_STATS
-    ++td->counts->intrabc[is_intrabc_block(mbmi)];
-#endif  // CONFIG_ENTROPY_STATS
-  }
-
-  if (frame_is_intra_only(cm) || mbmi->skip_mode) return;
-
-  FRAME_COUNTS *const counts = td->counts;
-  const int inter_block = is_inter_block(mbmi);
-
-  if (!seg_ref_active) {
-#if CONFIG_ENTROPY_STATS
-    counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
-#endif
-    update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
-               inter_block, 2);
-    // If the segment reference feature is enabled we have only a single
-    // reference frame allowed for the segment so exclude it from
-    // the reference frame counts used to work out probabilities.
-    if (inter_block) {
-      const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
-      if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
-        if (is_comp_ref_allowed(bsize)) {
-#if CONFIG_ENTROPY_STATS
-          counts->comp_inter[av1_get_reference_mode_context(xd)]
-                            [has_second_ref(mbmi)]++;
-#endif  // CONFIG_ENTROPY_STATS
-          update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2);
-        }
-      }
-
-      if (has_second_ref(mbmi)) {
-        const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
-                                                      ? UNIDIR_COMP_REFERENCE
-                                                      : BIDIR_COMP_REFERENCE;
-        update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
-                   COMP_REFERENCE_TYPES);
-#if CONFIG_ENTROPY_STATS
-        counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
-                             [comp_ref_type]++;
-#endif  // CONFIG_ENTROPY_STATS
-
-        if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
-          const int bit = (ref0 == BWDREF_FRAME);
-          update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
-#if CONFIG_ENTROPY_STATS
-          counts
-              ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (!bit) {
-            const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
-            update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
-#if CONFIG_ENTROPY_STATS
-            counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
-                                [bit1]++;
-#endif  // CONFIG_ENTROPY_STATS
-            if (bit1) {
-              update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
-                         ref1 == GOLDEN_FRAME, 2);
-#if CONFIG_ENTROPY_STATS
-              counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2]
-                                  [ref1 == GOLDEN_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-            }
-          }
-        } else {
-          const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
-          update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
-#if CONFIG_ENTROPY_STATS
-          counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (!bit) {
-            update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME,
-                       2);
-#if CONFIG_ENTROPY_STATS
-            counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
-                            [ref0 == LAST2_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          } else {
-            update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME,
-                       2);
-#if CONFIG_ENTROPY_STATS
-            counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
-                            [ref0 == GOLDEN_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          }
-          update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME,
-                     2);
-#if CONFIG_ENTROPY_STATS
-          counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
-                             [ref1 == ALTREF_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (ref1 != ALTREF_FRAME) {
-            update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
-                       ref1 == ALTREF2_FRAME, 2);
-#if CONFIG_ENTROPY_STATS
-            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
-                               [ref1 == ALTREF2_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          }
-        }
-      } else {
-        const int bit = (ref0 >= BWDREF_FRAME);
-        update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
-#if CONFIG_ENTROPY_STATS
-        counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
-#endif  // CONFIG_ENTROPY_STATS
-        if (bit) {
-          assert(ref0 <= ALTREF_FRAME);
-          update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME,
-                     2);
-#if CONFIG_ENTROPY_STATS
-          counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
-                            [ref0 == ALTREF_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (ref0 != ALTREF_FRAME) {
-            update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
-                       ref0 == ALTREF2_FRAME, 2);
-#if CONFIG_ENTROPY_STATS
-            counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
-                              [ref0 == ALTREF2_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          }
-        } else {
-          const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
-          update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
-#if CONFIG_ENTROPY_STATS
-          counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (!bit1) {
-            update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME,
-                       2);
-#if CONFIG_ENTROPY_STATS
-            counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
-                              [ref0 != LAST_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          } else {
-            update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME,
-                       2);
-#if CONFIG_ENTROPY_STATS
-            counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
-                              [ref0 != LAST3_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          }
-        }
-      }
-
-      if (cm->seq_params.enable_interintra_compound &&
-          is_interintra_allowed(mbmi)) {
-        const int bsize_group = size_group_lookup[bsize];
-        if (mbmi->ref_frame[1] == INTRA_FRAME) {
-#if CONFIG_ENTROPY_STATS
-          counts->interintra[bsize_group][1]++;
-#endif
-          update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
-#if CONFIG_ENTROPY_STATS
-          counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
-#endif
-          update_cdf(fc->interintra_mode_cdf[bsize_group],
-                     mbmi->interintra_mode, INTERINTRA_MODES);
-          if (av1_is_wedge_used(bsize)) {
-#if CONFIG_ENTROPY_STATS
-            counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
-#endif
-            update_cdf(fc->wedge_interintra_cdf[bsize],
-                       mbmi->use_wedge_interintra, 2);
-            if (mbmi->use_wedge_interintra) {
-#if CONFIG_ENTROPY_STATS
-              counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
-#endif
-              update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index,
-                         16);
-            }
-          }
-        } else {
-#if CONFIG_ENTROPY_STATS
-          counts->interintra[bsize_group][0]++;
-#endif
-          update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
-        }
-      }
-
-      const MOTION_MODE motion_allowed =
-          cm->features.switchable_motion_mode
-              ? motion_mode_allowed(xd->global_motion, xd, mbmi,
-                                    cm->features.allow_warped_motion)
-              : SIMPLE_TRANSLATION;
-      if (mbmi->ref_frame[1] != INTRA_FRAME) {
-        if (motion_allowed == WARPED_CAUSAL) {
-#if CONFIG_ENTROPY_STATS
-          counts->motion_mode[bsize][mbmi->motion_mode]++;
-#endif
-          update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
-                     MOTION_MODES);
-        } else if (motion_allowed == OBMC_CAUSAL) {
-#if CONFIG_ENTROPY_STATS
-          counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
-#endif
-          update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2);
-        }
-      }
-
-      if (has_second_ref(mbmi)) {
-        assert(current_frame->reference_mode != SINGLE_REFERENCE &&
-               is_inter_compound_mode(mbmi->mode) &&
-               mbmi->motion_mode == SIMPLE_TRANSLATION);
-
-        const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                         cm->seq_params.enable_masked_compound;
-        if (masked_compound_used) {
-          const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
-#if CONFIG_ENTROPY_STATS
-          ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
-#endif
-          update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
-                     mbmi->comp_group_idx, 2);
-        }
-
-        if (mbmi->comp_group_idx == 0) {
-          const int comp_index_ctx = get_comp_index_context(cm, xd);
-#if CONFIG_ENTROPY_STATS
-          ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
-#endif
-          update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx,
-                     2);
-        } else {
-          assert(masked_compound_used);
-          if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
-#if CONFIG_ENTROPY_STATS
-            ++counts->compound_type[bsize][mbmi->interinter_comp.type -
-                                           COMPOUND_WEDGE];
-#endif
-            update_cdf(fc->compound_type_cdf[bsize],
-                       mbmi->interinter_comp.type - COMPOUND_WEDGE,
-                       MASKED_COMPOUND_TYPES);
-          }
-        }
-      }
-      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
-        if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
-#if CONFIG_ENTROPY_STATS
-          counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
-#endif
-          update_cdf(fc->wedge_idx_cdf[bsize],
-                     mbmi->interinter_comp.wedge_index, 16);
-        }
-      }
-    }
-  }
-
-  if (inter_block && cm->features.interp_filter == SWITCHABLE &&
-      mbmi->motion_mode != WARPED_CAUSAL &&
-      !is_nontrans_global_motion(xd, mbmi)) {
-    update_filter_type_cdf(xd, mbmi);
-  }
-  if (inter_block &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    const PREDICTION_MODE mode = mbmi->mode;
-    const int16_t mode_ctx =
-        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
-    if (has_second_ref(mbmi)) {
-#if CONFIG_ENTROPY_STATS
-      ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
-#endif
-      update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
-                 INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
-    } else {
-      update_inter_mode_stats(fc, counts, mode, mode_ctx);
-    }
-
-    const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
-    if (new_mv) {
-      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-      for (int idx = 0; idx < 2; ++idx) {
-        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-          const uint8_t drl_ctx =
-              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
-          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2);
-#if CONFIG_ENTROPY_STATS
-          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
-#endif
-          if (mbmi->ref_mv_idx == idx) break;
-        }
-      }
-    }
-
-    if (have_nearmv_in_inter_mode(mbmi->mode)) {
-      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-      for (int idx = 1; idx < 3; ++idx) {
-        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-          const uint8_t drl_ctx =
-              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
-          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2);
-#if CONFIG_ENTROPY_STATS
-          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
-#endif
-          if (mbmi->ref_mv_idx == idx - 1) break;
-        }
-      }
-    }
-    if (have_newmv_in_inter_mode(mbmi->mode)) {
-      const int allow_hp = cm->features.cur_frame_force_integer_mv
-                               ? MV_SUBPEL_NONE
-                               : cm->features.allow_high_precision_mv;
-      if (new_mv) {
-        for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-          const int_mv ref_mv = av1_get_ref_mv(x, ref);
-          av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
-                              allow_hp);
-        }
-      } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) {
-        const int ref = 1;
-        const int_mv ref_mv = av1_get_ref_mv(x, ref);
-        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
-                            allow_hp);
-      } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) {
-        const int ref = 0;
-        const int_mv ref_mv = av1_get_ref_mv(x, ref);
-        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
-                            allow_hp);
-      }
-    }
-  }
-}
-
-static AOM_INLINE void restore_context(MACROBLOCK *x,
-                                       const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
-                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                       const int num_planes) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  int p;
-  const int num_4x4_blocks_wide = mi_size_wide[bsize];
-  const int num_4x4_blocks_high = mi_size_high[bsize];
-  int mi_width = mi_size_wide[bsize];
-  int mi_height = mi_size_high[bsize];
-  for (p = 0; p < num_planes; p++) {
-    int tx_col = mi_col;
-    int tx_row = mi_row & MAX_MIB_MASK;
-    memcpy(
-        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
-        ctx->a + num_4x4_blocks_wide * p,
-        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
-            xd->plane[p].subsampling_x);
-    memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
-           ctx->l + num_4x4_blocks_high * p,
-           (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
-               xd->plane[p].subsampling_y);
-  }
-  memcpy(xd->above_partition_context + mi_col, ctx->sa,
-         sizeof(*xd->above_partition_context) * mi_width);
-  memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl,
-         sizeof(xd->left_partition_context[0]) * mi_height);
-  xd->above_txfm_context = ctx->p_ta;
-  xd->left_txfm_context = ctx->p_tl;
-  memcpy(xd->above_txfm_context, ctx->ta,
-         sizeof(*xd->above_txfm_context) * mi_width);
-  memcpy(xd->left_txfm_context, ctx->tl,
-         sizeof(*xd->left_txfm_context) * mi_height);
-}
-
-static AOM_INLINE void save_context(const MACROBLOCK *x,
-                                    RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
-                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                    const int num_planes) {
-  const MACROBLOCKD *xd = &x->e_mbd;
-  int p;
-  int mi_width = mi_size_wide[bsize];
-  int mi_height = mi_size_high[bsize];
-
-  // buffer the above/left context information of the block in search.
-  for (p = 0; p < num_planes; ++p) {
-    int tx_col = mi_col;
-    int tx_row = mi_row & MAX_MIB_MASK;
-    memcpy(
-        ctx->a + mi_width * p,
-        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
-        (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x);
-    memcpy(ctx->l + mi_height * p,
-           xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
-           (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y);
-  }
-  memcpy(ctx->sa, xd->above_partition_context + mi_col,
-         sizeof(*xd->above_partition_context) * mi_width);
-  memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK),
-         sizeof(xd->left_partition_context[0]) * mi_height);
-  memcpy(ctx->ta, xd->above_txfm_context,
-         sizeof(*xd->above_txfm_context) * mi_width);
-  memcpy(ctx->tl, xd->left_txfm_context,
-         sizeof(*xd->left_txfm_context) * mi_height);
-  ctx->p_ta = xd->above_txfm_context;
-  ctx->p_tl = xd->left_txfm_context;
-}
-
-static AOM_INLINE void encode_b(const AV1_COMP *const cpi,
-                                TileDataEnc *tile_data, ThreadData *td,
-                                TOKENEXTRA **tp, int mi_row, int mi_col,
-                                RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                                PARTITION_TYPE partition,
-                                PICK_MODE_CONTEXT *const ctx, int *rate) {
-  TileInfo *const tile = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
-  const int origin_mult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  mbmi->partition = partition;
-  update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
-
-  if (!dry_run) {
-    x->mbmi_ext_frame->cb_offset = x->cb_offset;
-    assert(x->cb_offset <
-           (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
-  }
-
-  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
-
-  if (!dry_run) {
-    const AV1_COMMON *const cm = &cpi->common;
-    x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
-    if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 &&
-        cm->delta_q_info.delta_lf_present_flag) {
-      const int frame_lf_count =
-          av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-      for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
-        mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
-      mbmi->delta_lf_from_base = xd->delta_lf_from_base;
-    }
-    if (has_second_ref(mbmi)) {
-      if (mbmi->compound_idx == 0 ||
-          mbmi->interinter_comp.type == COMPOUND_AVERAGE)
-        mbmi->comp_group_idx = 0;
-      else
-        mbmi->comp_group_idx = 1;
-    }
-
-    // delta quant applies to both intra and inter
-    const int super_block_upper_left =
-        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
-    const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
-    if (delta_q_info->delta_q_present_flag &&
-        (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
-        super_block_upper_left) {
-      xd->current_qindex = mbmi->current_qindex;
-      if (delta_q_info->delta_lf_present_flag) {
-        if (delta_q_info->delta_lf_multi) {
-          const int frame_lf_count =
-              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
-          }
-        } else {
-          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
-        }
-      }
-    }
-
-    RD_COUNTS *rdc = &td->rd_counts;
-    if (mbmi->skip_mode) {
-      assert(!frame_is_intra_only(cm));
-      rdc->skip_mode_used_flag = 1;
-      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
-        assert(has_second_ref(mbmi));
-        rdc->compound_ref_used_flag = 1;
-      }
-      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    } else {
-      const int seg_ref_active =
-          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
-      if (!seg_ref_active) {
-        // If the segment reference feature is enabled we have only a single
-        // reference frame allowed for the segment so exclude it from
-        // the reference frame counts used to work out probabilities.
-        if (is_inter_block(mbmi)) {
-          av1_collect_neighbors_ref_counts(xd);
-          if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
-            if (has_second_ref(mbmi)) {
-              // This flag is also updated for 4x4 blocks
-              rdc->compound_ref_used_flag = 1;
-            }
-          }
-          set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-        }
-      }
-    }
-
-    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
-
-    // Gather obmc and warped motion count to update the probability.
-    if ((!cpi->sf.inter_sf.disable_obmc &&
-         cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) ||
-        (cm->features.allow_warped_motion &&
-         cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) {
-      const int inter_block = is_inter_block(mbmi);
-      const int seg_ref_active =
-          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
-      if (!seg_ref_active && inter_block) {
-        const MOTION_MODE motion_allowed =
-            cm->features.switchable_motion_mode
-                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
-                                      cm->features.allow_warped_motion)
-                : SIMPLE_TRANSLATION;
-
-        if (mbmi->ref_frame[1] != INTRA_FRAME) {
-          if (motion_allowed >= OBMC_CAUSAL) {
-            td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
-          }
-          if (motion_allowed == WARPED_CAUSAL) {
-            td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++;
-          }
-        }
-      }
-    }
-  }
-  // TODO(Ravi/Remya): Move this copy function to a better logical place
-  // This function will copy the best mode information from block
-  // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
-  // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
-  // bitstream preparation.
-  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, x->mbmi_ext,
-                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
-  x->rdmult = origin_mult;
-}
-
-static AOM_INLINE void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
-                                 TileDataEnc *tile_data, TOKENEXTRA **tp,
-                                 int mi_row, int mi_col, RUN_TYPE dry_run,
-                                 BLOCK_SIZE bsize, PC_TREE *pc_tree,
-                                 int *rate) {
-  assert(bsize < BLOCK_SIZES_ALL);
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  assert(bsize < BLOCK_SIZES_ALL);
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int is_partition_root = bsize >= BLOCK_8X8;
-  const int ctx = is_partition_root
-                      ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                      : -1;
-  const PARTITION_TYPE partition = pc_tree->partitioning;
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  int quarter_step = mi_size_wide[bsize] / 4;
-  int i;
-  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
-
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
-
-  if (!dry_run && ctx >= 0) {
-    const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
-    const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
-
-    if (has_rows && has_cols) {
-#if CONFIG_ENTROPY_STATS
-      td->counts->partition[ctx][partition]++;
-#endif
-
-      if (tile_data->allow_update_cdf) {
-        FRAME_CONTEXT *fc = xd->tile_ctx;
-        update_cdf(fc->partition_cdf[ctx], partition,
-                   partition_cdf_length(bsize));
-      }
-    }
-  }
-
-  switch (partition) {
-    case PARTITION_NONE:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->none, rate);
-      break;
-    case PARTITION_VERT:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->vertical[0], rate);
-      if (mi_col + hbs < mi_params->mi_cols) {
-        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
-                 partition, &pc_tree->vertical[1], rate);
-      }
-      break;
-    case PARTITION_HORZ:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->horizontal[0], rate);
-      if (mi_row + hbs < mi_params->mi_rows) {
-        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
-                 partition, &pc_tree->horizontal[1], rate);
-      }
-      break;
-    case PARTITION_SPLIT:
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
-                pc_tree->split[0], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
-                pc_tree->split[1], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
-                pc_tree->split[2], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
-                subsize, pc_tree->split[3], rate);
-      break;
-
-    case PARTITION_HORZ_A:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
-               partition, &pc_tree->horizontala[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
-               partition, &pc_tree->horizontala[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
-               partition, &pc_tree->horizontala[2], rate);
-      break;
-    case PARTITION_HORZ_B:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->horizontalb[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
-               partition, &pc_tree->horizontalb[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
-               bsize2, partition, &pc_tree->horizontalb[2], rate);
-      break;
-    case PARTITION_VERT_A:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
-               partition, &pc_tree->verticala[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
-               partition, &pc_tree->verticala[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
-               partition, &pc_tree->verticala[2], rate);
-
-      break;
-    case PARTITION_VERT_B:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->verticalb[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
-               partition, &pc_tree->verticalb[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
-               bsize2, partition, &pc_tree->verticalb[2], rate);
-      break;
-    case PARTITION_HORZ_4:
-      for (i = 0; i < 4; ++i) {
-        int this_mi_row = mi_row + i * quarter_step;
-        if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
-
-        encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
-                 partition, &pc_tree->horizontal4[i], rate);
-      }
-      break;
-    case PARTITION_VERT_4:
-      for (i = 0; i < 4; ++i) {
-        int this_mi_col = mi_col + i * quarter_step;
-        if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
-        encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
-                 partition, &pc_tree->vertical4[i], rate);
-      }
-      break;
-    default: assert(0 && "Invalid partition type."); break;
-  }
-
-  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
-}
-
-static AOM_INLINE void set_partial_sb_partition(
-    const AV1_COMMON *const cm, MB_MODE_INFO *mi, int bh_in, int bw_in,
-    int mi_rows_remaining, int mi_cols_remaining, BLOCK_SIZE bsize,
-    MB_MODE_INFO **mib) {
-  int bh = bh_in;
-  int r, c;
-  for (r = 0; r < cm->seq_params.mib_size; r += bh) {
-    int bw = bw_in;
-    for (c = 0; c < cm->seq_params.mib_size; c += bw) {
-      const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
-      const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
-      mib[grid_index] = mi + mi_index;
-      mib[grid_index]->sb_type = find_partition_size(
-          bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
-    }
-  }
-}
-
-// This function attempts to set all mode info entries in a given superblock
-// to the same block partition size.
-// However, at the bottom and right borders of the image the requested size
-// may not be allowed in which case this code attempts to choose the largest
-// allowable partition.
-static AOM_INLINE void set_fixed_partitioning(AV1_COMP *cpi,
-                                              const TileInfo *const tile,
-                                              MB_MODE_INFO **mib, int mi_row,
-                                              int mi_col, BLOCK_SIZE bsize) {
-  AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int mi_rows_remaining = tile->mi_row_end - mi_row;
-  const int mi_cols_remaining = tile->mi_col_end - mi_col;
-  MB_MODE_INFO *const mi_upper_left =
-      mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col);
-  int bh = mi_size_high[bsize];
-  int bw = mi_size_wide[bsize];
-
-  assert(bsize >= mi_params->mi_alloc_bsize &&
-         "Attempted to use bsize < mi_params->mi_alloc_bsize");
-  assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
-
-  // Apply the requested partition size to the SB if it is all "in image"
-  if ((mi_cols_remaining >= cm->seq_params.mib_size) &&
-      (mi_rows_remaining >= cm->seq_params.mib_size)) {
-    for (int block_row = 0; block_row < cm->seq_params.mib_size;
-         block_row += bh) {
-      for (int block_col = 0; block_col < cm->seq_params.mib_size;
-           block_col += bw) {
-        const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col);
-        const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col);
-        mib[grid_index] = mi_upper_left + mi_index;
-        mib[grid_index]->sb_type = bsize;
-      }
-    }
-  } else {
-    // Else this is a partial SB.
-    set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
-                             mi_cols_remaining, bsize, mib);
-  }
-}
-
-static AOM_INLINE void rd_use_partition(
-    AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib,
-    TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate,
-    int64_t *dist, int do_recon, PC_TREE *pc_tree) {
-  AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int num_planes = av1_num_planes(cm);
-  TileInfo *const tile_info = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int bs = mi_size_wide[bsize];
-  const int hbs = bs / 2;
-  int i;
-  const int pl = (bsize >= BLOCK_8X8)
-                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                     : 0;
-  const PARTITION_TYPE partition =
-      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
-                           : PARTITION_NONE;
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
-  BLOCK_SIZE sub_subsize = BLOCK_4X4;
-  int splits_below = 0;
-  BLOCK_SIZE bs_type = mib[0]->sb_type;
-  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
-
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
-
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  av1_invalid_rd_stats(&last_part_rdc);
-  av1_invalid_rd_stats(&none_rdc);
-  av1_invalid_rd_stats(&chosen_rdc);
-  av1_invalid_rd_stats(&invalid_rdc);
-
-  pc_tree->partitioning = partition;
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-    x->mb_energy = av1_log_block_var(cpi, x, bsize);
-  }
-
-  // Save rdmult before it might be changed, so it can be restored later.
-  const int orig_rdmult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
-
-  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
-      (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 2 ||
-       (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
-        cm->quant_params.base_qindex > 190 && bsize <= BLOCK_32X32 &&
-        !frame_is_intra_only(cm)))) {
-    // Check if any of the sub blocks are further split.
-    if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
-      sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
-      splits_below = 1;
-      for (i = 0; i < 4; i++) {
-        int jj = i >> 1, ii = i & 0x01;
-        MB_MODE_INFO *this_mi = mib[jj * hbs * mi_params->mi_stride + ii * hbs];
-        if (this_mi && this_mi->sb_type >= sub_subsize) {
-          splits_below = 0;
-        }
-      }
-    }
-
-    // If partition is not none try none unless each of the 4 splits are split
-    // even further..
-    if (partition != PARTITION_NONE && !splits_below &&
-        mi_row + hbs < mi_params->mi_rows &&
-        mi_col + hbs < mi_params->mi_cols) {
-      pc_tree->partitioning = PARTITION_NONE;
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                    PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD);
-
-      if (none_rdc.rate < INT_MAX) {
-        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
-      }
-
-      restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-      mib[0]->sb_type = bs_type;
-      pc_tree->partitioning = partition;
-    }
-  }
-
-  switch (partition) {
-    case PARTITION_NONE:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD);
-      break;
-    case PARTITION_HORZ:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                    invalid_rdc, PICK_MODE_RD);
-      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
-          mi_row + hbs < mi_params->mi_rows) {
-        RD_STATS tmp_rdc;
-        const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
-        av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
-                          NULL);
-        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
-                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                      invalid_rdc, PICK_MODE_RD);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
-        }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
-        last_part_rdc.rdcost += tmp_rdc.rdcost;
-      }
-      break;
-    case PARTITION_VERT:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rdc,
-                    PICK_MODE_RD);
-      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
-          mi_col + hbs < mi_params->mi_cols) {
-        RD_STATS tmp_rdc;
-        const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
-        av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
-                          NULL);
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
-                      PARTITION_VERT, subsize,
-                      &pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc,
-                      PICK_MODE_RD);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
-        }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
-        last_part_rdc.rdcost += tmp_rdc.rdcost;
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
-          none_rdc.rate < INT_MAX && none_rdc.skip == 1) {
-        av1_invalid_rd_stats(&last_part_rdc);
-        break;
-      }
-      last_part_rdc.rate = 0;
-      last_part_rdc.dist = 0;
-      last_part_rdc.rdcost = 0;
-      for (i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * hbs;
-        int y_idx = (i >> 1) * hbs;
-        int jj = i >> 1, ii = i & 0x01;
-        RD_STATS tmp_rdc;
-        if ((mi_row + y_idx >= mi_params->mi_rows) ||
-            (mi_col + x_idx >= mi_params->mi_cols))
-          continue;
-
-        av1_init_rd_stats(&tmp_rdc);
-        rd_use_partition(cpi, td, tile_data,
-                         mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
-                         mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
-                         &tmp_rdc.dist, i != 3, pc_tree->split[i]);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
-        }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
-      }
-      break;
-    case PARTITION_VERT_A:
-    case PARTITION_VERT_B:
-    case PARTITION_HORZ_A:
-    case PARTITION_HORZ_B:
-    case PARTITION_HORZ_4:
-    case PARTITION_VERT_4:
-      assert(0 && "Cannot handle extended partition types");
-    default: assert(0); break;
-  }
-
-  if (last_part_rdc.rate < INT_MAX) {
-    last_part_rdc.rate += x->partition_cost[pl][partition];
-    last_part_rdc.rdcost =
-        RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
-  }
-
-  if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
-       cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
-      partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
-      (mi_row + bs < mi_params->mi_rows ||
-       mi_row + hbs == mi_params->mi_rows) &&
-      (mi_col + bs < mi_params->mi_cols ||
-       mi_col + hbs == mi_params->mi_cols)) {
-    BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    chosen_rdc.rate = 0;
-    chosen_rdc.dist = 0;
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-    pc_tree->partitioning = PARTITION_SPLIT;
-
-    // Split partition.
-    for (i = 0; i < 4; i++) {
-      int x_idx = (i & 1) * hbs;
-      int y_idx = (i >> 1) * hbs;
-      RD_STATS tmp_rdc;
-
-      if ((mi_row + y_idx >= mi_params->mi_rows) ||
-          (mi_col + x_idx >= mi_params->mi_cols))
-        continue;
-
-      save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-      pc_tree->split[i]->partitioning = PARTITION_NONE;
-      pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
-                    PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none,
-                    invalid_rdc, PICK_MODE_RD);
-
-      restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-        av1_invalid_rd_stats(&chosen_rdc);
-        break;
-      }
-
-      chosen_rdc.rate += tmp_rdc.rate;
-      chosen_rdc.dist += tmp_rdc.dist;
-
-      if (i != 3)
-        encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
-                  OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
-
-      chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-    }
-    if (chosen_rdc.rate < INT_MAX) {
-      chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
-      chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
-    }
-  }
-
-  // If last_part is better set the partitioning to that.
-  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
-    mib[0]->sb_type = bsize;
-    if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
-    chosen_rdc = last_part_rdc;
-  }
-  // If none was better set the partitioning to that.
-  if (none_rdc.rdcost < chosen_rdc.rdcost) {
-    if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
-    chosen_rdc = none_rdc;
-  }
-
-  restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-  // We must have chosen a partitioning and encoding or we'll fail later on.
-  // No other opportunities for success.
-  if (bsize == cm->seq_params.sb_size)
-    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
-
-  if (do_recon) {
-    if (bsize == cm->seq_params.sb_size) {
-      // NOTE: To get estimate for rate due to the tokens, use:
-      // int rate_coeffs = 0;
-      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
-      //           bsize, pc_tree, &rate_coeffs);
-      x->cb_offset = 0;
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
-                pc_tree, NULL);
-    } else {
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-                pc_tree, NULL);
-    }
-  }
-
-  *rate = chosen_rdc.rate;
-  *dist = chosen_rdc.dist;
-  x->rdmult = orig_rdmult;
-}
-
-static int is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
-                                   BLOCK_SIZE bsize) {
-  const int bs = mi_size_wide[bsize];
-  const int hbs = bs / 2;
-  assert(bsize >= BLOCK_8X8);
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-
-  for (int i = 0; i < 4; i++) {
-    int x_idx = (i & 1) * hbs;
-    int y_idx = (i >> 1) * hbs;
-    if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
-        (mi_col + x_idx >= cm->mi_params.mi_cols))
-      return 0;
-    if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
-            PARTITION_NONE &&
-        subsize != BLOCK_8X8)
-      return 0;
-  }
-  return 1;
-}
-
-static AOM_INLINE int do_slipt_check(BLOCK_SIZE bsize) {
-  return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
-}
-
-static AOM_INLINE void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
-                                           TileDataEnc *tile_data,
-                                           MB_MODE_INFO **mib, TOKENEXTRA **tp,
-                                           int mi_row, int mi_col,
-                                           BLOCK_SIZE bsize, PC_TREE *pc_tree) {
-  AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  TileInfo *const tile_info = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  // Only square blocks from 8x8 to 128x128 are supported
-  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
-  const int bs = mi_size_wide[bsize];
-  const int hbs = bs / 2;
-  const PARTITION_TYPE partition =
-      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
-                           : PARTITION_NONE;
-  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  assert(subsize <= BLOCK_LARGEST);
-  const int pl = (bsize >= BLOCK_8X8)
-                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                     : 0;
-
-  RD_STATS dummy_cost;
-  av1_invalid_rd_stats(&dummy_cost);
-  RD_STATS invalid_rd;
-  av1_invalid_rd_stats(&invalid_rd);
-
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
-
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  pc_tree->partitioning = partition;
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-
-  switch (partition) {
-    case PARTITION_NONE:
-      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_slipt_check(bsize) &&
-          !frame_is_intra_only(cm)) {
-        RD_STATS split_rdc, none_rdc, block_rdc;
-        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-
-        av1_init_rd_stats(&split_rdc);
-        av1_invalid_rd_stats(&none_rdc);
-
-        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-        subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
-                      PICK_MODE_NONRD);
-        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-
-        for (int i = 0; i < 4; i++) {
-          av1_invalid_rd_stats(&block_rdc);
-          const int x_idx = (i & 1) * hbs;
-          const int y_idx = (i >> 1) * hbs;
-          if (mi_row + y_idx >= mi_params->mi_rows ||
-              mi_col + x_idx >= mi_params->mi_cols)
-            continue;
-          xd->above_txfm_context =
-              cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
-          xd->left_txfm_context =
-              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
-          pc_tree->split[i]->partitioning = PARTITION_NONE;
-          pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                        &block_rdc, PARTITION_NONE, subsize,
-                        &pc_tree->split[i]->none, invalid_rd, PICK_MODE_NONRD);
-          split_rdc.rate += block_rdc.rate;
-          split_rdc.dist += block_rdc.dist;
-
-          encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
-                   subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
-        }
-        split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
-        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-
-        if (none_rdc.rdcost < split_rdc.rdcost) {
-          mib[0]->sb_type = bsize;
-          pc_tree->partitioning = PARTITION_NONE;
-          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
-                   &pc_tree->none, NULL);
-        } else {
-          mib[0]->sb_type = subsize;
-          pc_tree->partitioning = PARTITION_SPLIT;
-          for (int i = 0; i < 4; i++) {
-            const int x_idx = (i & 1) * hbs;
-            const int y_idx = (i >> 1) * hbs;
-            if (mi_row + y_idx >= mi_params->mi_rows ||
-                mi_col + x_idx >= mi_params->mi_cols)
-              continue;
-
-            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
-          }
-        }
-
-      } else {
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
-                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
-                      PICK_MODE_NONRD);
-        encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
-                 &pc_tree->none, NULL);
-      }
-      break;
-    case PARTITION_VERT:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
-                    PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rd,
-                    PICK_MODE_NONRD);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
-               PARTITION_VERT, &pc_tree->vertical[0], NULL);
-      if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &dummy_cost,
-                      PARTITION_VERT, subsize, &pc_tree->vertical[1],
-                      invalid_rd, PICK_MODE_NONRD);
-        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
-                 PARTITION_VERT, &pc_tree->vertical[1], NULL);
-      }
-      break;
-    case PARTITION_HORZ:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
-                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                    invalid_rd, PICK_MODE_NONRD);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
-               PARTITION_HORZ, &pc_tree->horizontal[0], NULL);
-
-      if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) {
-        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &dummy_cost,
-                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                      invalid_rd, PICK_MODE_NONRD);
-        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
-                 PARTITION_HORZ, &pc_tree->horizontal[1], NULL);
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
-          is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
-          !frame_is_intra_only(cm) && bsize <= BLOCK_32X32) {
-        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-        RD_STATS split_rdc, none_rdc;
-        av1_invalid_rd_stats(&split_rdc);
-        av1_invalid_rd_stats(&none_rdc);
-        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-        xd->above_txfm_context =
-            cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-        xd->left_txfm_context =
-            xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-        pc_tree->partitioning = PARTITION_NONE;
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
-                      PICK_MODE_NONRD);
-        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-        if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode != 2 ||
-            none_rdc.skip != 1 || pc_tree->none.mic.mode == NEWMV) {
-          av1_init_rd_stats(&split_rdc);
-          for (int i = 0; i < 4; i++) {
-            RD_STATS block_rdc;
-            av1_invalid_rd_stats(&block_rdc);
-            int x_idx = (i & 1) * hbs;
-            int y_idx = (i >> 1) * hbs;
-            if ((mi_row + y_idx >= mi_params->mi_rows) ||
-                (mi_col + x_idx >= mi_params->mi_cols))
-              continue;
-            xd->above_txfm_context =
-                cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
-            xd->left_txfm_context = xd->left_txfm_context_buffer +
-                                    ((mi_row + y_idx) & MAX_MIB_MASK);
-            pc_tree->split[i]->partitioning = PARTITION_NONE;
-            pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                          &block_rdc, PARTITION_NONE, subsize,
-                          &pc_tree->split[i]->none, invalid_rd,
-                          PICK_MODE_NONRD);
-            split_rdc.rate += block_rdc.rate;
-            split_rdc.dist += block_rdc.dist;
-
-            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
-                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
-          }
-          restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-          split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
-          split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
-        }
-        if (none_rdc.rdcost < split_rdc.rdcost) {
-          mib[0]->sb_type = bsize;
-          pc_tree->partitioning = PARTITION_NONE;
-          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
-                   &pc_tree->none, NULL);
-        } else {
-          mib[0]->sb_type = subsize;
-          pc_tree->partitioning = PARTITION_SPLIT;
-          for (int i = 0; i < 4; i++) {
-            int x_idx = (i & 1) * hbs;
-            int y_idx = (i >> 1) * hbs;
-            if ((mi_row + y_idx >= mi_params->mi_rows) ||
-                (mi_col + x_idx >= mi_params->mi_cols))
-              continue;
-
-            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
-          }
-        }
-      } else {
-        for (int i = 0; i < 4; i++) {
-          int x_idx = (i & 1) * hbs;
-          int y_idx = (i >> 1) * hbs;
-          int jj = i >> 1, ii = i & 0x01;
-          if ((mi_row + y_idx >= mi_params->mi_rows) ||
-              (mi_col + x_idx >= mi_params->mi_cols))
-            continue;
-          nonrd_use_partition(cpi, td, tile_data,
-                              mib + jj * hbs * mi_params->mi_stride + ii * hbs,
-                              tp, mi_row + y_idx, mi_col + x_idx, subsize,
-                              pc_tree->split[i]);
-        }
-      }
-      break;
-    case PARTITION_VERT_A:
-    case PARTITION_VERT_B:
-    case PARTITION_HORZ_A:
-    case PARTITION_HORZ_B:
-    case PARTITION_HORZ_4:
-    case PARTITION_VERT_4:
-      assert(0 && "Cannot handle extended partition types");
-    default: assert(0); break;
-  }
-}
-
 #if !CONFIG_REALTIME_ONLY
-static const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, int frm) {
-  assert(frm >= 0);
-  if (frm < 0 ||
-      p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
-    return NULL;
-  }
-
-  return &p->stats_buf_ctx->stats_in_start[frm];
-}
-// Checks to see if a super block is on a horizontal image edge.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
-  int top_edge = 0;
-  int bottom_edge = cpi->common.mi_params.mi_rows;
-  int is_active_h_edge = 0;
-
-  // For two pass account for any formatting bars detected.
-  if (is_stat_consumption_stage_twopass(cpi)) {
-    const AV1_COMMON *const cm = &cpi->common;
-    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
-        &cpi->twopass, cm->current_frame.display_order_hint);
-    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
-
-    bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
-    bottom_edge = AOMMAX(top_edge, bottom_edge);
-  }
-
-  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
-      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
-    is_active_h_edge = 1;
-  }
-  return is_active_h_edge;
-}
-
-// Checks to see if a super block is on a vertical image edge.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
-  int left_edge = 0;
-  int right_edge = cpi->common.mi_params.mi_cols;
-  int is_active_v_edge = 0;
-
-  // For two pass account for any formatting bars detected.
-  if (is_stat_consumption_stage_twopass(cpi)) {
-    const AV1_COMMON *const cm = &cpi->common;
-    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
-        &cpi->twopass, cm->current_frame.display_order_hint);
-    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    left_edge += (int)(this_frame_stats->inactive_zone_cols * 4);
-
-    right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4);
-    right_edge = AOMMAX(left_edge, right_edge);
-  }
-
-  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
-      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
-    is_active_v_edge = 1;
-  }
-  return is_active_v_edge;
-}
-#endif  // !CONFIG_REALTIME_ONLY
-
-static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
-  memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
-}
-
-static INLINE void load_pred_mv(MACROBLOCK *x,
-                                const PICK_MODE_CONTEXT *const ctx) {
-  memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
-}
-
-#if !CONFIG_REALTIME_ONLY
-// Try searching for an encoding for the given subblock. Returns zero if the
-// rdcost is already too high (to tell the caller not to bother searching for
-// encodings of further subblocks)
-static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
-                           TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last,
-                           int mi_row, int mi_col, BLOCK_SIZE subsize,
-                           RD_STATS best_rdcost, RD_STATS *sum_rdc,
-                           PARTITION_TYPE partition,
-                           PICK_MODE_CONTEXT *prev_ctx,
-                           PICK_MODE_CONTEXT *this_ctx) {
-  MACROBLOCK *const x = &td->mb;
-  const int orig_mult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL);
-
-  av1_rd_cost_update(x->rdmult, &best_rdcost);
-  if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
-
-  RD_STATS rdcost_remaining;
-  av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining);
-  RD_STATS this_rdc;
-  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition,
-                subsize, this_ctx, rdcost_remaining, PICK_MODE_RD);
-
-  if (this_rdc.rate == INT_MAX) {
-    sum_rdc->rdcost = INT64_MAX;
-  } else {
-    sum_rdc->rate += this_rdc.rate;
-    sum_rdc->dist += this_rdc.dist;
-    av1_rd_cost_update(x->rdmult, sum_rdc);
-  }
-
-  if (sum_rdc->rdcost >= best_rdcost.rdcost) {
-    x->rdmult = orig_mult;
-    return 0;
-  }
-
-  if (!is_last) {
-    update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
-    encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
-  }
-
-  x->rdmult = orig_mult;
-  return 1;
-}
-
-static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
-                               TileDataEnc *tile_data, TOKENEXTRA **tp,
-                               PC_TREE *pc_tree, RD_STATS *best_rdc,
-                               PICK_MODE_CONTEXT ctxs[3],
-                               PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
-                               BLOCK_SIZE bsize, PARTITION_TYPE partition,
-                               int mi_row0, int mi_col0, BLOCK_SIZE subsize0,
-                               int mi_row1, int mi_col1, BLOCK_SIZE subsize1,
-                               int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
-  const MACROBLOCK *const x = &td->mb;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-  RD_STATS sum_rdc;
-  av1_init_rd_stats(&sum_rdc);
-  sum_rdc.rate = x->partition_cost[pl][partition];
-  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-  if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0,
-                       *best_rdc, &sum_rdc, partition, ctx, &ctxs[0]))
-    return false;
-
-  if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1,
-                       *best_rdc, &sum_rdc, partition, &ctxs[0], &ctxs[1]))
-    return false;
-
-  if (!rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
-                       *best_rdc, &sum_rdc, partition, &ctxs[1], &ctxs[2]))
-    return false;
-
-  av1_rd_cost_update(x->rdmult, &sum_rdc);
-  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
-  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
-
-  *best_rdc = sum_rdc;
-  pc_tree->partitioning = partition;
-  return true;
-}
-
-static AOM_INLINE void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
-  pc_tree->partitioning = PARTITION_NONE;
-  pc_tree->none.rd_stats.skip = 0;
-
-  if (bsize >= BLOCK_8X8) {
-    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    for (int idx = 0; idx < 4; ++idx)
-      reset_partition(pc_tree->split[idx], subsize);
-  }
-}
-
-// Record the ref frames that have been selected by square partition blocks.
-static AOM_INLINE void update_picked_ref_frames_mask(MACROBLOCK *const x,
-                                                     int ref_type,
-                                                     BLOCK_SIZE bsize,
-                                                     int mib_size, int mi_row,
-                                                     int mi_col) {
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-  const int sb_size_mask = mib_size - 1;
-  const int mi_row_in_sb = mi_row & sb_size_mask;
-  const int mi_col_in_sb = mi_col & sb_size_mask;
-  const int mi_size = mi_size_wide[bsize];
-  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
-    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
-      x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
-    }
-  }
-}
-
-// Structure to keep win flags for HORZ and VERT partition evaluations
-typedef struct {
-  bool horz_win;
-  bool vert_win;
-} RD_RECT_PART_WIN_INFO;
-
-// Decide whether to evaluate the AB partition specified by part_type based on
-// split and HORZ/VERT info
-int evaluate_ab_partition_based_on_split(
-    PC_TREE *pc_tree, PARTITION_TYPE rect_part,
-    RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
-    int split_idx2) {
-  int num_win = 0;
-  // Threshold for number of winners
-  // Conservative pruning for high quantizers
-  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
-  bool sub_part_win = (rect_part_win_info == NULL)
-                          ? (pc_tree->partitioning == rect_part)
-                          : (rect_part == PARTITION_HORZ)
-                                ? rect_part_win_info->horz_win
-                                : rect_part_win_info->vert_win;
-  num_win += (sub_part_win) ? 1 : 0;
-  num_win +=
-      (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
-  num_win +=
-      (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
-  if (num_win < num_win_thresh) {
-    return 0;
-  }
-  return 1;
-}
-
-// Searches for the best partition pattern for a block based on the
-// rate-distortion cost, and returns a bool value to indicate whether a valid
-// partition pattern is found. The partition can recursively go down to
-// the smallest block size.
-//
-// Inputs:
-//     cpi: the global compressor setting
-//     td: thread data
-//     tile_data: tile data
-//     tp: the pointer to the start token
-//     mi_row: row coordinate of the block in a step size of MI_SIZE
-//     mi_col: column coordinate of the block in a step size of MI_SIZE
-//     bsize: block size
-//     max_sq_part: the largest square block size for prediction blocks
-//     min_sq_part: the smallest square block size for prediction blocks
-//     rd_cost: the pointer to the final rd cost of the current block
-//     best_rdc: the upper bound of rd cost for a valid partition
-//     pc_tree: the pointer to the PC_TREE node storing the picked partitions
-//              and mode info for the current block
-//     none_rd: the pointer to the rd cost in the case of not splitting the
-//              current block
-//     multi_pass_mode: SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
-//     rect_part_win_info: the pointer to a struct storing whether horz/vert
-//                         partition outperforms previously tested partitions
-//
-// Output:
-//     a bool value indicating whether a valid partition is found
-static bool rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
-                              TileDataEnc *tile_data, TOKENEXTRA **tp,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize,
-                              BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part,
-                              RD_STATS *rd_cost, RD_STATS best_rdc,
-                              PC_TREE *pc_tree, int64_t *none_rd,
-                              SB_MULTI_PASS_MODE multi_pass_mode,
-                              RD_RECT_PART_WIN_INFO *rect_part_win_info) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int num_planes = av1_num_planes(cm);
-  TileInfo *const tile_info = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int mi_step = mi_size_wide[bsize] / 2;
-  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  const TOKENEXTRA *const tp_orig = *tp;
-  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
-  int tmp_partition_cost[PARTITION_TYPES];
-  BLOCK_SIZE subsize;
-  RD_STATS this_rdc, sum_rdc;
-  const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
-  int do_square_split = bsize_at_least_8x8;
-  const int pl = bsize_at_least_8x8
-                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                     : 0;
-  const int *partition_cost = x->partition_cost[pl];
-
-  int do_rectangular_split = cpi->oxcf.enable_rect_partitions;
-  int64_t cur_none_rd = 0;
-  int64_t split_rd[4] = { 0, 0, 0, 0 };
-  int64_t horz_rd[2] = { 0, 0 };
-  int64_t vert_rd[2] = { 0, 0 };
-  int prune_horz = 0;
-  int prune_vert = 0;
-  int terminate_partition_search = 0;
-
-  int split_ctx_is_ready[2] = { 0, 0 };
-  int horz_ctx_is_ready = 0;
-  int vert_ctx_is_ready = 0;
-  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
-  // Initialise HORZ and VERT win flags as true for all split partitions
-  RD_RECT_PART_WIN_INFO split_part_rect_win[4] = {
-    { true, true }, { true, true }, { true, true }, { true, true }
-  };
-
-  bool found_best_partition = false;
-  if (best_rdc.rdcost < 0) {
-    av1_invalid_rd_stats(rd_cost);
-    return found_best_partition;
-  }
-
-  if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
-    x->quad_tree_idx = 0;
-    x->cnn_output_valid = 0;
-  }
-
-  if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
-
-  // Override skipping rectangular partition operations for edge blocks
-  const int has_rows = (mi_row + mi_step < mi_params->mi_rows);
-  const int has_cols = (mi_col + mi_step < mi_params->mi_cols);
-  const int xss = x->e_mbd.plane[1].subsampling_x;
-  const int yss = x->e_mbd.plane[1].subsampling_y;
-
-  if (none_rd) *none_rd = 0;
-  int partition_none_allowed = has_rows && has_cols;
-  int partition_horz_allowed =
-      has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss,
-                           yss) != BLOCK_INVALID;
-  int partition_vert_allowed =
-      has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss,
-                           yss) != BLOCK_INVALID;
-
-  (void)*tp_orig;
-
-#if CONFIG_COLLECT_PARTITION_STATS
-  int partition_decisions[EXT_PARTITION_TYPES] = { 0 };
-  int partition_attempts[EXT_PARTITION_TYPES] = { 0 };
-  int64_t partition_times[EXT_PARTITION_TYPES] = { 0 };
-  struct aom_usec_timer partition_timer = { 0 };
-  int partition_timer_on = 0;
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-  PartitionStats *part_stats = &cpi->partition_stats;
-#endif
-#endif
-
-  // Override partition costs at the edges of the frame in the same
-  // way as in read_partition (see decodeframe.c)
-  if (!(has_rows && has_cols)) {
-    assert(bsize_at_least_8x8 && pl >= 0);
-    const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
-    const int max_cost = av1_cost_symbol(0);
-    for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = max_cost;
-    if (has_cols) {
-      // At the bottom, the two possibilities are HORZ and SPLIT
-      aom_cdf_prob bot_cdf[2];
-      partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
-      static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
-      av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
-    } else if (has_rows) {
-      // At the right, the two possibilities are VERT and SPLIT
-      aom_cdf_prob rhs_cdf[2];
-      partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
-      static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
-      av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
-    } else {
-      // At the bottom right, we always split
-      tmp_partition_cost[PARTITION_SPLIT] = 0;
-    }
-
-    partition_cost = tmp_partition_cost;
-  }
-
-#ifndef NDEBUG
-  // Nothing should rely on the default value of this array (which is just
-  // leftover from encoding the previous block. Setting it to fixed pattern
-  // when debugging.
-  // bit 0, 1, 2 are blk_skip of each plane
-  // bit 4, 5, 6 are initialization checking of each plane
-  memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
-#endif  // NDEBUG
-
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  av1_init_rd_stats(&this_rdc);
-
-  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-
-  // Save rdmult before it might be changed, so it can be restored later.
-  const int orig_rdmult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
-
-  av1_rd_cost_update(x->rdmult, &best_rdc);
-
-  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
-    x->mb_energy = av1_log_block_var(cpi, x, bsize);
-
-  if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
-    partition_horz_allowed &= !has_rows;
-    partition_vert_allowed &= !has_cols;
-  }
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-  const int try_intra_cnn_split =
-      !cpi->is_screen_content_type && frame_is_intra_only(cm) &&
-      cpi->sf.part_sf.intra_cnn_split &&
-      cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
-      bsize >= BLOCK_8X8 &&
-      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
-      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
-
-  if (try_intra_cnn_split) {
-    av1_intra_mode_cnn_partition(
-        &cpi->common, x, bsize, x->quad_tree_idx, &partition_none_allowed,
-        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
-        &do_square_split);
-  }
-
-  // Use simple_motion_search to prune partitions. This must be done prior to
-  // PARTITION_SPLIT to propagate the initial mvs to a smaller blocksize.
-  const int try_split_only =
-      !cpi->is_screen_content_type &&
-      cpi->sf.part_sf.simple_motion_search_split && do_square_split &&
-      bsize >= BLOCK_8X8 &&
-      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
-      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols &&
-      !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
-
-  if (try_split_only) {
-    av1_simple_motion_search_based_split(
-        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed,
-        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
-        &do_square_split);
-  }
-
-  const int try_prune_rect =
-      !cpi->is_screen_content_type &&
-      cpi->sf.part_sf.simple_motion_search_prune_rect &&
-      !frame_is_intra_only(cm) && do_rectangular_split &&
-      (do_square_split || partition_none_allowed ||
-       (prune_horz && prune_vert)) &&
-      (partition_horz_allowed || partition_vert_allowed) && bsize >= BLOCK_8X8;
-
-  if (try_prune_rect) {
-    av1_simple_motion_search_prune_rect(
-        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_horz_allowed,
-        &partition_vert_allowed, &prune_horz, &prune_vert);
-  }
-
-  // Max and min square partition levels are defined as the partition nodes that
-  // the recursive function rd_pick_partition() can reach. To implement this:
-  // only PARTITION_NONE is allowed if the current node equals min_sq_part,
-  // only PARTITION_SPLIT is allowed if the current node exceeds max_sq_part.
-  assert(block_size_wide[min_sq_part] == block_size_high[min_sq_part]);
-  assert(block_size_wide[max_sq_part] == block_size_high[max_sq_part]);
-  assert(min_sq_part <= max_sq_part);
-  assert(block_size_wide[bsize] == block_size_high[bsize]);
-  const int max_partition_size = block_size_wide[max_sq_part];
-  const int min_partition_size = block_size_wide[min_sq_part];
-  const int blksize = block_size_wide[bsize];
-  assert(min_partition_size <= max_partition_size);
-  const int is_le_min_sq_part = blksize <= min_partition_size;
-  const int is_gt_max_sq_part = blksize > max_partition_size;
-  if (is_gt_max_sq_part) {
-    // If current block size is larger than max, only allow split.
-    partition_none_allowed = 0;
-    partition_horz_allowed = 0;
-    partition_vert_allowed = 0;
-    do_square_split = 1;
-  } else if (is_le_min_sq_part) {
-    // If current block size is less or equal to min, only allow none if valid
-    // block large enough; only allow split otherwise.
-    partition_horz_allowed = 0;
-    partition_vert_allowed = 0;
-    // only disable square split when current block is not at the picture
-    // boundary. otherwise, inherit the square split flag from previous logic
-    if (has_rows && has_cols) do_square_split = 0;
-    partition_none_allowed = !do_square_split;
-  }
-
-BEGIN_PARTITION_SEARCH:
-  if (x->must_find_valid_partition) {
-    do_square_split = bsize_at_least_8x8 && (blksize > min_partition_size);
-    partition_none_allowed =
-        has_rows && has_cols && (blksize >= min_partition_size);
-    partition_horz_allowed =
-        has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
-        (blksize > min_partition_size) &&
-        get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss,
-                             yss) != BLOCK_INVALID;
-    partition_vert_allowed =
-        has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
-        (blksize > min_partition_size) &&
-        get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss,
-                             yss) != BLOCK_INVALID;
-    terminate_partition_search = 0;
-  }
-
-  // Partition block source pixel variance.
-  unsigned int pb_source_variance = UINT_MAX;
-
-  // Partition block sse after simple motion compensation, not in use now,
-  // but will be used for upcoming speed features
-  unsigned int pb_simple_motion_pred_sse = UINT_MAX;
-  (void)pb_simple_motion_pred_sse;
-
-  // PARTITION_NONE
-  if (is_le_min_sq_part && has_rows && has_cols) partition_none_allowed = 1;
-  assert(terminate_partition_search == 0);
-  int64_t part_none_rd = INT64_MAX;
-  if (cpi->is_screen_content_type)
-    partition_none_allowed = has_rows && has_cols;
-  if (partition_none_allowed && !is_gt_max_sq_part) {
-    int pt_cost = 0;
-    if (bsize_at_least_8x8) {
-      pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
-                    ? partition_cost[PARTITION_NONE]
-                    : 0;
-    }
-    RD_STATS partition_rdcost;
-    av1_init_rd_stats(&partition_rdcost);
-    partition_rdcost.rate = pt_cost;
-    av1_rd_cost_update(x->rdmult, &partition_rdcost);
-    RD_STATS best_remain_rdcost;
-    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &partition_rdcost,
-                             &best_remain_rdcost);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_remain_rdcost >= 0) {
-      partition_attempts[PARTITION_NONE] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
-                  bsize, ctx_none, best_remain_rdcost, PICK_MODE_RD);
-    av1_rd_cost_update(x->rdmult, &this_rdc);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_NONE] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    pb_source_variance = x->source_variance;
-    pb_simple_motion_pred_sse = x->simple_motion_pred_sse;
-    if (none_rd) *none_rd = this_rdc.rdcost;
-    cur_none_rd = this_rdc.rdcost;
-    if (this_rdc.rate != INT_MAX) {
-      if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
-        const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
-        update_picked_ref_frames_mask(x, ref_type, bsize,
-                                      cm->seq_params.mib_size, mi_row, mi_col);
-      }
-      if (bsize_at_least_8x8) {
-        this_rdc.rate += pt_cost;
-        this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-      }
-
-      part_none_rd = this_rdc.rdcost;
-      if (this_rdc.rdcost < best_rdc.rdcost) {
-        // Adjust dist breakout threshold according to the partition size.
-        const int64_t dist_breakout_thr =
-            cpi->sf.part_sf.partition_search_breakout_dist_thr >>
-            ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
-             (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
-        const int rate_breakout_thr =
-            cpi->sf.part_sf.partition_search_breakout_rate_thr *
-            num_pels_log2_lookup[bsize];
-
-        best_rdc = this_rdc;
-        found_best_partition = true;
-        if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
-
-        if (!frame_is_intra_only(cm) &&
-            (do_square_split || do_rectangular_split) &&
-            !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
-          const int use_ml_based_breakout =
-              bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
-              bsize > BLOCK_4X4 && xd->bd == 8;
-          if (use_ml_based_breakout) {
-            if (av1_ml_predict_breakout(cpi, bsize, x, &this_rdc,
-                                        pb_source_variance)) {
-              do_square_split = 0;
-              do_rectangular_split = 0;
-            }
-          }
-
-          // If all y, u, v transform blocks in this partition are skippable,
-          // and the dist & rate are within the thresholds, the partition
-          // search is terminated for current branch of the partition search
-          // tree. The dist & rate thresholds are set to 0 at speed 0 to
-          // disable the early termination at that speed.
-          if (best_rdc.dist < dist_breakout_thr &&
-              best_rdc.rate < rate_breakout_thr) {
-            do_square_split = 0;
-            do_rectangular_split = 0;
-          }
-        }
-
-        if (cpi->sf.part_sf.simple_motion_search_early_term_none &&
-            cm->show_frame && !frame_is_intra_only(cm) &&
-            bsize >= BLOCK_16X16 && mi_row + mi_step < mi_params->mi_rows &&
-            mi_col + mi_step < mi_params->mi_cols &&
-            this_rdc.rdcost < INT64_MAX && this_rdc.rdcost >= 0 &&
-            this_rdc.rate < INT_MAX && this_rdc.rate >= 0 &&
-            (do_square_split || do_rectangular_split)) {
-          av1_simple_motion_search_early_term_none(cpi, x, pc_tree, mi_row,
-                                                   mi_col, bsize, &this_rdc,
-                                                   &terminate_partition_search);
-        }
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // store estimated motion vector
-  if (cpi->sf.mv_sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
-
-  // PARTITION_SPLIT
-  int64_t part_split_rd = INT64_MAX;
-  if ((!terminate_partition_search && do_square_split) || is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    sum_rdc.rate = partition_cost[PARTITION_SPLIT];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-
-    int idx;
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
-      partition_attempts[PARTITION_SPLIT] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
-      const int x_idx = (idx & 1) * mi_step;
-      const int y_idx = (idx >> 1) * mi_step;
-
-      if (mi_row + y_idx >= mi_params->mi_rows ||
-          mi_col + x_idx >= mi_params->mi_cols)
-        continue;
-
-      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-
-      pc_tree->split[idx]->index = idx;
-      int64_t *p_split_rd = &split_rd[idx];
-
-      RD_STATS best_remain_rdcost;
-      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                               &best_remain_rdcost);
-
-      int curr_quad_tree_idx = 0;
-      if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
-        curr_quad_tree_idx = x->quad_tree_idx;
-        x->quad_tree_idx = 4 * curr_quad_tree_idx + idx + 1;
-      }
-      if (!rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
-                             mi_col + x_idx, subsize, max_sq_part, min_sq_part,
-                             &this_rdc, best_remain_rdcost, pc_tree->split[idx],
-                             p_split_rd, multi_pass_mode,
-                             &split_part_rect_win[idx])) {
-        av1_invalid_rd_stats(&sum_rdc);
-        break;
-      }
-      if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
-        x->quad_tree_idx = curr_quad_tree_idx;
-      }
-
-      sum_rdc.rate += this_rdc.rate;
-      sum_rdc.dist += this_rdc.dist;
-      av1_rd_cost_update(x->rdmult, &sum_rdc);
-      if (idx <= 1 && (bsize <= BLOCK_8X8 ||
-                       pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
-        const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
-        const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-        // Neither palette mode nor cfl predicted
-        if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
-          if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
-        }
-      }
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_SPLIT] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    const int reached_last_index = (idx == 4);
-
-    part_split_rd = sum_rdc.rdcost;
-    if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        found_best_partition = true;
-        pc_tree->partitioning = PARTITION_SPLIT;
-      }
-    } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
-      // Skip rectangular partition test when partition type none gives better
-      // rd than partition type split.
-      if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
-        const int partition_none_valid = cur_none_rd > 0;
-        const int partition_none_better = cur_none_rd < sum_rdc.rdcost;
-        do_rectangular_split &=
-            !(partition_none_valid && partition_none_better);
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }  // if (do_split)
-
-  if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
-      !frame_is_intra_only(cm) && !terminate_partition_search &&
-      do_rectangular_split &&
-      (partition_horz_allowed || partition_vert_allowed)) {
-    av1_ml_early_term_after_split(cpi, x, pc_tree, bsize, best_rdc.rdcost,
-                                  part_none_rd, part_split_rd, split_rd, mi_row,
-                                  mi_col, &terminate_partition_search);
-  }
-
-  if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
-      cpi->sf.part_sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
-      (partition_horz_allowed || partition_vert_allowed) &&
-      !(prune_horz || prune_vert) && !terminate_partition_search) {
-    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-    av1_ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
-                                split_rd, &prune_horz, &prune_vert);
-  }
-
-  // PARTITION_HORZ
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz_allowed));
-  if (!terminate_partition_search && partition_horz_allowed && !prune_horz &&
-      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
-      !is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    subsize = get_partition_subsize(bsize, PARTITION_HORZ);
-    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-    sum_rdc.rate = partition_cost[PARTITION_HORZ];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-    RD_STATS best_remain_rdcost;
-    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                             &best_remain_rdcost);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_remain_rdcost >= 0) {
-      partition_attempts[PARTITION_HORZ] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ,
-                  subsize, &pc_tree->horizontal[0], best_remain_rdcost,
-                  PICK_MODE_RD);
-    av1_rd_cost_update(x->rdmult, &this_rdc);
-
-    if (this_rdc.rate == INT_MAX) {
-      sum_rdc.rdcost = INT64_MAX;
-    } else {
-      sum_rdc.rate += this_rdc.rate;
-      sum_rdc.dist += this_rdc.dist;
-      av1_rd_cost_update(x->rdmult, &sum_rdc);
-    }
-    horz_rd[0] = this_rdc.rdcost;
-
-    if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
-      const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
-      const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
-      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-      // Neither palette mode nor cfl predicted
-      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
-        if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
-      }
-      update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
-      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
-
-      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
-
-      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                               &best_remain_rdcost);
-
-      pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
-                    PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                    best_remain_rdcost, PICK_MODE_RD);
-      av1_rd_cost_update(x->rdmult, &this_rdc);
-      horz_rd[1] = this_rdc.rdcost;
-
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        av1_rd_cost_update(x->rdmult, &sum_rdc);
-      }
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_HORZ] += time;
-      partition_timer_on = 0;
-    }
-#endif
-
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        found_best_partition = true;
-        pc_tree->partitioning = PARTITION_HORZ;
-      }
-    } else {
-      // Update HORZ win flag
-      if (rect_part_win_info != NULL) {
-        rect_part_win_info->horz_win = false;
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // PARTITION_VERT
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert_allowed));
-  if (!terminate_partition_search && partition_vert_allowed && !prune_vert &&
-      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) &&
-      !is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    subsize = get_partition_subsize(bsize, PARTITION_VERT);
-
-    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-
-    sum_rdc.rate = partition_cost[PARTITION_VERT];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-    RD_STATS best_remain_rdcost;
-    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                             &best_remain_rdcost);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_remain_rdcost >= 0) {
-      partition_attempts[PARTITION_VERT] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT,
-                  subsize, &pc_tree->vertical[0], best_remain_rdcost,
-                  PICK_MODE_RD);
-    av1_rd_cost_update(x->rdmult, &this_rdc);
-
-    if (this_rdc.rate == INT_MAX) {
-      sum_rdc.rdcost = INT64_MAX;
-    } else {
-      sum_rdc.rate += this_rdc.rate;
-      sum_rdc.dist += this_rdc.dist;
-      av1_rd_cost_update(x->rdmult, &sum_rdc);
-    }
-    vert_rd[0] = this_rdc.rdcost;
-    if (sum_rdc.rdcost < best_rdc.rdcost && has_cols) {
-      const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
-      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-      // Neither palette mode nor cfl predicted
-      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
-        if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
-      }
-      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
-      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
-
-      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-
-      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                               &best_remain_rdcost);
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
-                    PARTITION_VERT, subsize, &pc_tree->vertical[1],
-                    best_remain_rdcost, PICK_MODE_RD);
-      av1_rd_cost_update(x->rdmult, &this_rdc);
-      vert_rd[1] = this_rdc.rdcost;
-
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        av1_rd_cost_update(x->rdmult, &sum_rdc);
-      }
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_VERT] += time;
-      partition_timer_on = 0;
-    }
-#endif
-
-    av1_rd_cost_update(x->rdmult, &sum_rdc);
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      best_rdc = sum_rdc;
-      found_best_partition = true;
-      pc_tree->partitioning = PARTITION_VERT;
-    } else {
-      // Update VERT win flag
-      if (rect_part_win_info != NULL) {
-        rect_part_win_info->vert_win = false;
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (pb_source_variance == UINT_MAX) {
-    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-    if (is_cur_buf_hbd(xd)) {
-      pb_source_variance = av1_high_get_sby_perpixel_variance(
-          cpi, &x->plane[0].src, bsize, xd->bd);
-    } else {
-      pb_source_variance =
-          av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-    }
-  }
-
-  if (use_pb_simple_motion_pred_sse(cpi) &&
-      pb_simple_motion_pred_sse == UINT_MAX) {
-    const FULLPEL_MV start_mv = kZeroFullMv;
-    unsigned int var = 0;
-
-    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0,
-                              &pb_simple_motion_pred_sse, &var);
-  }
-
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !do_rectangular_split));
-
-  const int ext_partition_allowed =
-      do_rectangular_split &&
-      bsize > cpi->sf.part_sf.ext_partition_eval_thresh && has_rows && has_cols;
-
-  // The standard AB partitions are allowed whenever ext-partition-types are
-  // allowed
-  int horzab_partition_allowed =
-      ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
-  int vertab_partition_allowed =
-      ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
-
-  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-    if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) {
-      // TODO(debargha,huisu@google.com): may need to tune the threshold for
-      // pb_source_variance.
-      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
-                                   (pc_tree->partitioning == PARTITION_NONE &&
-                                    pb_source_variance < 32) ||
-                                   pc_tree->partitioning == PARTITION_SPLIT);
-      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
-                                   (pc_tree->partitioning == PARTITION_NONE &&
-                                    pb_source_variance < 32) ||
-                                   pc_tree->partitioning == PARTITION_SPLIT);
-    } else {
-      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
-                                   pc_tree->partitioning == PARTITION_SPLIT);
-      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
-                                   pc_tree->partitioning == PARTITION_SPLIT);
-    }
-    horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
-    horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
-    vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
-    vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
-    split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
-    split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
-    split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
-    split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
-  }
-  int horza_partition_allowed = horzab_partition_allowed;
-  int horzb_partition_allowed = horzab_partition_allowed;
-  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-    const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
-    const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
-    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-      case 1:
-        horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost);
-        horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost);
-        break;
-      case 2:
-      default:
-        horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost);
-        horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost);
-        break;
-    }
-  }
-
-  int verta_partition_allowed = vertab_partition_allowed;
-  int vertb_partition_allowed = vertab_partition_allowed;
-  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-    const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
-    const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
-    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-      case 1:
-        verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost);
-        vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost);
-        break;
-      case 2:
-      default:
-        verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost);
-        vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost);
-        break;
-    }
-  }
-
-  if (cpi->sf.part_sf.ml_prune_ab_partition && ext_partition_allowed &&
-      partition_horz_allowed && partition_vert_allowed) {
-    // TODO(huisu@google.com): x->source_variance may not be the current
-    // block's variance. The correct one to use is pb_source_variance. Need to
-    // re-train the model to fix it.
-    av1_ml_prune_ab_partition(
-        bsize, pc_tree->partitioning, get_unsigned_bits(x->source_variance),
-        best_rdc.rdcost, horz_rd, vert_rd, split_rd, &horza_partition_allowed,
-        &horzb_partition_allowed, &verta_partition_allowed,
-        &vertb_partition_allowed);
-  }
-
-  horza_partition_allowed &= cpi->oxcf.enable_ab_partitions;
-  horzb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
-  verta_partition_allowed &= cpi->oxcf.enable_ab_partitions;
-  vertb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
-
-  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
-      horza_partition_allowed) {
-    horza_partition_allowed &= evaluate_ab_partition_based_on_split(
-        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1);
-  }
-
-  // PARTITION_HORZ_A
-  if (!terminate_partition_search && partition_horz_allowed &&
-      horza_partition_allowed && !is_gt_max_sq_part) {
-    subsize = get_partition_subsize(bsize, PARTITION_HORZ_A);
-    pc_tree->horizontala[0].rd_mode_is_ready = 0;
-    pc_tree->horizontala[1].rd_mode_is_ready = 0;
-    pc_tree->horizontala[2].rd_mode_is_ready = 0;
-    if (split_ctx_is_ready[0]) {
-      av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none);
-      pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A;
-      pc_tree->horizontala[0].rd_mode_is_ready = 1;
-      if (split_ctx_is_ready[1]) {
-        av1_copy_tree_context(&pc_tree->horizontala[1],
-                              &pc_tree->split[1]->none);
-        pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A;
-        pc_tree->horizontala[1].rd_mode_is_ready = 1;
-      }
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    {
-      RD_STATS tmp_sum_rdc;
-      av1_init_rd_stats(&tmp_sum_rdc);
-      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_A];
-      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
-      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
-        partition_attempts[PARTITION_HORZ_A] += 1;
-        aom_usec_timer_start(&partition_timer);
-        partition_timer_on = 1;
-      }
-    }
-#endif
-    found_best_partition |= rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col,
-        bsize2, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
-        subsize);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_HORZ_A] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
-      horzb_partition_allowed) {
-    horzb_partition_allowed &= evaluate_ab_partition_based_on_split(
-        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3);
-  }
-
-  // PARTITION_HORZ_B
-  if (!terminate_partition_search && partition_horz_allowed &&
-      horzb_partition_allowed && !is_gt_max_sq_part) {
-    subsize = get_partition_subsize(bsize, PARTITION_HORZ_B);
-    pc_tree->horizontalb[0].rd_mode_is_ready = 0;
-    pc_tree->horizontalb[1].rd_mode_is_ready = 0;
-    pc_tree->horizontalb[2].rd_mode_is_ready = 0;
-    if (horz_ctx_is_ready) {
-      av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]);
-      pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
-      pc_tree->horizontalb[0].rd_mode_is_ready = 1;
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    {
-      RD_STATS tmp_sum_rdc;
-      av1_init_rd_stats(&tmp_sum_rdc);
-      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_B];
-      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
-      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
-        partition_attempts[PARTITION_HORZ_B] += 1;
-        aom_usec_timer_start(&partition_timer);
-        partition_timer_on = 1;
-      }
-    }
-#endif
-    found_best_partition |= rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col,
-        subsize, mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
-        mi_col + mi_step, bsize2);
-
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_HORZ_B] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
-      verta_partition_allowed) {
-    verta_partition_allowed &= evaluate_ab_partition_based_on_split(
-        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2);
-  }
-
-  // PARTITION_VERT_A
-  if (!terminate_partition_search && partition_vert_allowed &&
-      verta_partition_allowed && !is_gt_max_sq_part) {
-    subsize = get_partition_subsize(bsize, PARTITION_VERT_A);
-    pc_tree->verticala[0].rd_mode_is_ready = 0;
-    pc_tree->verticala[1].rd_mode_is_ready = 0;
-    pc_tree->verticala[2].rd_mode_is_ready = 0;
-    if (split_ctx_is_ready[0]) {
-      av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none);
-      pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
-      pc_tree->verticala[0].rd_mode_is_ready = 1;
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    {
-      RD_STATS tmp_sum_rdc;
-      av1_init_rd_stats(&tmp_sum_rdc);
-      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_A];
-      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
-      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
-        partition_attempts[PARTITION_VERT_A] += 1;
-        aom_usec_timer_start(&partition_timer);
-        partition_timer_on = 1;
-      }
-    }
-#endif
-    found_best_partition |= rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col,
-        bsize2, mi_row + mi_step, mi_col, bsize2, mi_row, mi_col + mi_step,
-        subsize);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_VERT_A] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
-      vertb_partition_allowed) {
-    vertb_partition_allowed &= evaluate_ab_partition_based_on_split(
-        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
-  }
-
-  // PARTITION_VERT_B
-  if (!terminate_partition_search && partition_vert_allowed &&
-      vertb_partition_allowed && !is_gt_max_sq_part) {
-    subsize = get_partition_subsize(bsize, PARTITION_VERT_B);
-    pc_tree->verticalb[0].rd_mode_is_ready = 0;
-    pc_tree->verticalb[1].rd_mode_is_ready = 0;
-    pc_tree->verticalb[2].rd_mode_is_ready = 0;
-    if (vert_ctx_is_ready) {
-      av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]);
-      pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
-      pc_tree->verticalb[0].rd_mode_is_ready = 1;
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    {
-      RD_STATS tmp_sum_rdc;
-      av1_init_rd_stats(&tmp_sum_rdc);
-      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_B];
-      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
-      if (!frame_is_intra_only(cm) &&
-          best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
-        partition_attempts[PARTITION_VERT_B] += 1;
-        aom_usec_timer_start(&partition_timer);
-        partition_timer_on = 1;
-      }
-    }
-#endif
-    found_best_partition |= rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col,
-        subsize, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step,
-        mi_col + mi_step, bsize2);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_VERT_B] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
-  // PARTITION_VERT_4 for this block. This is almost the same as
-  // ext_partition_allowed, except that we don't allow 128x32 or 32x128
-  // blocks, so we require that bsize is not BLOCK_128X128.
-  const int partition4_allowed = cpi->oxcf.enable_1to4_partitions &&
-                                 ext_partition_allowed &&
-                                 bsize != BLOCK_128X128;
-
-  int partition_horz4_allowed =
-      partition4_allowed && partition_horz_allowed &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4), xss,
-                           yss) != BLOCK_INVALID;
-  int partition_vert4_allowed =
-      partition4_allowed && partition_vert_allowed &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4), xss,
-                           yss) != BLOCK_INVALID;
-  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
-    partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
-                                pc_tree->partitioning == PARTITION_HORZ_A ||
-                                pc_tree->partitioning == PARTITION_HORZ_B ||
-                                pc_tree->partitioning == PARTITION_SPLIT ||
-                                pc_tree->partitioning == PARTITION_NONE);
-    partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
-                                pc_tree->partitioning == PARTITION_VERT_A ||
-                                pc_tree->partitioning == PARTITION_VERT_B ||
-                                pc_tree->partitioning == PARTITION_SPLIT ||
-                                pc_tree->partitioning == PARTITION_NONE);
-  }
-  if (cpi->sf.part_sf.ml_prune_4_partition && partition4_allowed &&
-      partition_horz_allowed && partition_vert_allowed) {
-    av1_ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning,
-                             best_rdc.rdcost, horz_rd, vert_rd, split_rd,
-                             &partition_horz4_allowed, &partition_vert4_allowed,
-                             pb_source_variance, mi_row, mi_col);
-  }
-
-  if (blksize < (min_partition_size << 2)) {
-    partition_horz4_allowed = 0;
-    partition_vert4_allowed = 0;
-  }
-
-  if (cpi->sf.part_sf.prune_4_partition_using_split_info &&
-      (partition_horz4_allowed || partition_vert4_allowed)) {
-    // Count of child blocks in which HORZ or VERT partition has won
-    int num_child_horz_win = 0, num_child_vert_win = 0;
-    for (int idx = 0; idx < 4; idx++) {
-      num_child_horz_win += (split_part_rect_win[idx].horz_win) ? 1 : 0;
-      num_child_vert_win += (split_part_rect_win[idx].vert_win) ? 1 : 0;
-    }
-
-    // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of
-    // split partiitons.
-    // Conservative pruning for high quantizers
-    const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
-    if (num_child_horz_win < num_win_thresh) {
-      partition_horz4_allowed = 0;
-    }
-    if (num_child_vert_win < num_win_thresh) {
-      partition_vert4_allowed = 0;
-    }
-  }
-
-  // PARTITION_HORZ_4
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed));
-  if (!terminate_partition_search && partition_horz4_allowed && has_rows &&
-      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
-      !is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    const int quarter_step = mi_size_high[bsize] / 4;
-    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
-
-    subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
-    sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
-      partition_attempts[PARTITION_HORZ_4] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    for (int i = 0; i < 4; ++i) {
-      const int this_mi_row = mi_row + i * quarter_step;
-
-      if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
-
-      PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
-
-      ctx_this->rd_mode_is_ready = 0;
-      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
-                           mi_col, subsize, best_rdc, &sum_rdc,
-                           PARTITION_HORZ_4, ctx_prev, ctx_this)) {
-        av1_invalid_rd_stats(&sum_rdc);
-        break;
-      }
-
-      ctx_prev = ctx_this;
-    }
-
-    av1_rd_cost_update(x->rdmult, &sum_rdc);
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      best_rdc = sum_rdc;
-      found_best_partition = true;
-      pc_tree->partitioning = PARTITION_HORZ_4;
-    }
-
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_HORZ_4] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // PARTITION_VERT_4
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert4_allowed));
-  if (!terminate_partition_search && partition_vert4_allowed && has_cols &&
-      (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step)) &&
-      !is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    const int quarter_step = mi_size_wide[bsize] / 4;
-    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
-
-    subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
-    sum_rdc.rate = partition_cost[PARTITION_VERT_4];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
-      partition_attempts[PARTITION_VERT_4] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    for (int i = 0; i < 4; ++i) {
-      const int this_mi_col = mi_col + i * quarter_step;
-
-      if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
-
-      PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
-
-      ctx_this->rd_mode_is_ready = 0;
-      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
-                           this_mi_col, subsize, best_rdc, &sum_rdc,
-                           PARTITION_VERT_4, ctx_prev, ctx_this)) {
-        av1_invalid_rd_stats(&sum_rdc);
-        break;
-      }
-
-      ctx_prev = ctx_this;
-    }
-
-    av1_rd_cost_update(x->rdmult, &sum_rdc);
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      best_rdc = sum_rdc;
-      found_best_partition = true;
-      pc_tree->partitioning = PARTITION_VERT_4;
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_VERT_4] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (bsize == cm->seq_params.sb_size && !found_best_partition) {
-    // Did not find a valid partition, go back and search again, with less
-    // constraint on which partition types to search.
-    x->must_find_valid_partition = 1;
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-    part_stats->partition_redo += 1;
-#endif
-    goto BEGIN_PARTITION_SEARCH;
-  }
-
-  *rd_cost = best_rdc;
-
-#if CONFIG_COLLECT_PARTITION_STATS
-  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
-    partition_decisions[pc_tree->partitioning] += 1;
-  }
-#endif
-
-#if CONFIG_COLLECT_PARTITION_STATS == 1
-  // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
-  // prediction block
-  FILE *f = fopen("data.csv", "a");
-  fprintf(f, "%d,%d,%d,", bsize, cm->show_frame, frame_is_intra_only(cm));
-  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    fprintf(f, "%d,", partition_decisions[idx]);
-  }
-  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    fprintf(f, "%d,", partition_attempts[idx]);
-  }
-  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    fprintf(f, "%ld,", partition_times[idx]);
-  }
-  fprintf(f, "\n");
-  fclose(f);
-#endif
-
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-  // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print out the stats for
-  // the whole clip. So we need to pass the information upstream to the encoder
-  const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
-  int *agg_attempts = part_stats->partition_attempts[bsize_idx];
-  int *agg_decisions = part_stats->partition_decisions[bsize_idx];
-  int64_t *agg_times = part_stats->partition_times[bsize_idx];
-  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    agg_attempts[idx] += partition_attempts[idx];
-    agg_decisions[idx] += partition_decisions[idx];
-    agg_times[idx] += partition_times[idx];
-  }
-#endif
-
-  if (found_best_partition && pc_tree->index != 3) {
-    if (bsize == cm->seq_params.sb_size) {
-      const int emit_output = multi_pass_mode != SB_DRY_PASS;
-      const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
-
-      x->cb_offset = 0;
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
-                pc_tree, NULL);
-    } else {
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-                pc_tree, NULL);
-    }
-  }
-
-  if (bsize == cm->seq_params.sb_size) {
-    assert(best_rdc.rate < INT_MAX);
-    assert(best_rdc.dist < INT64_MAX);
-  } else {
-    assert(tp_orig == *tp);
-  }
-
-  x->rdmult = orig_rdmult;
-  return found_best_partition;
-}
-#endif  // !CONFIG_REALTIME_ONLY
-#undef NUM_SIMPLE_MOTION_FEATURES
-
-#if !CONFIG_REALTIME_ONLY
-
-static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int analysis_type,
-                            int mi_row, int mi_col, int orig_rdmult) {
-  AV1_COMMON *const cm = &cpi->common;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int tpl_idx = cpi->gf_group.index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
-  int tpl_stride = tpl_frame->stride;
-  int64_t intra_cost = 0;
-  int64_t mc_dep_cost = 0;
-  const int mi_wide = mi_size_wide[bsize];
-  const int mi_high = mi_size_high[bsize];
-
-  if (tpl_frame->is_valid == 0) return orig_rdmult;
-
-  if (!is_frame_tpl_eligible(cpi)) return orig_rdmult;
-
-  if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return orig_rdmult;
-
-  int64_t mc_count = 0, mc_saved = 0;
-  int mi_count = 0;
-  const int mi_col_sr =
-      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
-  const int mi_col_end_sr =
-      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
-  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-  const int step = 1 << block_mis_log2;
-  for (int row = mi_row; row < mi_row + mi_high; row += step) {
-    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
-      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
-      TplDepStats *this_stats =
-          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
-      int64_t mc_dep_delta =
-          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
-                 this_stats->mc_dep_dist);
-      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
-      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
-      mc_count += this_stats->mc_count;
-      mc_saved += this_stats->mc_saved;
-      mi_count++;
-    }
-  }
-
-  aom_clear_system_state();
-
-  double beta = 1.0;
-  if (analysis_type == 0) {
-    if (mc_dep_cost > 0 && intra_cost > 0) {
-      const double r0 = cpi->rd.r0;
-      const double rk = (double)intra_cost / mc_dep_cost;
-      beta = (r0 / rk);
-    }
-  } else if (analysis_type == 1) {
-    const double mc_count_base = (mi_count * cpi->rd.mc_count_base);
-    beta = (mc_count + 1.0) / (mc_count_base + 1.0);
-    beta = pow(beta, 0.5);
-  } else if (analysis_type == 2) {
-    const double mc_saved_base = (mi_count * cpi->rd.mc_saved_base);
-    beta = (mc_saved + 1.0) / (mc_saved_base + 1.0);
-    beta = pow(beta, 0.5);
-  }
-
-  int rdmult = av1_get_adaptive_rdmult(cpi, beta);
-
-  aom_clear_system_state();
-
-  rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
-  rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
-
-  rdmult = AOMMAX(1, rdmult);
-
-  return rdmult;
-}
-
-static int get_tpl_stats_b(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
-                           int mi_col, int64_t *intra_cost_b,
-                           int64_t *inter_cost_b,
-                           int_mv mv_b[][INTER_REFS_PER_FRAME], int *stride) {
-  if (!cpi->oxcf.enable_tpl_model) return 0;
-  if (cpi->superres_mode != SUPERRES_NONE) return 0;
-  if (cpi->common.current_frame.frame_type == KEY_FRAME) return 0;
-  const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
-  if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
-    return 0;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-
-  AV1_COMMON *const cm = &cpi->common;
-  const int gf_group_index = cpi->gf_group.index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index];
-  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-  int tpl_stride = tpl_frame->stride;
-  const int mi_wide = mi_size_wide[bsize];
-  const int mi_high = mi_size_high[bsize];
-
-  if (tpl_frame->is_valid == 0) return 0;
-  if (gf_group_index >= MAX_LAG_BUFFERS) return 0;
-
-  int mi_count = 0;
-  int count = 0;
-  const int mi_col_sr =
-      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
-  const int mi_col_end_sr =
-      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
-  // mi_cols_sr is mi_cols at superres case.
-  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-
-  // TPL store unit size is not the same as the motion estimation unit size.
-  // Here always use motion estimation size to avoid getting repetitive inter/
-  // intra cost.
-  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
-  const int step = mi_size_wide[tpl_bsize];
-  assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
-
-  // Stride is only based on SB size, and we fill in values for every 16x16
-  // block in a SB.
-  *stride = (mi_col_end_sr - mi_col_sr) / step;
-
-  for (int row = mi_row; row < mi_row + mi_high; row += step) {
-    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
-      // Handle partial SB, so that no invalid values are used later.
-      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) {
-        inter_cost_b[count] = INT64_MAX;
-        intra_cost_b[count] = INT64_MAX;
-        for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-          mv_b[count][i].as_int = INVALID_MV;
-        }
-        count++;
-        continue;
-      }
-
-      TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
-          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
-      inter_cost_b[count] = this_stats->inter_cost;
-      intra_cost_b[count] = this_stats->intra_cost;
-      memcpy(mv_b[count], this_stats->mv, sizeof(this_stats->mv));
-      mi_count++;
-      count++;
-    }
-  }
-
-  return mi_count;
-}
-
-// analysis_type 0: Use mc_dep_cost and intra_cost
-// analysis_type 1: Use count of best inter predictor chosen
-// analysis_type 2: Use cost reduction from intra to inter for best inter
-//                  predictor chosen
-static int get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                                      int mi_row, int mi_col) {
-  AV1_COMMON *const cm = &cpi->common;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int tpl_idx = cpi->gf_group.index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
-  int tpl_stride = tpl_frame->stride;
-  int64_t intra_cost = 0;
-  int64_t mc_dep_cost = 0;
-  const int mi_wide = mi_size_wide[bsize];
-  const int mi_high = mi_size_high[bsize];
-  const int base_qindex = cm->quant_params.base_qindex;
-
-  if (tpl_frame->is_valid == 0) return base_qindex;
-
-  if (!is_frame_tpl_eligible(cpi)) return base_qindex;
-
-  if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return base_qindex;
-
-  int64_t mc_count = 0, mc_saved = 0;
-  int mi_count = 0;
-  const int mi_col_sr =
-      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
-  const int mi_col_end_sr =
-      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
-  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-  const int step = 1 << block_mis_log2;
-  for (int row = mi_row; row < mi_row + mi_high; row += step) {
-    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
-      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
-      TplDepStats *this_stats =
-          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
-      int64_t mc_dep_delta =
-          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
-                 this_stats->mc_dep_dist);
-      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
-      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
-      mc_count += this_stats->mc_count;
-      mc_saved += this_stats->mc_saved;
-      mi_count++;
-    }
-  }
-
-  aom_clear_system_state();
-
-  int offset = 0;
-  double beta = 1.0;
-  if (mc_dep_cost > 0 && intra_cost > 0) {
-    const double r0 = cpi->rd.r0;
-    const double rk = (double)intra_cost / mc_dep_cost;
-    beta = (r0 / rk);
-    assert(beta > 0.0);
-  }
-  offset = av1_get_deltaq_offset(cpi, base_qindex, beta);
-  aom_clear_system_state();
-
-  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
-  offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
-  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
-  int qindex = cm->quant_params.base_qindex + offset;
-  qindex = AOMMIN(qindex, MAXQ);
-  qindex = AOMMAX(qindex, MINQ);
-
-  return qindex;
-}
-
+/*!\brief Assigns different quantization parameters to each super
+ * block based on its TPL weight.
+ *
+ * \ingroup tpl_modelling
+ *
+ * \param[in]     cpi         Top level encoder instance structure
+ * \param[in,out] td          Thread data structure
+ * \param[in,out] x           Macro block level data for this block.
+ * \param[in]     tile_info   Tile infromation / identification
+ * \param[in]     mi_row      Block row (in "MI_SIZE" units) index
+ * \param[in]     mi_col      Block column (in "MI_SIZE" units) index
+ * \param[out]    num_planes  Number of image planes (e.g. Y,U,V)
+ *
+ * \return No return value but updates macroblock and thread data
+ * related to the q / q delta to be used.
+ */
 static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
                                      MACROBLOCK *const x,
                                      const TileInfo *const tile_info,
@@ -4126,7 +247,7 @@
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
 
   int current_qindex = cm->quant_params.base_qindex;
-  if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL) {
+  if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
     if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
       const int block_wavelet_energy_level =
           av1_block_wavelet_energy_level(cpi, x, sb_size);
@@ -4139,16 +260,18 @@
       current_qindex =
           av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level);
     }
-  } else if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE &&
-             cpi->oxcf.enable_tpl_model) {
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE &&
+             cpi->oxcf.algo_cfg.enable_tpl_model) {
     // Setup deltaq based on tpl stats
-    current_qindex = get_q_for_deltaq_objective(cpi, sb_size, mi_row, mi_col);
+    current_qindex =
+        av1_get_q_for_deltaq_objective(cpi, sb_size, mi_row, mi_col);
   }
 
   const int delta_q_res = delta_q_info->delta_q_res;
-  // Right now aq only works with tpl model. So if tpl is disabled, we set the
-  // current_qindex to base_qindex.
-  if (cpi->oxcf.enable_tpl_model && cpi->oxcf.deltaq_mode != NO_DELTA_Q) {
+  // Right now deltaq only works with tpl model. So if tpl is disabled, we set
+  // the current_qindex to base_qindex.
+  if (cpi->oxcf.algo_cfg.enable_tpl_model &&
+      cpi->oxcf.q_cfg.deltaq_mode != NO_DELTA_Q) {
     current_qindex =
         clamp(current_qindex, delta_q_res, 256 - delta_q_info->delta_q_res);
   } else {
@@ -4157,28 +280,29 @@
 
   MACROBLOCKD *const xd = &x->e_mbd;
   const int sign_deltaq_index =
-      current_qindex - xd->current_qindex >= 0 ? 1 : -1;
+      current_qindex - xd->current_base_qindex >= 0 ? 1 : -1;
   const int deltaq_deadzone = delta_q_res / 4;
   const int qmask = ~(delta_q_res - 1);
-  int abs_deltaq_index = abs(current_qindex - xd->current_qindex);
+  int abs_deltaq_index = abs(current_qindex - xd->current_base_qindex);
   abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask;
-  current_qindex = xd->current_qindex + sign_deltaq_index * abs_deltaq_index;
+  current_qindex =
+      xd->current_base_qindex + sign_deltaq_index * abs_deltaq_index;
   current_qindex = AOMMAX(current_qindex, MINQ + 1);
   assert(current_qindex > 0);
 
-  xd->delta_qindex = current_qindex - cm->quant_params.base_qindex;
-  set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+  x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
   xd->mi[0]->current_qindex = current_qindex;
   av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
 
   // keep track of any non-zero delta-q used
-  td->deltaq_used |= (xd->delta_qindex != 0);
+  td->deltaq_used |= (x->delta_qindex != 0);
 
-  if (cpi->oxcf.deltalf_mode) {
+  if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
     const int delta_lf_res = delta_q_info->delta_lf_res;
     const int lfmask = ~(delta_lf_res - 1);
     const int delta_lf_from_base =
-        ((xd->delta_qindex / 2 + delta_lf_res / 2) & lfmask);
+        ((x->delta_qindex / 2 + delta_lf_res / 2) & lfmask);
     const int8_t delta_lf =
         (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
     const int frame_lf_count =
@@ -4198,381 +322,11 @@
     }
   }
 }
-#endif  // !CONFIG_REALTIME_ONLY
 
-#define AVG_CDF_WEIGHT_LEFT 3
-#define AVG_CDF_WEIGHT_TOP_RIGHT 1
-
-static AOM_INLINE void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left,
-                                      aom_cdf_prob *cdf_ptr_tr, int num_cdfs,
-                                      int cdf_stride, int nsymbs, int wt_left,
-                                      int wt_tr) {
-  for (int i = 0; i < num_cdfs; i++) {
-    for (int j = 0; j <= nsymbs; j++) {
-      cdf_ptr_left[i * cdf_stride + j] =
-          (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left +
-                          (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr +
-                          ((wt_left + wt_tr) / 2)) /
-                         (wt_left + wt_tr));
-      assert(cdf_ptr_left[i * cdf_stride + j] >= 0 &&
-             cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP);
-    }
-  }
-}
-
-#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
-  AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
-
-#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride)           \
-  do {                                                                     \
-    aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left;               \
-    aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr;                   \
-    int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob);       \
-    int num_cdfs = array_size / cdf_stride;                                \
-    avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \
-                   wt_left, wt_tr);                                        \
-  } while (0)
-
-static AOM_INLINE void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr,
-                               int wt_left, int wt_tr) {
-  AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
-  for (int i = 0; i < 2; i++) {
-    AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
-                MV_CLASSES);
-    AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf,
-                nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE);
-    AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE);
-    AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2);
-    AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf,
-                nmv_tr->comps[i].class0_hp_cdf, 2);
-    AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2);
-    AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf,
-                CLASS0_SIZE);
-    AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2);
-  }
-}
-
-// In case of row-based multi-threading of encoder, since we always
-// keep a top - right sync, we can average the top - right SB's CDFs and
-// the left SB's CDFs and use the same for current SB's encoding to
-// improve the performance. This function facilitates the averaging
-// of CDF and used only when row-mt is enabled in encoder.
-static AOM_INLINE void avg_cdf_symbols(FRAME_CONTEXT *ctx_left,
-                                       FRAME_CONTEXT *ctx_tr, int wt_left,
-                                       int wt_tr) {
-  AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
-  AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
-  AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
-  AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
-  AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
-  AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
-  AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
-  AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
-  AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
-  AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
-  AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
-              ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
-  AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
-              MASKED_COMPOUND_TYPES);
-  AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
-  AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
-  AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
-  AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
-              INTERINTRA_MODES);
-  AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
-  AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
-  AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
-              PALETTE_SIZES);
-  AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
-              PALETTE_SIZES);
-  for (int j = 0; j < PALETTE_SIZES; j++) {
-    int nsymbs = j + PALETTE_MIN_SIZE;
-    AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
-                   ctx_tr->palette_y_color_index_cdf[j], nsymbs,
-                   CDF_SIZE(PALETTE_COLORS));
-    AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
-                   ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
-                   CDF_SIZE(PALETTE_COLORS));
-  }
-  AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
-  AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
-  AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
-  AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
-  AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
-  AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
-  AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
-  AVERAGE_CDF(ctx_left->skip_cdfs, ctx_tr->skip_cdfs, 2);
-  AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
-  avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
-  avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
-  AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
-  AVERAGE_CDF(ctx_left->seg.tree_cdf, ctx_tr->seg.tree_cdf, MAX_SEGMENTS);
-  AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
-  AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
-              ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
-  AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
-  AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
-              FILTER_INTRA_MODES);
-  AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
-              RESTORE_SWITCHABLE_TYPES);
-  AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
-  AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
-  AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
-  AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],
-                 UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
-  AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
-  for (int i = 0; i < PARTITION_CONTEXTS; i++) {
-    if (i < 4) {
-      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
-                     CDF_SIZE(10));
-    } else if (i < 16) {
-      AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
-    } else {
-      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
-                     CDF_SIZE(10));
-    }
-  }
-  AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
-              SWITCHABLE_FILTERS);
-  AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
-  AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
-              2 * MAX_ANGLE_DELTA + 1);
-  AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,
-                 CDF_SIZE(MAX_TX_DEPTH + 1));
-  AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
-              MAX_TX_DEPTH + 1);
-  AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
-              MAX_TX_DEPTH + 1);
-  AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
-              MAX_TX_DEPTH + 1);
-  AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
-  AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
-  for (int i = 0; i < FRAME_LF_COUNT; i++) {
-    AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
-                DELTA_LF_PROBS + 1);
-  }
-  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,
-                 CDF_SIZE(TX_TYPES));
-  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
-                 CDF_SIZE(TX_TYPES));
-  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
-                 CDF_SIZE(TX_TYPES));
-  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
-                 CDF_SIZE(TX_TYPES));
-  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
-                 CDF_SIZE(TX_TYPES));
-  AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
-  AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
-              CFL_ALPHABET_SIZE);
-}
-
-#if !CONFIG_REALTIME_ONLY
-static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
-                                               int mi_row, int mi_col) {
-  const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
-  const int orig_rdmult = cpi->rd.RDMULT;
-
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int gf_group_index = cpi->gf_group.index;
-  if (cpi->oxcf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ &&
-      cpi->oxcf.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
-      cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
-    const int dr =
-        get_rdmult_delta(cpi, sb_size, 0, mi_row, mi_col, orig_rdmult);
-    x->rdmult = dr;
-  }
-}
-#endif
-
-static void source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int shift) {
-  unsigned int tmp_sse;
-  unsigned int tmp_variance;
-  const BLOCK_SIZE bsize = BLOCK_64X64;
-  uint8_t *src_y = cpi->source->y_buffer;
-  int src_ystride = cpi->source->y_stride;
-  uint8_t *last_src_y = cpi->last_source->y_buffer;
-  int last_src_ystride = cpi->last_source->y_stride;
-  uint64_t avg_source_sse_threshold = 100000;        // ~5*5*(64*64)
-  uint64_t avg_source_sse_threshold_high = 1000000;  // ~15*15*(64*64)
-  uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
-#if CONFIG_AV1_HIGHBITDEPTH
-  MACROBLOCKD *xd = &x->e_mbd;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
-#endif
-  src_y += shift;
-  last_src_y += shift;
-  tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
-                                       last_src_ystride, &tmp_sse);
-  // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
-  // Detect large lighting change.
-  if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh)
-    x->content_state_sb = kLowVarHighSumdiff;
-  else if (tmp_sse < avg_source_sse_threshold)
-    x->content_state_sb = kLowSad;
-  else if (tmp_sse > avg_source_sse_threshold_high)
-    x->content_state_sb = kHighSad;
-}
-
-static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
-                                       TileDataEnc *tile_data,
-                                       PC_TREE *const pc_root, TOKENEXTRA **tp,
-                                       const int mi_row, const int mi_col,
-                                       const int seg_skip) {
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const TileInfo *const tile_info = &tile_data->tile_info;
-  MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
-                      get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-  if (sf->rt_sf.source_metrics_sb_nonrd && sb_size == BLOCK_64X64 &&
-      cpi->svc.number_spatial_layers <= 1 &&
-      cm->current_frame.frame_type != KEY_FRAME) {
-    int shift = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2);
-    source_content_sb(cpi, x, shift);
-  }
-  if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    const BLOCK_SIZE bsize =
-        seg_skip ? sb_size : sf->part_sf.always_this_block_size;
-    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-  } else if (cpi->partition_search_skippable_frame) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    const BLOCK_SIZE bsize =
-        get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-  } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
-    set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
-  }
-  assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
-         cpi->partition_search_skippable_frame ||
-         sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
-  td->mb.cb_offset = 0;
-  nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                      pc_root);
-}
-
-// Memset the mbmis at the current superblock to 0
-static INLINE void reset_mbmi(CommonModeInfoParams *const mi_params,
-                              BLOCK_SIZE sb_size, int mi_row, int mi_col) {
-  // size of sb in unit of mi (BLOCK_4X4)
-  const int sb_size_mi = mi_size_wide[sb_size];
-  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
-  // size of sb in unit of allocated mi size
-  const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d;
-  assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 &&
-         "mi is not allocated as a multiple of sb!");
-  assert(mi_params->mi_stride % sb_size_mi == 0 &&
-         "mi_grid_base is not allocated as a multiple of sb!");
-
-  const int mi_rows = mi_size_high[sb_size];
-  for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) {
-    assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) <
-           mi_params->mi_stride);
-    const int mi_grid_idx =
-        get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col);
-    const int alloc_mi_idx =
-        get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col);
-    memset(&mi_params->mi_grid_base[mi_grid_idx], 0,
-           sb_size_mi * sizeof(*mi_params->mi_grid_base));
-    memset(&mi_params->tx_type_map[mi_grid_idx], 0,
-           sb_size_mi * sizeof(*mi_params->tx_type_map));
-    if (cur_mi_row % mi_alloc_size_1d == 0) {
-      memset(&mi_params->mi_alloc[alloc_mi_idx], 0,
-             sb_size_alloc_mi * sizeof(*mi_params->mi_alloc));
-    }
-  }
-}
-
-static INLINE void backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats,
-                                   const AV1_COMP *cpi, ThreadData *td,
-                                   const TileDataEnc *tile_data, int mi_row,
-                                   int mi_col) {
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const TileInfo *tile_info = &tile_data->tile_info;
-
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
-
-  sb_fp_stats->rd_count = cpi->td.rd_counts;
-  sb_fp_stats->split_count = cpi->td.mb.txb_split_count;
-
-  sb_fp_stats->fc = *td->counts;
-
-  memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
-         sizeof(sb_fp_stats->inter_mode_rd_models));
-
-  memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
-         sizeof(sb_fp_stats->thresh_freq_fact));
-
-  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
-  sb_fp_stats->current_qindex =
-      cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
-
-#if CONFIG_INTERNAL_STATS
-  memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
-         sizeof(sb_fp_stats->mode_chosen_counts));
-#endif  // CONFIG_INTERNAL_STATS
-}
-
-static INLINE void restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats,
-                                    AV1_COMP *cpi, ThreadData *td,
-                                    TileDataEnc *tile_data, int mi_row,
-                                    int mi_col) {
-  MACROBLOCK *x = &td->mb;
-
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-
-  restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
-
-  cpi->td.rd_counts = sb_fp_stats->rd_count;
-  cpi->td.mb.txb_split_count = sb_fp_stats->split_count;
-
-  *td->counts = sb_fp_stats->fc;
-
-  memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
-         sizeof(sb_fp_stats->inter_mode_rd_models));
-  memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
-         sizeof(sb_fp_stats->thresh_freq_fact));
-
-  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
-  cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
-      sb_fp_stats->current_qindex;
-
-#if CONFIG_INTERNAL_STATS
-  memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
-         sizeof(sb_fp_stats->mode_chosen_counts));
-#endif  // CONFIG_INTERNAL_STATS
-}
-
-#if !CONFIG_REALTIME_ONLY
 static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
                                  int mi_col) {
   const AV1_COMMON *cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCK *x = &td->mb;
   const int frame_idx = cpi->gf_group.index;
@@ -4580,17 +334,16 @@
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
 
-  av1_zero(x->search_ref_frame);
+  av1_zero(x->tpl_keep_ref_frame);
 
   if (tpl_frame->is_valid == 0) return;
-  if (!is_frame_tpl_eligible(cpi)) return;
-  if (frame_idx >= MAX_LAG_BUFFERS) return;
-  if (cpi->superres_mode != SUPERRES_NONE) return;
-  if (cpi->oxcf.aq_mode != NO_AQ) return;
+  if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return;
+  if (frame_idx >= MAX_TPL_FRAME_IDX) return;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
 
   const int is_overlay = cpi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
   if (is_overlay) {
-    memset(x->search_ref_frame, 1, sizeof(x->search_ref_frame));
+    memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame));
     return;
   }
 
@@ -4599,13 +352,21 @@
   int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 };
   const int step = 1 << block_mis_log2;
   const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
   const int mi_row_end =
       AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows);
-  const int mi_col_end =
-      AOMMIN(mi_size_wide[sb_size] + mi_col, mi_params->mi_cols);
-
-  for (int row = mi_row; row < mi_row_end; row += step) {
-    for (int col = mi_col; col < mi_col_end; col += step) {
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      AOMMIN(coded_to_superres_mi(mi_col + mi_size_wide[sb_size],
+                                  cm->superres_scale_denominator),
+             mi_cols_sr);
+  const int row_step = step;
+  const int col_step_sr =
+      coded_to_superres_mi(step, cm->superres_scale_denominator);
+  for (int row = mi_row; row < mi_row_end; row += row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
       const TplDepStats *this_stats =
           &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
       int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 };
@@ -4641,12 +402,12 @@
     }
   }
 
-  x->search_ref_frame[INTRA_FRAME] = 1;
-  x->search_ref_frame[LAST_FRAME] = 1;
+  x->tpl_keep_ref_frame[INTRA_FRAME] = 1;
+  x->tpl_keep_ref_frame[LAST_FRAME] = 1;
 
   int cutoff_ref = 0;
   for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
-    x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 1;
+    x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 1;
     if (idx > 2) {
       if (!cutoff_ref) {
         // If the predictive coding gains are smaller than the previous more
@@ -4658,17 +419,154 @@
           cutoff_ref = 1;
       }
 
-      if (cutoff_ref) x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 0;
+      if (cutoff_ref) x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 0;
     }
   }
 }
+
+static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
+                                               int mi_row, int mi_col) {
+  const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
+  const int orig_rdmult = cpi->rd.RDMULT;
+
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int gf_group_index = cpi->gf_group.index;
+  if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+      cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
+      cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
+    const int dr =
+        av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult);
+    x->rdmult = dr;
+  }
+}
 #endif  // !CONFIG_REALTIME_ONLY
 
+// Get a prediction(stored in x->est_pred) for the whole superblock.
+static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile,
+                               MACROBLOCK *x, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int is_key_frame = frame_is_intra_only(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  // TODO(kyslov) Extend to 128x128
+  assert(cm->seq_params.sb_size == BLOCK_64X64);
+
+  av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+  if (!is_key_frame) {
+    MB_MODE_INFO *mi = xd->mi[0];
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+    assert(yv12 != NULL);
+
+    av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         get_ref_scale_factors(cm, LAST_FRAME), 1);
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE;
+    mi->bsize = BLOCK_64X64;
+    mi->mv[0].as_int = 0;
+    mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+
+    xd->plane[0].dst.buf = x->est_pred;
+    xd->plane[0].dst.stride = 64;
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+  } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+    switch (xd->bd) {
+      case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break;
+      case 10:
+        memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0]));
+        break;
+      case 12:
+        memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0]));
+        break;
+    }
+#else
+    memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0]));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+}
+
+#define AVG_CDF_WEIGHT_LEFT 3
+#define AVG_CDF_WEIGHT_TOP_RIGHT 1
+
+/*!\brief Encode a superblock (minimal RD search involved)
+ *
+ * \ingroup partition_search
+ * Encodes the superblock by a pre-determined partition pattern, only minor
+ * rd-based searches are allowed to adjust the initial pattern. It is only used
+ * by realtime encoding.
+ */
+static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
+                                       TileDataEnc *tile_data, TokenExtra **tp,
+                                       const int mi_row, const int mi_col,
+                                       const int seg_skip) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+                      get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
+  // Grade the temporal variation of the sb, the grade will be used to decide
+  // fast mode search strategy for coding blocks
+  if (sf->rt_sf.source_metrics_sb_nonrd &&
+      cpi->svc.number_spatial_layers <= 1 &&
+      cm->current_frame.frame_type != KEY_FRAME) {
+    int offset = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2);
+    av1_source_content_sb(cpi, x, offset);
+  }
+
+  if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
+    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+    RD_STATS dummy_rdc;
+    get_estimated_pred(cpi, tile_info, x, mi_row, mi_col);
+    av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                             BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, pc_root);
+    av1_free_pc_tree_recursive(pc_root, av1_num_planes(cm), 0, 0);
+    return;
+  }
+
+  // Set the partition
+  if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
+    // set a fixed-size partition
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    const BLOCK_SIZE bsize =
+        seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
+    av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+  } else if (cpi->partition_search_skippable_frame) {
+    // set a fixed-size partition for which the size is determined by the source
+    // variance
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    const BLOCK_SIZE bsize =
+        get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+    av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+  } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+    // set a variance-based partition
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+  }
+  assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+         cpi->partition_search_skippable_frame ||
+         sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
+  set_cb_offsets(td->mb.cb_offset, 0, 0);
+
+  // Adjust and encode the superblock
+  PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+  av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                          pc_root);
+  av1_free_pc_tree_recursive(pc_root, av1_num_planes(cm), 0, 0);
+}
+
 // This function initializes the stats for encode_rd_sb.
 static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
                                      const TileDataEnc *tile_data,
-                                     PC_TREE *pc_root, RD_STATS *rd_cost,
-                                     int mi_row, int mi_col,
+                                     SIMPLE_MOTION_DATA_TREE *sms_root,
+                                     RD_STATS *rd_cost, int mi_row, int mi_col,
                                      int gather_tpl_data) {
   const AV1_COMMON *cm = &cpi->common;
   const TileInfo *tile_info = &tile_data->tile_info;
@@ -4682,22 +580,30 @@
        sf->part_sf.ml_early_term_after_part_split_level) &&
       !frame_is_intra_only(cm);
   if (use_simple_motion_search) {
-    init_simple_motion_search_mvs(pc_root);
+    init_simple_motion_search_mvs(sms_root);
   }
 
 #if !CONFIG_REALTIME_ONLY
-  init_ref_frame_space(cpi, td, mi_row, mi_col);
-  x->sb_energy_level = 0;
-  x->cnn_output_valid = 0;
-  if (gather_tpl_data) {
-    if (cm->delta_q_info.delta_q_present_flag) {
-      const int num_planes = av1_num_planes(cm);
-      const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-      setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
-      av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
-    }
-    if (cpi->oxcf.enable_tpl_model) {
-      adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col);
+  if (has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+      cpi->oxcf.gf_cfg.lag_in_frames == 0) {
+    (void)tile_info;
+    (void)mi_row;
+    (void)mi_col;
+    (void)gather_tpl_data;
+  } else {
+    init_ref_frame_space(cpi, td, mi_row, mi_col);
+    x->sb_energy_level = 0;
+    x->part_search_info.cnn_output_valid = 0;
+    if (gather_tpl_data) {
+      if (cm->delta_q_info.delta_q_present_flag) {
+        const int num_planes = av1_num_planes(cm);
+        const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+        setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
+        av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
+      }
+      if (cpi->oxcf.algo_cfg.enable_tpl_model) {
+        adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col);
+      }
     }
   }
 #else
@@ -4708,15 +614,19 @@
 #endif
 
   // Reset hash state for transform/mode rd hash information
-  reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash);
+  reset_hash_records(&x->txfm_search_info, cpi->sf.tx_sf.use_inter_txb_hash);
   av1_zero(x->picked_ref_frames_mask);
-  av1_zero(x->pred_mv);
   av1_invalid_rd_stats(rd_cost);
 }
 
+/*!\brief Encode a superblock (RD-search-based)
+ *
+ * \ingroup partition_search
+ * Conducts partition search for a superblock, based on rate-distortion costs,
+ * from scratch or adjusting from a pre-calculated partition pattern.
+ */
 static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
-                                    TileDataEnc *tile_data,
-                                    PC_TREE *const pc_root, TOKENEXTRA **tp,
+                                    TileDataEnc *tile_data, TokenExtra **tp,
                                     const int mi_row, const int mi_col,
                                     const int seg_skip) {
   AV1_COMMON *const cm = &cpi->common;
@@ -4726,94 +636,111 @@
   MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
                       get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
   const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const int num_planes = av1_num_planes(cm);
   int dummy_rate;
   int64_t dummy_dist;
   RD_STATS dummy_rdc;
+  SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root;
 
 #if CONFIG_REALTIME_ONLY
   (void)seg_skip;
 #endif  // CONFIG_REALTIME_ONLY
 
-  init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col, 1);
+  init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col,
+                    1);
 
+  // Encode the superblock
   if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
-    set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    // partition search starting from a variance-based partition
+    av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col,
+                                       sb_size);
     av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
-    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                     &dummy_rate, &dummy_dist, 1, pc_root);
+    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+    av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                         &dummy_rate, &dummy_dist, 1, pc_root);
+    av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
   }
 #if !CONFIG_REALTIME_ONLY
   else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    // partition search by adjusting a fixed-size partition
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
     const BLOCK_SIZE bsize =
-        seg_skip ? sb_size : sf->part_sf.always_this_block_size;
-    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                     &dummy_rate, &dummy_dist, 1, pc_root);
+        seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
+    av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+    av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                         &dummy_rate, &dummy_dist, 1, pc_root);
+    av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
   } else if (cpi->partition_search_skippable_frame) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    // partition search by adjusting a fixed-size partition for which the size
+    // is determined by the source variance
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
     const BLOCK_SIZE bsize =
         get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                     &dummy_rate, &dummy_dist, 1, pc_root);
+    av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+    av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                         &dummy_rate, &dummy_dist, 1, pc_root);
+    av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
   } else {
+    // The most exhaustive recursive partition search
+    SuperBlockEnc *sb_enc = &x->sb_enc;
     // No stats for overlay frames. Exclude key frame.
-    x->valid_cost_b =
-        get_tpl_stats_b(cpi, sb_size, mi_row, mi_col, x->intra_cost_b,
-                        x->inter_cost_b, x->mv_b, &x->cost_stride);
+    av1_get_tpl_stats_sb(cpi, sb_size, mi_row, mi_col, sb_enc);
 
-    reset_partition(pc_root, sb_size);
+    // Reset the tree for simple motion search data
+    av1_reset_simple_motion_tree_partition(sms_root, sb_size);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, rd_pick_partition_time);
 #endif
-    BLOCK_SIZE max_sq_size = x->max_partition_size;
-    BLOCK_SIZE min_sq_size = x->min_partition_size;
 
-    if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
-      float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+    // Estimate the maximum square partition block size, which will be used
+    // as the starting block size for partitioning the sb
+    set_max_min_partition_size(sb_enc, cpi, x, sf, sb_size, mi_row, mi_col);
 
-      av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
-      max_sq_size = AOMMAX(
-          AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size),
-          min_sq_size);
-    }
-
-    const int num_passes = cpi->oxcf.sb_multipass_unit_test ? 2 : 1;
+    // The superblock can be searched only once, or twice consecutively for
+    // better quality. Note that the meaning of passes here is different from
+    // the general concept of 1-pass/2-pass encoders.
+    const int num_passes =
+        cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 2 : 1;
 
     if (num_passes == 1) {
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_SINGLE_PASS, NULL);
+      PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+      av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                            &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
+                            SB_SINGLE_PASS, NULL);
     } else {
       // First pass
       SB_FIRST_PASS_STATS sb_fp_stats;
-      backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_DRY_PASS, NULL);
+      av1_backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
+      PC_TREE *const pc_root_p0 = av1_alloc_pc_tree_node(sb_size);
+      av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                            &dummy_rdc, dummy_rdc, pc_root_p0, sms_root, NULL,
+                            SB_DRY_PASS, NULL);
 
       // Second pass
-      init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col,
-                        0);
-      reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
-      reset_partition(pc_root, sb_size);
+      init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
+                        mi_col, 0);
+      av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+      av1_reset_simple_motion_tree_partition(sms_root, sb_size);
 
-      restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
+      av1_restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
 
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_WET_PASS, NULL);
+      PC_TREE *const pc_root_p1 = av1_alloc_pc_tree_node(sb_size);
+      av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                            &dummy_rdc, dummy_rdc, pc_root_p1, sms_root, NULL,
+                            SB_WET_PASS, NULL);
     }
     // Reset to 0 so that it wouldn't be used elsewhere mistakenly.
-    x->valid_cost_b = 0;
+    sb_enc->tpl_data_count = 0;
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, rd_pick_partition_time);
 #endif
   }
 #endif  // !CONFIG_REALTIME_ONLY
 
+  // Update the inter rd model
   // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
   if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
       cm->tiles.cols == 1 && cm->tiles.rows == 1) {
@@ -4821,66 +748,23 @@
   }
 }
 
-static AOM_INLINE void set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
-                                         const TileInfo *const tile_info,
-                                         const int mi_row, const int mi_col) {
-  AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  switch (cpi->oxcf.coeff_cost_upd_freq) {
-    case COST_UPD_TILE:  // Tile level
-      if (mi_row != tile_info->mi_row_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SBROW:  // SB row level in tile
-      if (mi_col != tile_info->mi_col_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SB:  // SB level
-      if (cpi->sf.inter_sf.disable_sb_level_coeff_cost_upd &&
-          mi_col != tile_info->mi_col_start)
-        break;
-      av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
-      break;
-    default: assert(0);
-  }
-
-  switch (cpi->oxcf.mode_cost_upd_freq) {
-    case COST_UPD_TILE:  // Tile level
-      if (mi_row != tile_info->mi_row_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SBROW:  // SB row level in tile
-      if (mi_col != tile_info->mi_col_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SB:  // SB level
-      av1_fill_mode_rates(cm, x, xd->tile_ctx);
-      break;
-    default: assert(0);
-  }
-  switch (cpi->oxcf.mv_cost_upd_freq) {
-    case COST_UPD_OFF: break;
-    case COST_UPD_TILE:  // Tile level
-      if (mi_row != tile_info->mi_row_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SBROW:  // SB row level in tile
-      if (mi_col != tile_info->mi_col_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SB:  // SB level
-      if (cpi->sf.inter_sf.disable_sb_level_mv_cost_upd &&
-          mi_col != tile_info->mi_col_start)
-        break;
-      av1_fill_mv_costs(xd->tile_ctx, cm->features.cur_frame_force_integer_mv,
-                        cm->features.allow_high_precision_mv, x);
-      break;
-    default: assert(0);
-  }
-}
-
+/*!\brief Encode a superblock row by breaking it into superblocks
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Do partition and mode search for an sb row: one row of superblocks filling up
+ * the width of the current tile.
+ */
 static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
                                      TileDataEnc *tile_data, int mi_row,
-                                     TOKENEXTRA **tp) {
+                                     TokenExtra **tp) {
   AV1_COMMON *const cm = &cpi->common;
   const TileInfo *const tile_info = &tile_data->tile_info;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+  bool row_mt_enabled = mt_info->row_mt_enabled;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
@@ -4891,58 +775,61 @@
   const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, encode_sb_time);
+  start_timing(cpi, encode_sb_row_time);
 #endif
 
   // Initialize the left context for the new SB row
   av1_zero_left_context(xd);
 
-  // Reset delta for every tile
-  if (mi_row == tile_info->mi_row_start || cpi->row_mt) {
+  // Reset delta for quantizer and loof filters at the beginning of every tile
+  if (mi_row == tile_info->mi_row_start || row_mt_enabled) {
     if (cm->delta_q_info.delta_q_present_flag)
-      xd->current_qindex = cm->quant_params.base_qindex;
+      xd->current_base_qindex = cm->quant_params.base_qindex;
     if (cm->delta_q_info.delta_lf_present_flag) {
       av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
     }
   }
+
   reset_thresh_freq_fact(x);
 
   // Code each SB in the row
   for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
        mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
-    (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
-                                   sb_col_in_tile);
-    if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
+    (*(enc_row_mt->sync_read_ptr))(row_mt_sync, sb_row, sb_col_in_tile);
+
+    if (tile_data->allow_update_cdf && row_mt_enabled &&
         (tile_info->mi_row_start != mi_row)) {
       if ((tile_info->mi_col_start == mi_col)) {
-        // restore frame context of 1st column sb
+        // restore frame context at the 1st column sb
         memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx));
       } else {
+        // update context
         int wt_left = AVG_CDF_WEIGHT_LEFT;
         int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT;
         if (tile_info->mi_col_end > (mi_col + mib_size))
-          avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, wt_left,
-                          wt_tr);
+          av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile,
+                              wt_left, wt_tr);
         else
-          avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
-                          wt_left, wt_tr);
+          av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
+                              wt_left, wt_tr);
       }
     }
 
-    set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
+    // Update the rate cost tables for some symbols
+    av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
 
+    // Reset color coding related parameters
     x->color_sensitivity[0] = 0;
     x->color_sensitivity[1] = 0;
-    x->content_state_sb = 0;
-
-    PC_TREE *const pc_root = td->pc_root;
-    pc_root->index = 0;
+    x->content_state_sb.source_sad = kMedSad;
+    x->content_state_sb.lighting_change = 0;
+    x->content_state_sb.low_sumdiff = 0;
 
     xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv;
-    td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
     x->source_variance = UINT_MAX;
-    x->simple_motion_pred_sse = UINT_MAX;
+    td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
 
+    // Get segment id and skip flag
     const struct segmentation *const seg = &cm->seg;
     int seg_skip = 0;
     if (seg->enabled) {
@@ -4954,14 +841,15 @@
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
 
+    // encode the superblock
     if (use_nonrd_mode) {
-      encode_nonrd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col,
-                      seg_skip);
+      encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
     } else {
-      encode_rd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col, seg_skip);
+      encode_rd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
     }
 
-    if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
+    // Update the top-right context in row_mt coding
+    if (tile_data->allow_update_cdf && row_mt_enabled &&
         (tile_info->mi_row_end > (mi_row + mib_size))) {
       if (sb_cols_in_tile == 1)
         memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
@@ -4969,11 +857,11 @@
         memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx,
                sizeof(*xd->tile_ctx));
     }
-    (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
-                                    sb_col_in_tile, sb_cols_in_tile);
+    (*(enc_row_mt->sync_write_ptr))(row_mt_sync, sb_row, sb_col_in_tile,
+                                    sb_cols_in_tile);
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, encode_sb_time);
+  end_timing(cpi, encode_sb_row_time);
 #endif
 }
 
@@ -5010,8 +898,9 @@
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
   int tile_col, tile_row;
-  TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
-  TOKENLIST *tplist = cpi->tplist[0][0];
+  TokenInfo *const token_info = &cpi->token_info;
+  TokenExtra *pre_tok = token_info->tile_tok[0][0];
+  TokenList *tplist = token_info->tplist[0][0];
   unsigned int tile_tok = 0;
   int tplist_count = 0;
 
@@ -5021,14 +910,18 @@
           &cpi->tile_data[tile_row * tile_cols + tile_col];
       TileInfo *const tile_info = &tile_data->tile_info;
       av1_tile_init(tile_info, cm, tile_row, tile_col);
+      tile_data->firstpass_top_mv = kZeroMv;
 
-      cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
-      pre_tok = cpi->tile_tok[tile_row][tile_col];
-      tile_tok = allocated_tokens(
-          *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
-      cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
-      tplist = cpi->tplist[tile_row][tile_col];
-      tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
+      if (pre_tok != NULL && tplist != NULL) {
+        token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+        pre_tok = token_info->tile_tok[tile_row][tile_col];
+        tile_tok = allocated_tokens(*tile_info,
+                                    cm->seq_params.mib_size_log2 + MI_SIZE_LOG2,
+                                    num_planes);
+        token_info->tplist[tile_row][tile_col] = tplist + tplist_count;
+        tplist = token_info->tplist[tile_row][tile_col];
+        tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
+      }
       tile_data->allow_update_cdf = !cm->tiles.large_scale;
       tile_data->allow_update_cdf =
           tile_data->allow_update_cdf && !cm->features.disable_cdf_update;
@@ -5037,6 +930,10 @@
   }
 }
 
+/*!\brief Encode a superblock row
+ *
+ * \ingroup partition_search
+ */
 void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
                        int tile_col, int mi_row) {
   AV1_COMMON *const cm = &cpi->common;
@@ -5044,7 +941,8 @@
   const int tile_cols = cm->tiles.cols;
   TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
   const TileInfo *const tile_info = &this_tile->tile_info;
-  TOKENEXTRA *tok = NULL;
+  TokenExtra *tok = NULL;
+  TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col];
   const int sb_row_in_tile =
       (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2;
   const int tile_mb_cols =
@@ -5054,25 +952,26 @@
 
   get_start_tok(cpi, tile_row, tile_col, mi_row, &tok,
                 cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
-  cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
+  tplist[sb_row_in_tile].start = tok;
 
   encode_sb_row(cpi, td, this_tile, mi_row, &tok);
 
-  cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
-  cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
-      (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop -
-                     cpi->tplist[tile_row][tile_col][sb_row_in_tile].start);
+  tplist[sb_row_in_tile].count =
+      (unsigned int)(tok - tplist[sb_row_in_tile].start);
 
-  assert(
-      (unsigned int)(tok -
-                     cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <=
-      get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
-                      cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes));
+  assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <=
+         get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+                         cm->seq_params.mib_size_log2 + MI_SIZE_LOG2,
+                         num_planes));
 
   (void)tile_mb_cols;
   (void)num_mb_rows_in_sb;
 }
 
+/*!\brief Encode a tile
+ *
+ * \ingroup partition_search
+ */
 void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
                      int tile_col) {
   AV1_COMMON *const cm = &cpi->common;
@@ -5087,9 +986,11 @@
   av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
                          &td->mb.e_mbd);
 
-  if (cpi->oxcf.enable_cfl_intra) cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
+  if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra)
+    cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
 
-  av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
+  av1_crc32c_calculator_init(
+      &td->mb.txfm_search_info.mb_rd_record.crc_calculator);
 
   for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += cm->seq_params.mib_size) {
@@ -5097,14 +998,21 @@
   }
 }
 
+/*!\brief Break one frame into tiles and encode the tiles
+ *
+ * \ingroup partition_search
+ *
+ * \param[in]    cpi    Top-level encoder structure
+ */
 static AOM_INLINE void encode_tiles(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
   int tile_col, tile_row;
 
-  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
-    av1_alloc_tile_data(cpi);
+  assert(IMPLIES(cpi->tile_data == NULL,
+                 cpi->allocated_tiles < tile_cols * tile_rows));
+  if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
 
   av1_init_tile_data(cpi);
 
@@ -5123,95 +1031,29 @@
   }
 }
 
-#define GLOBAL_TRANS_TYPES_ENC 3  // highest motion model to search
-static int gm_get_params_cost(const WarpedMotionParams *gm,
-                              const WarpedMotionParams *ref_gm, int allow_hp) {
-  int params_cost = 0;
-  int trans_bits, trans_prec_diff;
-  switch (gm->wmtype) {
-    case AFFINE:
-    case ROTZOOM:
-      params_cost += aom_count_signed_primitive_refsubexpfin(
-          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-          (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
-          (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
-      params_cost += aom_count_signed_primitive_refsubexpfin(
-          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-          (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
-          (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
-      if (gm->wmtype >= AFFINE) {
-        params_cost += aom_count_signed_primitive_refsubexpfin(
-            GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
-            (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
-        params_cost += aom_count_signed_primitive_refsubexpfin(
-            GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
-                (1 << GM_ALPHA_PREC_BITS),
-            (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
-      }
-      AOM_FALLTHROUGH_INTENDED;
-    case TRANSLATION:
-      trans_bits = (gm->wmtype == TRANSLATION)
-                       ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
-                       : GM_ABS_TRANS_BITS;
-      trans_prec_diff = (gm->wmtype == TRANSLATION)
-                            ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
-                            : GM_TRANS_PREC_DIFF;
-      params_cost += aom_count_signed_primitive_refsubexpfin(
-          (1 << trans_bits) + 1, SUBEXPFIN_K,
-          (ref_gm->wmmat[0] >> trans_prec_diff),
-          (gm->wmmat[0] >> trans_prec_diff));
-      params_cost += aom_count_signed_primitive_refsubexpfin(
-          (1 << trans_bits) + 1, SUBEXPFIN_K,
-          (ref_gm->wmmat[1] >> trans_prec_diff),
-          (gm->wmmat[1] >> trans_prec_diff));
-      AOM_FALLTHROUGH_INTENDED;
-    case IDENTITY: break;
-    default: assert(0);
-  }
-  return (params_cost << AV1_PROB_COST_SHIFT);
-}
-
-static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) {
-  (void)frame;
-  switch (sf->gm_sf.gm_search_type) {
-    case GM_FULL_SEARCH: return 1;
-    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
-      return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
-    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
-      return !(frame == LAST2_FRAME || frame == LAST3_FRAME ||
-               (frame == ALTREF2_FRAME));
-    case GM_DISABLE_SEARCH: return 0;
-    default: assert(0);
-  }
-  return 1;
-}
-
 // Set the relative distance of a reference frame w.r.t. current frame
-static AOM_INLINE void set_rel_frame_dist(AV1_COMP *cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+static AOM_INLINE void set_rel_frame_dist(
+    const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info,
+    const int ref_frame_flags) {
   MV_REFERENCE_FRAME ref_frame;
   int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX;
-  cpi->nearest_past_ref = NONE_FRAME;
-  cpi->nearest_future_ref = NONE_FRAME;
+  ref_frame_dist_info->nearest_past_ref = NONE_FRAME;
+  ref_frame_dist_info->nearest_future_ref = NONE_FRAME;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    cpi->ref_relative_dist[ref_frame - LAST_FRAME] = 0;
-    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+    ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = 0;
+    if (ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
       int dist = av1_encoder_get_relative_dist(
-          order_hint_info,
           cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME],
           cm->current_frame.display_order_hint);
-      cpi->ref_relative_dist[ref_frame - LAST_FRAME] = dist;
+      ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = dist;
       // Get the nearest ref_frame in the past
       if (abs(dist) < min_past_dist && dist < 0) {
-        cpi->nearest_past_ref = ref_frame;
+        ref_frame_dist_info->nearest_past_ref = ref_frame;
         min_past_dist = abs(dist);
       }
       // Get the nearest ref_frame in the future
       if (dist < min_future_dist && dist > 0) {
-        cpi->nearest_future_ref = ref_frame;
+        ref_frame_dist_info->nearest_future_ref = ref_frame;
         min_future_dist = dist;
       }
     }
@@ -5222,14 +1064,12 @@
   assert(!frame_is_intra_only(cm));
 
   int one_sided_refs = 1;
+  const int cur_display_order_hint = cm->current_frame.display_order_hint;
   for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
     if (buf == NULL) continue;
-
-    const int ref_display_order_hint = buf->display_order_hint;
-    if (av1_encoder_get_relative_dist(
-            &cm->seq_params.order_hint_info, ref_display_order_hint,
-            (int)cm->current_frame.display_order_hint) > 0) {
+    if (av1_encoder_get_relative_dist(buf->display_order_hint,
+                                      cur_display_order_hint) > 0) {
       one_sided_refs = 0;  // bwd reference
       break;
     }
@@ -5271,7 +1111,7 @@
   if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0;
 
   // High Latency: Turn off skip mode if all refs are fwd.
-  if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0;
+  if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0;
 
   static const int flag_list[REF_FRAMES] = { 0,
                                              AOM_LAST_FLAG,
@@ -5292,19 +1132,6 @@
   return 1;
 }
 
-// Function to decide if we can skip the global motion parameter computation
-// for a particular ref frame
-static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
-  if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) &&
-      cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) {
-    return get_relative_dist(
-               &cm->seq_params.order_hint_info,
-               cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME],
-               cm->cur_frame->ref_order_hints[GOLDEN_FRAME - LAST_FRAME]) <= 0;
-  }
-  return 0;
-}
-
 static AOM_INLINE void set_default_interp_skip_flags(
     const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) {
   const int num_planes = av1_num_planes(cm);
@@ -5313,271 +1140,24 @@
                         : INTERP_SKIP_LUMA_SKIP_CHROMA;
 }
 
-// TODO(Remya): Can include erroradv_prod_tr[] for threshold calculation
-static INLINE int64_t calc_erroradv_threshold(AV1_COMP *cpi,
-                                              int64_t ref_frame_error) {
-  if (!cpi->sf.gm_sf.disable_adaptive_warp_error_thresh)
-    return (int64_t)(
-        ref_frame_error * erroradv_tr[cpi->sf.gm_sf.gm_erroradv_type] + 0.5);
-  else
-    return INT64_MAX;
-}
-
-static void compute_global_motion_for_ref_frame(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
-    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
-    MotionModel *params_by_motion, uint8_t *segment_map,
-    const int segment_map_w, const int segment_map_h,
-    const WarpedMotionParams *ref_params) {
-  ThreadData *const td = &cpi->td;
-  MACROBLOCK *const x = &td->mb;
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int i;
-  // clang-format off
-  static const double kIdentityParams[MAX_PARAMDIM - 1] = {
-     0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
-  };
-  // clang-format on
-  WarpedMotionParams tmp_wm_params;
-  const double *params_this_motion;
-  int inliers_by_motion[RANSAC_NUM_MOTIONS];
-  assert(ref_buf[frame] != NULL);
-  if (*num_frm_corners < 0) {
-    // compute interest points using FAST features
-    *num_frm_corners = av1_fast_corner_detect(
-        frm_buffer, cpi->source->y_width, cpi->source->y_height,
-        cpi->source->y_stride, frm_corners, MAX_CORNERS);
-  }
-  TransformationType model;
-
-  aom_clear_system_state();
-
-  // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
-  const int do_adaptive_gm_estimation = 0;
-
-  const int ref_frame_dist = get_relative_dist(
-      &cm->seq_params.order_hint_info, cm->current_frame.order_hint,
-      cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
-  const GlobalMotionEstimationType gm_estimation_type =
-      cm->seq_params.order_hint_info.enable_order_hint &&
-              abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
-          ? GLOBAL_MOTION_DISFLOW_BASED
-          : GLOBAL_MOTION_FEATURE_BASED;
-  for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
-    int64_t best_warp_error = INT64_MAX;
-    // Initially set all params to identity.
-    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
-      memcpy(params_by_motion[i].params, kIdentityParams,
-             (MAX_PARAMDIM - 1) * sizeof(*(params_by_motion[i].params)));
-      params_by_motion[i].num_inliers = 0;
-    }
-
-    av1_compute_global_motion(
-        model, frm_buffer, cpi->source->y_width, cpi->source->y_height,
-        cpi->source->y_stride, frm_corners, *num_frm_corners, ref_buf[frame],
-        cpi->common.seq_params.bit_depth, gm_estimation_type, inliers_by_motion,
-        params_by_motion, RANSAC_NUM_MOTIONS);
-    int64_t ref_frame_error = 0;
-    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
-      if (inliers_by_motion[i] == 0) continue;
-
-      params_this_motion = params_by_motion[i].params;
-      av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
-
-      if (tmp_wm_params.wmtype != IDENTITY) {
-        av1_compute_feature_segmentation_map(
-            segment_map, segment_map_w, segment_map_h,
-            params_by_motion[i].inliers, params_by_motion[i].num_inliers);
-
-        ref_frame_error = av1_segmented_frame_error(
-            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
-            ref_buf[frame]->y_stride, cpi->source->y_buffer,
-            cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride,
-            segment_map, segment_map_w);
-
-        int64_t erroradv_threshold =
-            calc_erroradv_threshold(cpi, ref_frame_error);
-
-        const int64_t warp_error = av1_refine_integerized_param(
-            &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
-            ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
-            ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
-            cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height,
-            cpi->source->y_stride, GM_REFINEMENT_COUNT, best_warp_error,
-            segment_map, segment_map_w, erroradv_threshold);
-
-        if (warp_error < best_warp_error) {
-          best_warp_error = warp_error;
-          // Save the wm_params modified by
-          // av1_refine_integerized_param() rather than motion index to
-          // avoid rerunning refine() below.
-          memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
-                 sizeof(WarpedMotionParams));
-        }
-      }
-    }
-    if (cm->global_motion[frame].wmtype <= AFFINE)
-      if (!av1_get_shear_params(&cm->global_motion[frame]))
-        cm->global_motion[frame] = default_warp_params;
-
-    if (cm->global_motion[frame].wmtype == TRANSLATION) {
-      cm->global_motion[frame].wmmat[0] =
-          convert_to_trans_prec(cm->features.allow_high_precision_mv,
-                                cm->global_motion[frame].wmmat[0]) *
-          GM_TRANS_ONLY_DECODE_FACTOR;
-      cm->global_motion[frame].wmmat[1] =
-          convert_to_trans_prec(cm->features.allow_high_precision_mv,
-                                cm->global_motion[frame].wmmat[1]) *
-          GM_TRANS_ONLY_DECODE_FACTOR;
-    }
-
-    if (cm->global_motion[frame].wmtype == IDENTITY) continue;
-
-    if (ref_frame_error == 0) continue;
-
-    // If the best error advantage found doesn't meet the threshold for
-    // this motion type, revert to IDENTITY.
-    if (!av1_is_enough_erroradvantage(
-            (double)best_warp_error / ref_frame_error,
-            gm_get_params_cost(&cm->global_motion[frame], ref_params,
-                               cm->features.allow_high_precision_mv),
-            cpi->sf.gm_sf.gm_erroradv_type)) {
-      cm->global_motion[frame] = default_warp_params;
-    }
-
-    if (cm->global_motion[frame].wmtype != IDENTITY) break;
-  }
-
-  aom_clear_system_state();
-}
-
-typedef struct {
-  int distance;
-  MV_REFERENCE_FRAME frame;
-} FrameDistPair;
-
-static INLINE void update_valid_ref_frames_for_gm(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
-    FrameDistPair *past_ref_frame, FrameDistPair *future_ref_frame,
-    int *num_past_ref_frames, int *num_future_ref_frames) {
-  AV1_COMMON *const cm = &cpi->common;
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
-  for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
-    const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME };
-    RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
-    const int ref_disabled =
-        !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
-    ref_buf[frame] = NULL;
-    cm->global_motion[frame] = default_warp_params;
-    // Skip global motion estimation for invalid ref frames
-    if (buf == NULL ||
-        (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) {
-      cpi->gm_info.params_cost[frame] = 0;
-      continue;
-    } else {
-      ref_buf[frame] = &buf->buf;
-    }
-
-    if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
-        ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
-        do_gm_search_logic(&cpi->sf, frame) &&
-        !prune_ref_by_selective_ref_frame(
-            cpi, NULL, ref_frame, cm->cur_frame->ref_display_order_hint) &&
-        !(cpi->sf.gm_sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
-      assert(ref_buf[frame] != NULL);
-      int relative_frame_dist = av1_encoder_get_relative_dist(
-          order_hint_info, buf->display_order_hint,
-          cm->cur_frame->display_order_hint);
-      // Populate past and future ref frames
-      if (relative_frame_dist <= 0) {
-        past_ref_frame[*num_past_ref_frames].distance =
-            abs(relative_frame_dist);
-        past_ref_frame[*num_past_ref_frames].frame = frame;
-        (*num_past_ref_frames)++;
-      } else {
-        future_ref_frame[*num_future_ref_frames].distance =
-            abs(relative_frame_dist);
-        future_ref_frame[*num_future_ref_frames].frame = frame;
-        (*num_future_ref_frames)++;
-      }
-    }
-  }
-}
-
-static INLINE void compute_gm_for_valid_ref_frames(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
-    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
-    MotionModel *params_by_motion, uint8_t *segment_map,
-    const int segment_map_w, const int segment_map_h) {
-  AV1_COMMON *const cm = &cpi->common;
-  GlobalMotionInfo *const gm_info = &cpi->gm_info;
-  const WarpedMotionParams *ref_params =
-      cm->prev_frame ? &cm->prev_frame->global_motion[frame]
-                     : &default_warp_params;
-
-  compute_global_motion_for_ref_frame(
-      cpi, ref_buf, frame, num_frm_corners, frm_corners, frm_buffer,
-      params_by_motion, segment_map, segment_map_w, segment_map_h, ref_params);
-
-  gm_info->params_cost[frame] =
-      gm_get_params_cost(&cm->global_motion[frame], ref_params,
-                         cm->features.allow_high_precision_mv) +
-      gm_info->type_cost[cm->global_motion[frame].wmtype] -
-      gm_info->type_cost[IDENTITY];
-}
-
-static int compare_distance(const void *a, const void *b) {
-  const int diff =
-      ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance;
-  if (diff > 0)
-    return 1;
-  else if (diff < 0)
-    return -1;
-  return 0;
-}
-
-static INLINE void compute_global_motion_for_references(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
-    FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames,
-    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
-    MotionModel *params_by_motion, uint8_t *segment_map,
-    const int segment_map_w, const int segment_map_h) {
-  AV1_COMMON *const cm = &cpi->common;
-  // Compute global motion w.r.t. reference frames starting from the nearest ref
-  // frame in a given direction
-  for (int frame = 0; frame < num_ref_frames; frame++) {
-    int ref_frame = reference_frame[frame].frame;
-    compute_gm_for_valid_ref_frames(cpi, ref_buf, ref_frame, num_frm_corners,
-                                    frm_corners, frm_buffer, params_by_motion,
-                                    segment_map, segment_map_w, segment_map_h);
-    // If global motion w.r.t. current ref frame is
-    // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
-    // the remaining ref frames in that direction. The below exit is disabled
-    // when ref frame distance w.r.t. current frame is zero. E.g.:
-    // source_alt_ref_frame w.r.t. ARF frames
-    if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
-        reference_frame[frame].distance != 0 &&
-        cm->global_motion[ref_frame].wmtype != ROTZOOM)
-      break;
-  }
-}
-
 static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) {
-  if (!cpi->sf.rt_sf.use_nonrd_pick_mode &&
-      cpi->sf.inter_sf.selective_ref_frame >= 2) {
+  if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
+       cpi->sf.inter_sf.disable_onesided_comp) &&
+      cpi->all_one_sided_refs) {
+    // Disable all compound references
+    cpi->prune_ref_frame_mask = (1 << MODE_CTX_REF_FRAMES) - (1 << REF_FRAMES);
+  } else if (!cpi->sf.rt_sf.use_nonrd_pick_mode &&
+             cpi->sf.inter_sf.selective_ref_frame >= 2) {
     AV1_COMMON *const cm = &cpi->common;
-    const OrderHintInfo *const order_hint_info =
-        &cm->seq_params.order_hint_info;
     const int cur_frame_display_order_hint =
         cm->current_frame.display_order_hint;
     unsigned int *ref_display_order_hint =
         cm->cur_frame->ref_display_order_hint;
     const int arf2_dist = av1_encoder_get_relative_dist(
-        order_hint_info, ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME],
+        ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME],
         cur_frame_display_order_hint);
     const int bwd_dist = av1_encoder_get_relative_dist(
-        order_hint_info, ref_display_order_hint[BWDREF_FRAME - LAST_FRAME],
+        ref_display_order_hint[BWDREF_FRAME - LAST_FRAME],
         cur_frame_display_order_hint);
 
     for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) {
@@ -5592,7 +1172,7 @@
         int ref_dist[2];
         for (int i = 0; i < 2; ++i) {
           ref_dist[i] = av1_encoder_get_relative_dist(
-              order_hint_info, ref_display_order_hint[rf[i] - LAST_FRAME],
+              ref_display_order_hint[rf[i] - LAST_FRAME],
               cur_frame_display_order_hint);
         }
 
@@ -5617,8 +1197,11 @@
   }
 }
 
-#define CHECK_PRECOMPUTED_REF_FRAME_MAP 0
-
+/*!\brief Encoder setup(only for the current frame), encoding, and recontruction
+ * for a single frame
+ *
+ * \ingroup high_level_algo
+ */
 static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
   ThreadData *const td = &cpi->td;
   MACROBLOCK *const x = &td->mb;
@@ -5627,9 +1210,12 @@
   FeatureFlags *const features = &cm->features;
   MACROBLOCKD *const xd = &x->e_mbd;
   RD_COUNTS *const rdc = &cpi->td.rd_counts;
-  GlobalMotionInfo *const gm_info = &cpi->gm_info;
   FrameProbInfo *const frame_probs = &cpi->frame_probs;
   IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const DELTAQ_MODE deltaq_mode = oxcf->q_cfg.deltaq_mode;
   int i;
 
   if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
@@ -5638,12 +1224,6 @@
 
   set_mi_offsets(mi_params, xd, 0, 0);
 
-#if CONFIG_AV1_HIGHBITDEPTH
-  x->fwd_txfm4x4 = aom_fdct4x4;
-#else
-  x->fwd_txfm4x4 = aom_fdct4x4_lp;
-#endif
-
   av1_zero(*td->counts);
   av1_zero(rdc->comp_pred_diff);
   av1_zero(rdc->tx_type_used);
@@ -5657,7 +1237,7 @@
     features->allow_intrabc = 0;
   }
 
-  features->allow_intrabc &= (cpi->oxcf.enable_intrabc);
+  features->allow_intrabc &= (oxcf->kf_cfg.enable_intrabc);
 
   if (features->allow_warped_motion &&
       cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
@@ -5747,39 +1327,43 @@
 
   // Fix delta q resolution for the moment
   cm->delta_q_info.delta_q_res = 0;
-  if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE)
-    cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
-  else if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL)
-    cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
-  // Set delta_q_present_flag before it is used for the first time
-  cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
-  cm->delta_q_info.delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
+  if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) {
+    if (deltaq_mode == DELTA_Q_OBJECTIVE)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
+    else if (deltaq_mode == DELTA_Q_PERCEPTUAL)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+    // Set delta_q_present_flag before it is used for the first time
+    cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
+    cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q;
 
-  // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q is used
-  // for ineligible frames. That effectively will turn off row_mt usage.
-  // Note objective delta_q and tpl eligible frames are only altref frames
-  // currently.
-  if (cm->delta_q_info.delta_q_present_flag) {
-    if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE &&
-        !is_frame_tpl_eligible(cpi))
-      cm->delta_q_info.delta_q_present_flag = 0;
+    // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q
+    // is used for ineligible frames. That effectively will turn off row_mt
+    // usage. Note objective delta_q and tpl eligible frames are only altref
+    // frames currently.
+    const GF_GROUP *gf_group = &cpi->gf_group;
+    if (cm->delta_q_info.delta_q_present_flag) {
+      if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+          !is_frame_tpl_eligible(gf_group, gf_group->index))
+        cm->delta_q_info.delta_q_present_flag = 0;
+    }
+
+    // Reset delta_q_used flag
+    cpi->deltaq_used = 0;
+
+    cm->delta_q_info.delta_lf_present_flag =
+        cm->delta_q_info.delta_q_present_flag &&
+        oxcf->tool_cfg.enable_deltalf_mode;
+    cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+
+    // update delta_q_present_flag and delta_lf_present_flag based on
+    // base_qindex
+    cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
+    cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
   }
 
-  // Reset delta_q_used flag
-  cpi->deltaq_used = 0;
-
-  cm->delta_q_info.delta_lf_present_flag =
-      cm->delta_q_info.delta_q_present_flag && cpi->oxcf.deltalf_mode;
-  cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
-
-  // update delta_q_present_flag and delta_lf_present_flag based on
-  // base_qindex
-  cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
-  cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
-
   av1_frame_init_quantizer(cpi);
   av1_initialize_rd_consts(cpi);
-  av1_initialize_me_consts(cpi, x, quant_params->base_qindex);
+  av1_set_sad_per_bit(cpi, &x->mv_costs, quant_params->base_qindex);
 
   init_encode_frame_mb_context(cpi);
   set_default_interp_skip_flags(cm, &cpi->interp_search_flags);
@@ -5804,95 +1388,20 @@
   // Figure out which ref frames can be skipped at frame level.
   setup_prune_ref_frame_mask(cpi);
 
-  x->txb_split_count = 0;
+  x->txfm_search_info.txb_split_count = 0;
 #if CONFIG_SPEED_STATS
-  x->tx_search_count = 0;
+  x->txfm_search_info.tx_search_count = 0;
 #endif  // CONFIG_SPEED_STATS
 
+#if !CONFIG_REALTIME_ONLY
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_compute_global_motion_time);
 #endif
-  av1_zero(rdc->global_motion_used);
-  av1_zero(gm_info->params_cost);
-  if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
-      cpi->oxcf.enable_global_motion && !gm_info->search_done) {
-    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
-    MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
-    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
-      memset(&params_by_motion[m], 0, sizeof(params_by_motion[m]));
-      params_by_motion[m].inliers =
-          aom_malloc(sizeof(*(params_by_motion[m].inliers)) * 2 * MAX_CORNERS);
-    }
-
-    int num_frm_corners = -1;
-    int frm_corners[2 * MAX_CORNERS];
-    unsigned char *frm_buffer = cpi->source->y_buffer;
-    if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
-      // The frame buffer is 16-bit, so we need to convert to 8 bits for the
-      // following code. We cache the result until the frame is released.
-      frm_buffer =
-          av1_downconvert_frame(cpi->source, cpi->common.seq_params.bit_depth);
-    }
-    const int segment_map_w =
-        (cpi->source->y_width + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
-    const int segment_map_h =
-        (cpi->source->y_height + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
-
-    uint8_t *segment_map =
-        aom_malloc(sizeof(*segment_map) * segment_map_w * segment_map_h);
-    memset(segment_map, 0,
-           sizeof(*segment_map) * segment_map_w * segment_map_h);
-
-    FrameDistPair future_ref_frame[REF_FRAMES - 1] = {
-      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
-      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
-      { -1, NONE_FRAME }
-    };
-    FrameDistPair past_ref_frame[REF_FRAMES - 1] = {
-      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
-      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
-      { -1, NONE_FRAME }
-    };
-    int num_past_ref_frames = 0;
-    int num_future_ref_frames = 0;
-    // Populate ref_buf for valid ref frames in global motion
-    update_valid_ref_frames_for_gm(cpi, ref_buf, past_ref_frame,
-                                   future_ref_frame, &num_past_ref_frames,
-                                   &num_future_ref_frames);
-
-    // Sort the ref frames in the ascending order of their distance from the
-    // current frame
-    qsort(past_ref_frame, num_past_ref_frames, sizeof(past_ref_frame[0]),
-          compare_distance);
-    qsort(future_ref_frame, num_future_ref_frames, sizeof(future_ref_frame[0]),
-          compare_distance);
-
-    // Compute global motion w.r.t. past reference frames
-    if (num_past_ref_frames > 0)
-      compute_global_motion_for_references(
-          cpi, ref_buf, past_ref_frame, num_past_ref_frames, &num_frm_corners,
-          frm_corners, frm_buffer, params_by_motion, segment_map, segment_map_w,
-          segment_map_h);
-
-    // Compute global motion w.r.t. future reference frames
-    if (num_future_ref_frames > 0)
-      compute_global_motion_for_references(
-          cpi, ref_buf, future_ref_frame, num_future_ref_frames,
-          &num_frm_corners, frm_corners, frm_buffer, params_by_motion,
-          segment_map, segment_map_w, segment_map_h);
-
-    aom_free(segment_map);
-
-    gm_info->search_done = 1;
-    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
-      aom_free(params_by_motion[m].inliers);
-    }
-  }
-  memcpy(cm->cur_frame->global_motion, cm->global_motion,
-         REF_FRAMES * sizeof(WarpedMotionParams));
+  av1_compute_global_motion_facade(cpi);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_compute_global_motion_time);
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_setup_motion_field_time);
@@ -5905,17 +1414,17 @@
   cm->current_frame.skip_mode_info.skip_mode_flag =
       check_skip_mode_enabled(cpi);
 
-  cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy;
-  cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy;
-  cpi->row_mt = 0;
+  enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+  enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+  mt_info->row_mt_enabled = 0;
 
-  if (cpi->oxcf.row_mt && (cpi->oxcf.max_threads > 1)) {
-    cpi->row_mt = 1;
-    cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read;
-    cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write;
+  if (oxcf->row_mt && (mt_info->num_workers > 1)) {
+    mt_info->row_mt_enabled = 1;
+    enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+    enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
     av1_encode_tiles_row_mt(cpi);
   } else {
-    if (AOMMIN(cpi->oxcf.max_threads, cm->tiles.cols * cm->tiles.rows) > 1)
+    if (AOMMIN(mt_info->num_workers, cm->tiles.cols * cm->tiles.rows) > 1)
       av1_encode_tiles_mt(cpi);
     else
       encode_tiles(cpi);
@@ -5940,7 +1449,7 @@
           : DEFAULT_EVAL;
   const TX_SIZE_SEARCH_METHOD tx_search_type =
       cpi->winner_mode_params.tx_size_search_methods[eval_type];
-  assert(cpi->oxcf.enable_tx64 || tx_search_type != USE_LARGESTALL);
+  assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL);
   features->tx_mode = select_tx_mode(cm, tx_search_type);
 
   if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
@@ -6020,13 +1529,19 @@
     }
   }
 
-  if ((!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
-       !cpi->sf.rt_sf.use_nonrd_pick_mode) ||
-      hash_table_created) {
+  if (hash_table_created) {
     av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
   }
 }
 
+/*!\brief Setup reference frame buffers and encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    cpi    Top-level encoder structure
+ */
 void av1_encode_frame(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
@@ -6034,7 +1549,7 @@
   const int num_planes = av1_num_planes(cm);
   // Indicates whether or not to use a default reduced set for ext-tx
   // rather than the potential full set of 16 transforms
-  features->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;
+  features->reduced_tx_set_used = cpi->oxcf.txfm_cfg.reduced_tx_type_set;
 
   // Make sure segment_id is no larger than last_active_segid.
   if (cm->seg.enabled && cm->seg.update_map) {
@@ -6051,47 +1566,13 @@
   }
 
   av1_setup_frame_buf_refs(cm);
-  enforce_max_ref_frames(cpi, &cpi->ref_frame_flags);
-  set_rel_frame_dist(cpi);
+  enforce_max_ref_frames(cpi, &cpi->ref_frame_flags,
+                         cm->cur_frame->ref_display_order_hint,
+                         cm->current_frame.display_order_hint);
+  set_rel_frame_dist(&cpi->common, &cpi->ref_frame_dist_info,
+                     cpi->ref_frame_flags);
   av1_setup_frame_sign_bias(cm);
 
-#if CHECK_PRECOMPUTED_REF_FRAME_MAP
-  GF_GROUP *gf_group = &cpi->gf_group;
-  // TODO(yuec): The check is disabled on OVERLAY frames for now, because info
-  // in cpi->gf_group has been refreshed for the next GOP when the check is
-  // performed for OVERLAY frames. Since we have not support inter-GOP ref
-  // frame map computation, the precomputed ref map for an OVERLAY frame is all
-  // -1 at this point (although it is meaning before gf_group is refreshed).
-  if (!frame_is_intra_only(cm) && gf_group->index != 0) {
-    const RefCntBuffer *const golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
-
-    if (golden_buf) {
-      const int golden_order_hint = golden_buf->order_hint;
-
-      for (int ref = LAST_FRAME; ref < EXTREF_FRAME; ++ref) {
-        const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
-        const int ref_disp_idx_precomputed =
-            gf_group->ref_frame_disp_idx[gf_group->index][ref - LAST_FRAME];
-
-        (void)ref_disp_idx_precomputed;
-
-        if (buf != NULL) {
-          const int ref_disp_idx =
-              get_relative_dist(&cm->seq_params.order_hint_info,
-                                buf->order_hint, golden_order_hint);
-
-          if (ref_disp_idx >= 0)
-            assert(ref_disp_idx == ref_disp_idx_precomputed);
-          else
-            assert(ref_disp_idx_precomputed == -1);
-        } else {
-          assert(ref_disp_idx_precomputed == -1);
-        }
-      }
-    }
-  }
-#endif
-
 #if CONFIG_MISMATCH_DEBUG
   mismatch_reset_frame(num_planes);
 #else
@@ -6138,338 +1619,10 @@
 
     if (!cm->tiles.large_scale) {
       if (features->tx_mode == TX_MODE_SELECT &&
-          cpi->td.mb.txb_split_count == 0)
+          cpi->td.mb.txfm_search_info.txb_split_count == 0)
         features->tx_mode = TX_MODE_LARGEST;
     }
   } else {
     encode_frame_internal(cpi);
   }
 }
-
-static AOM_INLINE void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
-                                         FRAME_COUNTS *counts, TX_SIZE tx_size,
-                                         int depth, int blk_row, int blk_col,
-                                         uint8_t allow_update_cdf) {
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
-  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
-                                   xd->left_txfm_context + blk_row,
-                                   mbmi->sb_type, tx_size);
-  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
-  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
-
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-  assert(tx_size > TX_4X4);
-
-  if (depth == MAX_VARTX_DEPTH) {
-    // Don't add to counts in this case
-    mbmi->tx_size = tx_size;
-    txfm_partition_update(xd->above_txfm_context + blk_col,
-                          xd->left_txfm_context + blk_row, tx_size, tx_size);
-    return;
-  }
-
-  if (tx_size == plane_tx_size) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->txfm_partition[ctx][0];
-#endif
-    if (allow_update_cdf)
-      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
-    mbmi->tx_size = tx_size;
-    txfm_partition_update(xd->above_txfm_context + blk_col,
-                          xd->left_txfm_context + blk_row, tx_size, tx_size);
-  } else {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsw = tx_size_wide_unit[sub_txs];
-    const int bsh = tx_size_high_unit[sub_txs];
-
-#if CONFIG_ENTROPY_STATS
-    ++counts->txfm_partition[ctx][1];
-#endif
-    if (allow_update_cdf)
-      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
-    ++x->txb_split_count;
-
-    if (sub_txs == TX_4X4) {
-      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
-      mbmi->tx_size = TX_4X4;
-      txfm_partition_update(xd->above_txfm_context + blk_col,
-                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
-      return;
-    }
-
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        int offsetr = row;
-        int offsetc = col;
-
-        update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
-                          blk_col + offsetc, allow_update_cdf);
-      }
-    }
-  }
-}
-
-static AOM_INLINE void tx_partition_count_update(const AV1_COMMON *const cm,
-                                                 MACROBLOCK *x,
-                                                 BLOCK_SIZE plane_bsize,
-                                                 FRAME_COUNTS *td_counts,
-                                                 uint8_t allow_update_cdf) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int mi_width = mi_size_wide[plane_bsize];
-  const int mi_height = mi_size_high[plane_bsize];
-  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
-  const int bh = tx_size_high_unit[max_tx_size];
-  const int bw = tx_size_wide_unit[max_tx_size];
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
-
-  for (int idy = 0; idy < mi_height; idy += bh) {
-    for (int idx = 0; idx < mi_width; idx += bw) {
-      update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
-                        allow_update_cdf);
-    }
-  }
-}
-
-static AOM_INLINE void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size,
-                                        int blk_row, int blk_col) {
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
-  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
-  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
-
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-
-  if (tx_size == plane_tx_size) {
-    mbmi->tx_size = tx_size;
-    txfm_partition_update(xd->above_txfm_context + blk_col,
-                          xd->left_txfm_context + blk_row, tx_size, tx_size);
-
-  } else {
-    if (tx_size == TX_8X8) {
-      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
-      mbmi->tx_size = TX_4X4;
-      txfm_partition_update(xd->above_txfm_context + blk_col,
-                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
-      return;
-    }
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsw = tx_size_wide_unit[sub_txs];
-    const int bsh = tx_size_high_unit[sub_txs];
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        const int offsetr = blk_row + row;
-        const int offsetc = blk_col + col;
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-        set_txfm_context(xd, sub_txs, offsetr, offsetc);
-      }
-    }
-  }
-}
-
-static AOM_INLINE void tx_partition_set_contexts(const AV1_COMMON *const cm,
-                                                 MACROBLOCKD *xd,
-                                                 BLOCK_SIZE plane_bsize) {
-  const int mi_width = mi_size_wide[plane_bsize];
-  const int mi_height = mi_size_high[plane_bsize];
-  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
-  const int bh = tx_size_high_unit[max_tx_size];
-  const int bw = tx_size_wide_unit[max_tx_size];
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
-
-  for (int idy = 0; idy < mi_height; idy += bh) {
-    for (int idx = 0; idx < mi_width; idx += bw) {
-      set_txfm_context(xd, max_tx_size, idy, idx);
-    }
-  }
-}
-
-static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
-                                         TileDataEnc *tile_data, ThreadData *td,
-                                         TOKENEXTRA **t, RUN_TYPE dry_run,
-                                         BLOCK_SIZE bsize, int *rate) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO **mi_4x4 = xd->mi;
-  MB_MODE_INFO *mbmi = mi_4x4[0];
-  const int seg_skip =
-      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  const int mis = cm->mi_params.mi_stride;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int is_inter = is_inter_block(mbmi);
-
-  // Initialize tx_mode and tx_size_search_method
-  set_tx_size_search_method(
-      cm, &cpi->winner_mode_params, x,
-      cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
-
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-  if (!is_inter) {
-    xd->cfl.store_y = store_cfl_required(cm, xd);
-    mbmi->skip = 1;
-    for (int plane = 0; plane < num_planes; ++plane) {
-      av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
-                                   cpi->optimize_seg_arr[mbmi->segment_id]);
-    }
-
-    // If there is at least one lossless segment, force the skip for intra
-    // block to be 0, in order to avoid the segment_id to be changed by in
-    // write_segment_id().
-    if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
-        cpi->enc_seg.has_lossless_segment)
-      mbmi->skip = 0;
-
-    xd->cfl.store_y = 0;
-    if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
-      for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
-        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
-          if (!dry_run) {
-            av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
-                                   PALETTE_MAP, tile_data->allow_update_cdf,
-                                   td->counts);
-          } else if (dry_run == DRY_RUN_COSTCOEFFS) {
-            rate +=
-                av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
-          }
-        }
-      }
-    }
-
-    av1_update_txb_context(cpi, td, dry_run, bsize,
-                           tile_data->allow_update_cdf);
-  } else {
-    int ref;
-    const int is_compound = has_second_ref(mbmi);
-
-    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const YV12_BUFFER_CONFIG *cfg =
-          get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
-      assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
-      av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
-                           xd->block_ref_scale_factors[ref], num_planes);
-    }
-    int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd) ? 1 : 0;
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                  start_plane, av1_num_planes(cm) - 1);
-    if (mbmi->motion_mode == OBMC_CAUSAL) {
-      assert(cpi->oxcf.enable_obmc == 1);
-      av1_build_obmc_inter_predictors_sb(cm, xd);
-    }
-
-#if CONFIG_MISMATCH_DEBUG
-    if (dry_run == OUTPUT_ENABLED) {
-      for (int plane = 0; plane < num_planes; ++plane) {
-        const struct macroblockd_plane *pd = &xd->plane[plane];
-        int pixel_c, pixel_r;
-        mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
-                        pd->subsampling_x, pd->subsampling_y);
-        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                                 pd->subsampling_y))
-          continue;
-        mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
-                                  cm->current_frame.order_hint, plane, pixel_c,
-                                  pixel_r, pd->width, pd->height,
-                                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
-      }
-    }
-#else
-    (void)num_planes;
-#endif
-
-    av1_encode_sb(cpi, x, bsize, dry_run);
-    av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
-                          tile_data->allow_update_cdf);
-  }
-
-  if (!dry_run) {
-    if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1;
-    if (x->tx_mode_search_type == TX_MODE_SELECT &&
-        !xd->lossless[mbmi->segment_id] && mbmi->sb_type > BLOCK_4X4 &&
-        !(is_inter && (mbmi->skip || seg_skip))) {
-      if (is_inter) {
-        tx_partition_count_update(cm, x, bsize, td->counts,
-                                  tile_data->allow_update_cdf);
-      } else {
-        if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
-          ++x->txb_split_count;
-        if (block_signals_txsize(bsize)) {
-          const int tx_size_ctx = get_tx_size_context(xd);
-          const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
-          const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
-          const int max_depths = bsize_to_max_depth(bsize);
-
-          if (tile_data->allow_update_cdf)
-            update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
-                       depth, max_depths + 1);
-#if CONFIG_ENTROPY_STATS
-          ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
-#endif
-        }
-      }
-      assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
-    } else {
-      int i, j;
-      TX_SIZE intra_tx_size;
-      // The new intra coding scheme requires no change of transform size
-      if (is_inter) {
-        if (xd->lossless[mbmi->segment_id]) {
-          intra_tx_size = TX_4X4;
-        } else {
-          intra_tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
-        }
-      } else {
-        intra_tx_size = mbmi->tx_size;
-      }
-
-      for (j = 0; j < mi_height; j++)
-        for (i = 0; i < mi_width; i++)
-          if (mi_col + i < cm->mi_params.mi_cols &&
-              mi_row + j < cm->mi_params.mi_rows)
-            mi_4x4[mis * j + i]->tx_size = intra_tx_size;
-
-      if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count;
-    }
-  }
-
-  if (x->tx_mode_search_type == TX_MODE_SELECT &&
-      block_signals_txsize(mbmi->sb_type) && is_inter &&
-      !(mbmi->skip || seg_skip) && !xd->lossless[mbmi->segment_id]) {
-    if (dry_run) tx_partition_set_contexts(cm, xd, bsize);
-  } else {
-    TX_SIZE tx_size = mbmi->tx_size;
-    // The new intra coding scheme requires no change of transform size
-    if (is_inter) {
-      if (xd->lossless[mbmi->segment_id]) {
-        tx_size = TX_4X4;
-      } else {
-        tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
-      }
-    } else {
-      tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
-    }
-    mbmi->tx_size = tx_size;
-    set_txfm_ctxs(tx_size, xd->width, xd->height,
-                  (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
-  }
-
-  if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) {
-    cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
-  }
-}
diff --git a/av1/encoder/encodeframe.h b/av1/encoder/encodeframe.h
index e4c4841..36b38d5 100644
--- a/av1/encoder/encodeframe.h
+++ b/av1/encoder/encodeframe.h
@@ -16,6 +16,8 @@
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
 
+#include "av1/encoder/global_motion.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -41,7 +43,6 @@
                      int tile_col);
 void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
                        int tile_row, int tile_col, int mi_row);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
new file mode 100644
index 0000000..612f06d
--- /dev/null
+++ b/av1/encoder/encodeframe_utils.c
@@ -0,0 +1,1387 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/rdopt.h"
+
+static AOM_INLINE int set_deltaq_rdmult(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *const x) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+  return av1_compute_rd_mult(cpi, quant_params->base_qindex + x->delta_qindex +
+                                      quant_params->y_dc_delta_q);
+}
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, MvCosts *const mv_costs,
+                         const BLOCK_SIZE bsize, const int mi_row,
+                         const int mi_col, int *const rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  int row, col;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 0.0;
+
+  assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM);
+
+  aom_clear_system_state();
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col / num_mi_h;
+         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->ssim_rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+  *rdmult = AOMMAX(*rdmult, 0);
+  av1_set_error_per_bit(mv_costs, *rdmult);
+  aom_clear_system_state();
+}
+
+// Return the end column for the current superblock, in unit of TPL blocks.
+static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col,
+                                         int num_mi_w) {
+  // Find the start column of this superblock.
+  const int sb_mi_col_start = (mi_col >> cm->seq_params.mib_size_log2)
+                              << cm->seq_params.mib_size_log2;
+  // Same but in superres upscaled dimension.
+  const int sb_mi_col_start_sr =
+      coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator);
+  // Width of this superblock in mi units.
+  const int sb_mi_width = mi_size_wide[cm->seq_params.sb_size];
+  // Same but in superres upscaled dimension.
+  const int sb_mi_width_sr =
+      coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator);
+  // Superblock end in mi units.
+  const int sb_mi_end = sb_mi_col_start_sr + sb_mi_width_sr;
+  // Superblock end in TPL units.
+  return (sb_mi_end + num_mi_w - 1) / num_mi_w;
+}
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, const int mi_row,
+                            const int mi_col, int orig_rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int tpl_idx = cpi->gf_group.index;
+  const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
+  const int deltaq_rdmult = set_deltaq_rdmult(cpi, x);
+  if (tpl_frame->is_valid == 0) return deltaq_rdmult;
+  if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return deltaq_rdmult;
+  if (tpl_idx >= MAX_TPL_FRAME_IDX) return deltaq_rdmult;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int block_mi_width_sr =
+      coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator);
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (block_mi_width_sr + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  // This is required because the end col of superblock may be off by 1 in case
+  // of superres.
+  const int sb_bcol_end = get_superblock_tpl_column_end(cm, mi_col, num_mi_w);
+  int row, col;
+  double base_block_count = 0.0;
+  double geom_mean_of_scale = 0.0;
+  aom_clear_system_state();
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col_sr / num_mi_h;
+         col < num_cols && col < mi_col_sr / num_mi_h + num_bcols &&
+         col < sb_bcol_end;
+         ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]);
+      base_block_count += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count);
+  int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
+  rdmult = AOMMAX(rdmult, 0);
+  av1_set_error_per_bit(&x->mv_costs, rdmult);
+  aom_clear_system_state();
+  if (bsize == cm->seq_params.sb_size) {
+    const int rdmult_sb = set_deltaq_rdmult(cpi, x);
+    assert(rdmult_sb == rdmult);
+    (void)rdmult_sb;
+  }
+  return rdmult;
+}
+
+static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
+                                                const MACROBLOCKD *xd,
+                                                const MB_MODE_INFO *mbmi) {
+  int dir;
+  for (dir = 0; dir < 2; ++dir) {
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+    ++counts->switchable_interp[ctx][filter];
+  }
+}
+
+static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
+                          const TX_MODE tx_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  if (xd->lossless[mbmi->segment_id]) {
+    mbmi->tx_size = TX_4X4;
+  } else if (tx_mode != TX_MODE_SELECT) {
+    mbmi->tx_size = tx_size_from_tx_mode(mbmi->bsize, tx_mode);
+  } else {
+    BLOCK_SIZE bsize = mbmi->bsize;
+    TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
+    mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size);
+  }
+  if (is_inter_block(mbmi)) {
+    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+  }
+  const int stride = xd->tx_type_map_stride;
+  const int bw = mi_size_wide[mbmi->bsize];
+  for (int row = 0; row < mi_size_high[mbmi->bsize]; ++row) {
+    memset(xd->tx_type_map + row * stride, DCT_DCT,
+           bw * sizeof(xd->tx_type_map[0]));
+  }
+  av1_zero(txfm_info->blk_skip);
+  txfm_info->skip_txfm = 0;
+}
+
+// This function will copy the best reference mode information from
+// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT.
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+    MB_MODE_INFO_EXT *mbmi_ext,
+    const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) {
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+         sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+         sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+         sizeof(mbmi_ext->global_mvs));
+}
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+                      const PICK_MODE_CONTEXT *const ctx, int mi_row,
+                      int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+  int i, x_idx, y;
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  RD_COUNTS *const rdc = &td->rd_counts;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const MB_MODE_INFO *const mi = &ctx->mic;
+  MB_MODE_INFO *const mi_addr = xd->mi[0];
+  const struct segmentation *const seg = &cm->seg;
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int bw = mi_size_wide[mi->bsize];
+  const int bh = mi_size_high[mi->bsize];
+  const int mis = mi_params->mi_stride;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+  assert(mi->bsize == bsize);
+
+  *mi_addr = *mi;
+  copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &ctx->mbmi_ext_best,
+                                  av1_ref_frame_type(ctx->mic.ref_frame));
+
+  memcpy(txfm_info->blk_skip, ctx->blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+
+  txfm_info->skip_txfm = ctx->rd_stats.skip_txfm;
+
+  xd->tx_type_map = ctx->tx_type_map;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+  // If not dry_run, copy the transform type data into the frame level buffer.
+  // Encoder will fetch tx types when writing bitstream.
+  if (!dry_run) {
+    const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+    uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx;
+    const int mi_stride = mi_params->mi_stride;
+    for (int blk_row = 0; blk_row < bh; ++blk_row) {
+      av1_copy_array(tx_type_map + blk_row * mi_stride,
+                     xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw);
+    }
+    xd->tx_type_map = tx_type_map;
+    xd->tx_type_map_stride = mi_stride;
+  }
+
+  // If segmentation in use
+  if (seg->enabled) {
+    // For in frame complexity AQ copy the segment id from the segment map.
+    if (cpi->oxcf.q_cfg.aq_mode == COMPLEXITY_AQ) {
+      const uint8_t *const map =
+          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+      mi_addr->segment_id =
+          map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
+      reset_tx_size(x, mi_addr, x->txfm_search_params.tx_mode_search_type);
+    }
+    // Else for cyclic refresh mode update the segment map, set the segment id
+    // and then update the quantizer.
+    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+      av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
+                                        ctx->rd_stats.rate, ctx->rd_stats.dist,
+                                        txfm_info->skip_txfm);
+    }
+    if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+      mi_addr->uv_mode = UV_DC_PRED;
+  }
+
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  // Restore the coding context of the MB to that that was in place
+  // when the mode was picked for it
+  for (y = 0; y < mi_height; y++) {
+    for (x_idx = 0; x_idx < mi_width; x_idx++) {
+      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+          (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+        xd->mi[x_idx + y * mis] = mi_addr;
+      }
+    }
+  }
+
+  if (cpi->oxcf.q_cfg.aq_mode)
+    av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
+
+  if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+  {
+    unsigned int *const mode_chosen_counts =
+        (unsigned int *)cpi->mode_chosen_counts;  // Cast const away.
+    if (frame_is_intra_only(cm)) {
+      static const int kf_mode_index[] = {
+        THR_DC /*DC_PRED*/,
+        THR_V_PRED /*V_PRED*/,
+        THR_H_PRED /*H_PRED*/,
+        THR_D45_PRED /*D45_PRED*/,
+        THR_D135_PRED /*D135_PRED*/,
+        THR_D113_PRED /*D113_PRED*/,
+        THR_D157_PRED /*D157_PRED*/,
+        THR_D203_PRED /*D203_PRED*/,
+        THR_D67_PRED /*D67_PRED*/,
+        THR_SMOOTH,   /*SMOOTH_PRED*/
+        THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+        THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+        THR_PAETH /*PAETH_PRED*/,
+      };
+      ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+    } else {
+      // Note how often each mode chosen as best
+      ++mode_chosen_counts[ctx->best_mode_index];
+    }
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
+    if (is_inter_block(mi_addr)) {
+      // TODO(sarahparker): global motion stats need to be handled per-tile
+      // to be compatible with tile-based threading.
+      update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
+    }
+
+    if (cm->features.interp_filter == SWITCHABLE &&
+        mi_addr->motion_mode != WARPED_CAUSAL &&
+        !is_nontrans_global_motion(xd, xd->mi[0])) {
+      update_filter_type_count(td->counts, xd, mi_addr);
+    }
+
+    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+  }
+
+  const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
+  if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
+    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+                                 PREDICTION_MODE mode, int16_t mode_context) {
+  (void)counts;
+
+  int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+  if (mode == NEWMV) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->newmv_mode[mode_ctx][0];
+#endif
+    update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
+    return;
+  }
+
+#if CONFIG_ENTROPY_STATS
+  ++counts->newmv_mode[mode_ctx][1];
+#endif
+  update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
+
+  mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (mode == GLOBALMV) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->zeromv_mode[mode_ctx][0];
+#endif
+    update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
+    return;
+  }
+
+#if CONFIG_ENTROPY_STATS
+  ++counts->zeromv_mode[mode_ctx][1];
+#endif
+  update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
+
+  mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+  ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+#endif
+  update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
+}
+
+static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                               FRAME_COUNTS *counts) {
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+  (void)counts;
+
+  if (mbmi->mode == DC_PRED) {
+    const int n = pmi->palette_size[0];
+    const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
+#endif
+    update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
+               n > 0, 2);
+    if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+      update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
+                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+    }
+  }
+
+  if (mbmi->uv_mode == UV_DC_PRED) {
+    const int n = pmi->palette_size[1];
+    const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
+#endif
+    update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
+
+    if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+      update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
+                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+    }
+  }
+}
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+                         MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                         const MB_MODE_INFO *above_mi,
+                         const MB_MODE_INFO *left_mi, const int intraonly) {
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+  const PREDICTION_MODE y_mode = mbmi->mode;
+  (void)counts;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+
+  if (intraonly) {
+#if CONFIG_ENTROPY_STATS
+    const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+    const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+    const int above_ctx = intra_mode_context[above];
+    const int left_ctx = intra_mode_context[left];
+    ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
+#endif  // CONFIG_ENTROPY_STATS
+    update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+  } else {
+#if CONFIG_ENTROPY_STATS
+    ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+#endif  // CONFIG_ENTROPY_STATS
+    update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+  }
+
+  if (av1_filter_intra_allowed(cm, mbmi)) {
+    const int use_filter_intra_mode =
+        mbmi->filter_intra_mode_info.use_filter_intra;
+#if CONFIG_ENTROPY_STATS
+    ++counts->filter_intra[mbmi->bsize][use_filter_intra_mode];
+    if (use_filter_intra_mode) {
+      ++counts
+            ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
+    }
+#endif  // CONFIG_ENTROPY_STATS
+    update_cdf(fc->filter_intra_cdfs[mbmi->bsize], use_filter_intra_mode, 2);
+    if (use_filter_intra_mode) {
+      update_cdf(fc->filter_intra_mode_cdf,
+                 mbmi->filter_intra_mode_info.filter_intra_mode,
+                 FILTER_INTRA_MODES);
+    }
+  }
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->angle_delta[mbmi->mode - V_PRED]
+                         [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
+#endif
+    update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+               mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+               2 * MAX_ANGLE_DELTA + 1);
+  }
+
+  if (!xd->is_chroma_ref) return;
+
+  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+  const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+#if CONFIG_ENTROPY_STATS
+  ++counts->uv_mode[cfl_allowed][y_mode][uv_mode];
+#endif  // CONFIG_ENTROPY_STATS
+  update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+             UV_INTRA_MODES - !cfl_allowed);
+  if (uv_mode == UV_CFL_PRED) {
+    const int8_t joint_sign = mbmi->cfl_alpha_signs;
+    const uint8_t idx = mbmi->cfl_alpha_idx;
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->cfl_sign[joint_sign];
+#endif
+    update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+    if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+      aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+      ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
+#endif
+      update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+    }
+    if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+      aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+      ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
+#endif
+      update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
+      av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->angle_delta[uv_mode - UV_V_PRED]
+                         [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
+#endif
+    update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
+               mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+               2 * MAX_ANGLE_DELTA + 1);
+  }
+  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+    update_palette_cdf(xd, mbmi, counts);
+  }
+}
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         const int num_planes) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int p;
+  const int num_4x4_blocks_wide = mi_size_wide[bsize];
+  const int num_4x4_blocks_high = mi_size_high[bsize];
+  int mi_width = mi_size_wide[bsize];
+  int mi_height = mi_size_high[bsize];
+  for (p = 0; p < num_planes; p++) {
+    int tx_col = mi_col;
+    int tx_row = mi_row & MAX_MIB_MASK;
+    memcpy(
+        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+        ctx->a + num_4x4_blocks_wide * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+            xd->plane[p].subsampling_x);
+    memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+           ctx->l + num_4x4_blocks_high * p,
+           (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+               xd->plane[p].subsampling_y);
+  }
+  memcpy(xd->above_partition_context + mi_col, ctx->sa,
+         sizeof(*xd->above_partition_context) * mi_width);
+  memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl,
+         sizeof(xd->left_partition_context[0]) * mi_height);
+  xd->above_txfm_context = ctx->p_ta;
+  xd->left_txfm_context = ctx->p_tl;
+  memcpy(xd->above_txfm_context, ctx->ta,
+         sizeof(*xd->above_txfm_context) * mi_width);
+  memcpy(xd->left_txfm_context, ctx->tl,
+         sizeof(*xd->left_txfm_context) * mi_height);
+}
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                      int mi_row, int mi_col, BLOCK_SIZE bsize,
+                      const int num_planes) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  int p;
+  int mi_width = mi_size_wide[bsize];
+  int mi_height = mi_size_high[bsize];
+
+  // buffer the above/left context information of the block in search.
+  for (p = 0; p < num_planes; ++p) {
+    int tx_col = mi_col;
+    int tx_row = mi_row & MAX_MIB_MASK;
+    memcpy(
+        ctx->a + mi_width * p,
+        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+        (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x);
+    memcpy(ctx->l + mi_height * p,
+           xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+           (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y);
+  }
+  memcpy(ctx->sa, xd->above_partition_context + mi_col,
+         sizeof(*xd->above_partition_context) * mi_width);
+  memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK),
+         sizeof(xd->left_partition_context[0]) * mi_height);
+  memcpy(ctx->ta, xd->above_txfm_context,
+         sizeof(*xd->above_txfm_context) * mi_width);
+  memcpy(ctx->tl, xd->left_txfm_context,
+         sizeof(*xd->left_txfm_context) * mi_height);
+  ctx->p_ta = xd->above_txfm_context;
+  ctx->p_tl = xd->left_txfm_context;
+}
+
+static void set_partial_sb_partition(const AV1_COMMON *const cm,
+                                     MB_MODE_INFO *mi, int bh_in, int bw_in,
+                                     int mi_rows_remaining,
+                                     int mi_cols_remaining, BLOCK_SIZE bsize,
+                                     MB_MODE_INFO **mib) {
+  int bh = bh_in;
+  int r, c;
+  for (r = 0; r < cm->seq_params.mib_size; r += bh) {
+    int bw = bw_in;
+    for (c = 0; c < cm->seq_params.mib_size; c += bw) {
+      const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
+      const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
+      mib[grid_index] = mi + mi_index;
+      mib[grid_index]->bsize = find_partition_size(
+          bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
+    }
+  }
+}
+
+// This function attempts to set all mode info entries in a given superblock
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                MB_MODE_INFO **mib, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
+  MB_MODE_INFO *const mi_upper_left =
+      mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col);
+  int bh = mi_size_high[bsize];
+  int bw = mi_size_wide[bsize];
+
+  assert(bsize >= mi_params->mi_alloc_bsize &&
+         "Attempted to use bsize < mi_params->mi_alloc_bsize");
+  assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
+
+  // Apply the requested partition size to the SB if it is all "in image"
+  if ((mi_cols_remaining >= cm->seq_params.mib_size) &&
+      (mi_rows_remaining >= cm->seq_params.mib_size)) {
+    for (int block_row = 0; block_row < cm->seq_params.mib_size;
+         block_row += bh) {
+      for (int block_col = 0; block_col < cm->seq_params.mib_size;
+           block_col += bw) {
+        const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col);
+        const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col);
+        mib[grid_index] = mi_upper_left + mi_index;
+        mib[grid_index]->bsize = bsize;
+      }
+    }
+  } else {
+    // Else this is a partial SB.
+    set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+                             mi_cols_remaining, bsize, mib);
+  }
+}
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize) {
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  assert(bsize >= BLOCK_8X8);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  for (int i = 0; i < 4; i++) {
+    int x_idx = (i & 1) * hbs;
+    int y_idx = (i >> 1) * hbs;
+    if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
+        (mi_col + x_idx >= cm->mi_params.mi_cols))
+      return 0;
+    if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
+            PARTITION_NONE &&
+        subsize != BLOCK_8X8)
+      return 0;
+  }
+  return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                         int mi_col, int orig_rdmult) {
+  AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int tpl_idx = cpi->gf_group.index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  int tpl_stride = tpl_frame->stride;
+  int64_t intra_cost = 0;
+  int64_t mc_dep_cost = 0;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  if (tpl_frame->is_valid == 0) return orig_rdmult;
+
+  if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return orig_rdmult;
+
+  if (cpi->gf_group.index >= MAX_TPL_FRAME_IDX) return orig_rdmult;
+
+  int mi_count = 0;
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int step = 1 << block_mis_log2;
+  const int row_step = step;
+  const int col_step_sr =
+      coded_to_superres_mi(step, cm->superres_scale_denominator);
+  for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+      mi_count++;
+    }
+  }
+  assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+  aom_clear_system_state();
+
+  double beta = 1.0;
+  if (mc_dep_cost > 0 && intra_cost > 0) {
+    const double r0 = cpi->rd.r0;
+    const double rk = (double)intra_cost / mc_dep_cost;
+    beta = (r0 / rk);
+  }
+
+  int rdmult = av1_get_adaptive_rdmult(cpi, beta);
+
+  aom_clear_system_state();
+
+  rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
+  rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
+
+  rdmult = AOMMAX(1, rdmult);
+
+  return rdmult;
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_params.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
+
+    bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
+    bottom_edge = AOMMAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_params.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    left_edge += (int)(this_frame_stats->inactive_zone_cols * 4);
+
+    right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4);
+    right_edge = AOMMAX(left_edge, right_edge);
+  }
+
+  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+    is_active_v_edge = 1;
+  }
+  return is_active_v_edge;
+}
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                          int mi_col, SuperBlockEnc *sb_enc) {
+  sb_enc->tpl_data_count = 0;
+
+  if (!cpi->oxcf.algo_cfg.enable_tpl_model) return;
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) return;
+  const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+  if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
+    return;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int gf_group_index = cpi->gf_group.index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  int tpl_stride = tpl_frame->stride;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  if (tpl_frame->is_valid == 0) return;
+  if (gf_group_index >= MAX_TPL_FRAME_IDX) return;
+
+  int mi_count = 0;
+  int count = 0;
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  // mi_cols_sr is mi_cols at superres case.
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+  // TPL store unit size is not the same as the motion estimation unit size.
+  // Here always use motion estimation size to avoid getting repetitive inter/
+  // intra cost.
+  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+  assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
+  const int row_step = mi_size_high[tpl_bsize];
+  const int col_step_sr = coded_to_superres_mi(mi_size_wide[tpl_bsize],
+                                               cm->superres_scale_denominator);
+
+  // Stride is only based on SB size, and we fill in values for every 16x16
+  // block in a SB.
+  sb_enc->tpl_stride = (mi_col_end_sr - mi_col_sr) / col_step_sr;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+      assert(count < MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+      // Handle partial SB, so that no invalid values are used later.
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) {
+        sb_enc->tpl_inter_cost[count] = INT64_MAX;
+        sb_enc->tpl_intra_cost[count] = INT64_MAX;
+        for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+          sb_enc->tpl_mv[count][i].as_int = INVALID_MV;
+        }
+        count++;
+        continue;
+      }
+
+      TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+      sb_enc->tpl_inter_cost[count] = this_stats->inter_cost;
+      sb_enc->tpl_intra_cost[count] = this_stats->intra_cost;
+      memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv));
+      mi_count++;
+      count++;
+    }
+  }
+
+  assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+  sb_enc->tpl_data_count = mi_count;
+}
+
+// analysis_type 0: Use mc_dep_cost and intra_cost
+// analysis_type 1: Use count of best inter predictor chosen
+// analysis_type 2: Use cost reduction from intra to inter for best inter
+//                  predictor chosen
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                   int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  const int tpl_idx = cpi->gf_group.index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  int tpl_stride = tpl_frame->stride;
+  int64_t intra_cost = 0;
+  int64_t mc_dep_cost = 0;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+  const int base_qindex = cm->quant_params.base_qindex;
+
+  if (tpl_frame->is_valid == 0) return base_qindex;
+
+  if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return base_qindex;
+
+  if (cpi->gf_group.index >= MAX_TPL_FRAME_IDX) return base_qindex;
+
+  int mi_count = 0;
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int step = 1 << block_mis_log2;
+  const int row_step = step;
+  const int col_step_sr =
+      coded_to_superres_mi(step, cm->superres_scale_denominator);
+  for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+      mi_count++;
+    }
+  }
+  assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+  aom_clear_system_state();
+
+  int offset = 0;
+  double beta = 1.0;
+  if (mc_dep_cost > 0 && intra_cost > 0) {
+    const double r0 = cpi->rd.r0;
+    const double rk = (double)intra_cost / mc_dep_cost;
+    beta = (r0 / rk);
+    assert(beta > 0.0);
+  }
+  offset = av1_get_deltaq_offset(cpi, base_qindex, beta);
+  aom_clear_system_state();
+
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+
+  return qindex;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                            BLOCK_SIZE bsize) {
+  sms_tree->partitioning = PARTITION_NONE;
+
+  if (bsize >= BLOCK_8X8) {
+    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int idx = 0; idx < 4; ++idx)
+      av1_reset_simple_motion_tree_partition(sms_tree->split[idx], subsize);
+  }
+}
+
+// Record the ref frames that have been selected by square partition blocks.
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+                                       BLOCK_SIZE bsize, int mib_size,
+                                       int mi_row, int mi_col) {
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  const int sb_size_mask = mib_size - 1;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_size = mi_size_wide[bsize];
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+      x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
+    }
+  }
+}
+
+static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr,
+                           int num_cdfs, int cdf_stride, int nsymbs,
+                           int wt_left, int wt_tr) {
+  for (int i = 0; i < num_cdfs; i++) {
+    for (int j = 0; j <= nsymbs; j++) {
+      cdf_ptr_left[i * cdf_stride + j] =
+          (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left +
+                          (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr +
+                          ((wt_left + wt_tr) / 2)) /
+                         (wt_left + wt_tr));
+      assert(cdf_ptr_left[i * cdf_stride + j] >= 0 &&
+             cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP);
+    }
+  }
+}
+
+#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
+  AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
+
+#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride)           \
+  do {                                                                     \
+    aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left;               \
+    aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr;                   \
+    int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob);       \
+    int num_cdfs = array_size / cdf_stride;                                \
+    avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \
+                   wt_left, wt_tr);                                        \
+  } while (0)
+
+static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left,
+                    int wt_tr) {
+  AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
+  for (int i = 0; i < 2; i++) {
+    AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
+                MV_CLASSES);
+    AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf,
+                nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf,
+                nmv_tr->comps[i].class0_hp_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf,
+                CLASS0_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2);
+  }
+}
+
+// In case of row-based multi-threading of encoder, since we always
+// keep a top - right sync, we can average the top - right SB's CDFs and
+// the left SB's CDFs and use the same for current SB's encoding to
+// improve the performance. This function facilitates the averaging
+// of CDF and used only when row-mt is enabled in encoder.
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+                         int wt_left, int wt_tr) {
+  AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
+  AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
+  AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
+  AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
+  AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
+  AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
+  AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
+  AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
+  AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
+  AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
+  AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
+              ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+  AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
+              MASKED_COMPOUND_TYPES);
+  AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
+  AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
+  AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
+  AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
+              INTERINTRA_MODES);
+  AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
+  AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
+  AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
+              PALETTE_SIZES);
+  AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
+              PALETTE_SIZES);
+  for (int j = 0; j < PALETTE_SIZES; j++) {
+    int nsymbs = j + PALETTE_MIN_SIZE;
+    AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
+                   ctx_tr->palette_y_color_index_cdf[j], nsymbs,
+                   CDF_SIZE(PALETTE_COLORS));
+    AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
+                   ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
+                   CDF_SIZE(PALETTE_COLORS));
+  }
+  AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
+  AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
+  AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
+  AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
+  AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
+  AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
+  AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
+  AVERAGE_CDF(ctx_left->skip_txfm_cdfs, ctx_tr->skip_txfm_cdfs, 2);
+  AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
+  avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
+  avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
+  AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
+  AVERAGE_CDF(ctx_left->seg.tree_cdf, ctx_tr->seg.tree_cdf, MAX_SEGMENTS);
+  AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
+  AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
+              ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+  AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
+  AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
+              FILTER_INTRA_MODES);
+  AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
+              RESTORE_SWITCHABLE_TYPES);
+  AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
+  AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
+  AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
+  AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],
+                 UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
+  AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
+  for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+    if (i < 4) {
+      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
+                     CDF_SIZE(10));
+    } else if (i < 16) {
+      AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
+    } else {
+      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
+                     CDF_SIZE(10));
+    }
+  }
+  AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
+              SWITCHABLE_FILTERS);
+  AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
+  AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
+              2 * MAX_ANGLE_DELTA + 1);
+  AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,
+                 CDF_SIZE(MAX_TX_DEPTH + 1));
+  AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
+  AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
+  for (int i = 0; i < FRAME_LF_COUNT; i++) {
+    AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
+                DELTA_LF_PROBS + 1);
+  }
+  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
+                 CDF_SIZE(TX_TYPES));
+  AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
+  AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
+              CFL_ALPHABET_SIZE);
+}
+
+// Grade the temporal variation of the source by comparing the current sb and
+// its collocated block in the last frame.
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int offset) {
+  unsigned int tmp_sse;
+  unsigned int tmp_variance;
+  const BLOCK_SIZE bsize = cpi->common.seq_params.sb_size;
+  uint8_t *src_y = cpi->source->y_buffer;
+  int src_ystride = cpi->source->y_stride;
+  uint8_t *last_src_y = cpi->last_source->y_buffer;
+  int last_src_ystride = cpi->last_source->y_stride;
+  uint64_t avg_source_sse_threshold = 100000;        // ~5*5*(64*64)
+  uint64_t avg_source_sse_threshold_high = 1000000;  // ~15*15*(64*64)
+  uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
+#if CONFIG_AV1_HIGHBITDEPTH
+  MACROBLOCKD *xd = &x->e_mbd;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
+#endif
+  src_y += offset;
+  last_src_y += offset;
+  tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+                                       last_src_ystride, &tmp_sse);
+  if (tmp_sse < avg_source_sse_threshold)
+    x->content_state_sb.source_sad = kLowSad;
+  else if (tmp_sse > avg_source_sse_threshold_high)
+    x->content_state_sb.source_sad = kHighSad;
+  // Detect large lighting change.
+  // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
+  if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh)
+    x->content_state_sb.lighting_change = 1;
+  if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1))
+    x->content_state_sb.low_sumdiff = 1;
+}
+
+// Memset the mbmis at the current superblock to 0
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+                    int mi_row, int mi_col) {
+  // size of sb in unit of mi (BLOCK_4X4)
+  const int sb_size_mi = mi_size_wide[sb_size];
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  // size of sb in unit of allocated mi size
+  const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d;
+  assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 &&
+         "mi is not allocated as a multiple of sb!");
+  assert(mi_params->mi_stride % sb_size_mi == 0 &&
+         "mi_grid_base is not allocated as a multiple of sb!");
+
+  const int mi_rows = mi_size_high[sb_size];
+  for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) {
+    assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) <
+           mi_params->mi_stride);
+    const int mi_grid_idx =
+        get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col);
+    const int alloc_mi_idx =
+        get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col);
+    memset(&mi_params->mi_grid_base[mi_grid_idx], 0,
+           sb_size_mi * sizeof(*mi_params->mi_grid_base));
+    memset(&mi_params->tx_type_map[mi_grid_idx], 0,
+           sb_size_mi * sizeof(*mi_params->tx_type_map));
+    if (cur_mi_row % mi_alloc_size_1d == 0) {
+      memset(&mi_params->mi_alloc[alloc_mi_idx], 0,
+             sb_size_alloc_mi * sizeof(*mi_params->mi_alloc));
+    }
+  }
+}
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+                         ThreadData *td, const TileDataEnc *tile_data,
+                         int mi_row, int mi_col) {
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const TileInfo *tile_info = &tile_data->tile_info;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
+
+  sb_fp_stats->rd_count = cpi->td.rd_counts;
+  sb_fp_stats->split_count = x->txfm_search_info.txb_split_count;
+
+  sb_fp_stats->fc = *td->counts;
+
+  memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
+         sizeof(sb_fp_stats->inter_mode_rd_models));
+
+  memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  sb_fp_stats->current_qindex =
+      cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+                          ThreadData *td, TileDataEnc *tile_data, int mi_row,
+                          int mi_col) {
+  MACROBLOCK *x = &td->mb;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
+  av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size,
+                      num_planes);
+
+  cpi->td.rd_counts = sb_fp_stats->rd_count;
+  x->txfm_search_info.txb_split_count = sb_fp_stats->split_count;
+
+  *td->counts = sb_fp_stats->fc;
+
+  memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
+         sizeof(sb_fp_stats->inter_mode_rd_models));
+  memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+      sb_fp_stats->current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+// Checks for skip status of mv cost update.
+static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+                               const int mi_row, const int mi_col) {
+  // mv_cost_upd_level=0: update happens at each sb,
+  //                      so return skip status as 0.
+  // mv_cost_upd_level=1: update happens once for each sb row,
+  //                      so return skip status as 1 for
+  //                      mi_col != tile_info->mi_col_start.
+  // mv_cost_upd_level=2: update happens once for a set of rows,
+  //                      so return skip status as 1 appropriately.
+  if (!cpi->sf.inter_sf.mv_cost_upd_level) return 0;
+  if (mi_col != tile_info->mi_col_start) return 1;
+  if (cpi->sf.inter_sf.mv_cost_upd_level == 2) {
+    AV1_COMMON *const cm = &cpi->common;
+    const int mib_size_log2 = cm->seq_params.mib_size_log2;
+    const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+    const int sb_size = cm->seq_params.mib_size * MI_SIZE;
+    const int tile_height =
+        (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+    // When mv_cost_upd_level = 2, the cost update happens once for 2, 4 sb
+    // rows for sb size 128, sb size 64 respectively. However, as the update
+    // will not be equally spaced in smaller resolutions making it equally
+    // spaced by calculating (mv_num_rows_cost_update) the number of rows
+    // after which the cost update should happen.
+    const int sb_size_update_freq_map[2] = { 2, 4 };
+    const int update_freq_sb_rows =
+        sb_size_update_freq_map[sb_size != MAX_SB_SIZE];
+    const int update_freq_num_rows = sb_size * update_freq_sb_rows;
+    // Round-up the division result to next integer.
+    const int num_updates_per_tile =
+        (tile_height + update_freq_num_rows - 1) / update_freq_num_rows;
+    const int num_rows_update_per_tile = num_updates_per_tile * sb_size;
+    // Round-up the division result to next integer.
+    const int num_sb_rows_per_update =
+        (tile_height + num_rows_update_per_tile - 1) / num_rows_update_per_tile;
+    if ((sb_row % num_sb_rows_per_update) != 0) return 1;
+  }
+  return 0;
+}
+
+// Update the rate costs of some symbols according to the frequency directed
+// by speed features
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+                           const TileInfo *const tile_info, const int mi_row,
+                           const int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  switch (cpi->oxcf.cost_upd_freq.coeff) {
+    case COST_UPD_OFF: break;
+    case COST_UPD_TILE:  // Tile level
+      if (mi_row != tile_info->mi_row_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SBROW:  // SB row level in tile
+      if (mi_col != tile_info->mi_col_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SB:  // SB level
+      if (cpi->sf.inter_sf.disable_sb_level_coeff_cost_upd &&
+          mi_col != tile_info->mi_col_start)
+        break;
+      av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes);
+      break;
+    default: assert(0);
+  }
+
+  switch (cpi->oxcf.cost_upd_freq.mode) {
+    case COST_UPD_OFF: break;
+    case COST_UPD_TILE:  // Tile level
+      if (mi_row != tile_info->mi_row_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SBROW:  // SB row level in tile
+      if (mi_col != tile_info->mi_col_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SB:  // SB level
+      av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx);
+      break;
+    default: assert(0);
+  }
+  switch (cpi->oxcf.cost_upd_freq.mv) {
+    case COST_UPD_OFF: break;
+    case COST_UPD_TILE:  // Tile level
+      if (mi_row != tile_info->mi_row_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SBROW:  // SB row level in tile
+      if (mi_col != tile_info->mi_col_start) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case COST_UPD_SB:  // SB level
+      // Checks for skip status of mv cost update.
+      if (skip_mv_cost_update(cpi, tile_info, mi_row, mi_col)) break;
+      av1_fill_mv_costs(xd->tile_ctx, cm->features.cur_frame_force_integer_mv,
+                        cm->features.allow_high_precision_mv, &x->mv_costs);
+      break;
+    default: assert(0);
+  }
+}
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
new file mode 100644
index 0000000..a67c734
--- /dev/null
+++ b/av1/encoder/encodeframe_utils.h
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/rdopt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
+
+enum {
+  SB_SINGLE_PASS,  // Single pass encoding: all ctxs get updated normally
+  SB_DRY_PASS,     // First pass of multi-pass: does not update the ctxs
+  SB_WET_PASS      // Second pass of multi-pass: finalize and update the ctx
+} UENUM1BYTE(SB_MULTI_PASS_MODE);
+
+typedef struct {
+  ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+  ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
+  PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+  PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+  TXFM_CONTEXT *p_ta;
+  TXFM_CONTEXT *p_tl;
+  TXFM_CONTEXT ta[MAX_MIB_SIZE];
+  TXFM_CONTEXT tl[MAX_MIB_SIZE];
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+// This struct is used to store the statistics used by sb-level multi-pass
+// encoding. Currently, this is only used to make a copy of the state before we
+// perform the first pass
+typedef struct SB_FIRST_PASS_STATS {
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_COUNTS rd_count;
+
+  int split_count;
+  FRAME_COUNTS fc;
+  InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+  int current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+#endif  // CONFIG_INTERNAL_STATS
+} SB_FIRST_PASS_STATS;
+
+// This structure contains block size related
+// variables for use in rd_pick_partition().
+typedef struct {
+  // Half of block width to determine block edge.
+  int mi_step;
+
+  // Block row and column indices.
+  int mi_row;
+  int mi_col;
+
+  // Block edge row and column indices.
+  int mi_row_edge;
+  int mi_col_edge;
+
+  // Block width of current partition block.
+  int width;
+
+  // Block width of minimum partition size allowed.
+  int min_partition_size_1d;
+
+  // Flag to indicate if partition is 8x8 or higher size.
+  int bsize_at_least_8x8;
+
+  // Indicates edge blocks in frame.
+  int has_rows;
+  int has_cols;
+
+  // Block size of current partition.
+  BLOCK_SIZE bsize;
+
+  // Size of current sub-partition.
+  BLOCK_SIZE subsize;
+
+  // Size of split partition.
+  BLOCK_SIZE split_bsize2;
+} PartitionBlkParams;
+
+// Structure holding state variables for partition search.
+typedef struct {
+  // Intra partitioning related info.
+  PartitionSearchInfo *intra_part_info;
+
+  // Parameters related to partition block size.
+  PartitionBlkParams part_blk_params;
+
+  // Win flags for HORZ and VERT partition evaluations.
+  RD_RECT_PART_WIN_INFO split_part_rect_win[SUB_PARTITIONS_SPLIT];
+
+  // RD cost for the current block of given partition type.
+  RD_STATS this_rdc;
+
+  // RD cost summed across all blocks of partition type.
+  RD_STATS sum_rdc;
+
+  // Array holding partition type cost.
+  int tmp_partition_cost[PARTITION_TYPES];
+
+  // Pointer to partition cost buffer
+  int *partition_cost;
+
+  // RD costs for different partition types.
+  int64_t none_rd;
+  int64_t split_rd[SUB_PARTITIONS_SPLIT];
+  // RD costs for rectangular partitions.
+  // rect_part_rd[0][i] is the RD cost of ith partition index of PARTITION_HORZ.
+  // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT.
+  int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT];
+
+  // Flags indicating if the corresponding partition was winner or not.
+  // Used to bypass similar blocks during AB partition evaluation.
+  int is_split_ctx_is_ready[2];
+  int is_rect_ctx_is_ready[NUM_RECT_PARTS];
+
+  // Flags to prune/skip particular partition size evaluation.
+  int terminate_partition_search;
+  int partition_none_allowed;
+  int partition_rect_allowed[NUM_RECT_PARTS];
+  int do_rectangular_split;
+  int do_square_split;
+  int prune_rect_part[NUM_RECT_PARTS];
+
+  // Chroma subsampling in x and y directions.
+  int ss_x;
+  int ss_y;
+
+  // Partition plane context index.
+  int pl_ctx_idx;
+
+  // This flag will be set if best partition is found from the search.
+  bool found_best_partition;
+} PartitionSearchState;
+
+static AOM_INLINE void update_global_motion_used(PREDICTION_MODE mode,
+                                                 BLOCK_SIZE bsize,
+                                                 const MB_MODE_INFO *mbmi,
+                                                 RD_COUNTS *rdc) {
+  if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) {
+    const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize];
+    int ref;
+    for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+      rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s;
+    }
+  }
+}
+
+static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
+                                              const MB_MODE_INFO *mbmi,
+                                              int dual_filter) {
+  for (int dir = 0; dir < 2; ++dir) {
+    if (dir && !dual_filter) break;
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+    update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
+               SWITCHABLE_FILTERS);
+  }
+}
+
+static AOM_INLINE int set_segment_rdmult(const AV1_COMP *const cpi,
+                                         MACROBLOCK *const x,
+                                         int8_t segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  av1_init_plane_quantizers(cpi, x, segment_id);
+  aom_clear_system_state();
+  const int segment_qindex =
+      av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+  return av1_compute_rd_mult(cpi,
+                             segment_qindex + cm->quant_params.y_dc_delta_q);
+}
+
+static AOM_INLINE int do_slipt_check(BLOCK_SIZE bsize) {
+  return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p,
+                                                              int frm) {
+  assert(frm >= 0);
+  if (frm < 0 ||
+      p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
+    return NULL;
+  }
+
+  return &p->stats_buf_ctx->stats_in_start[frm];
+}
+
+static BLOCK_SIZE dim_to_size(int dim) {
+  switch (dim) {
+    case 4: return BLOCK_4X4;
+    case 8: return BLOCK_8X8;
+    case 16: return BLOCK_16X16;
+    case 32: return BLOCK_32X32;
+    case 64: return BLOCK_64X64;
+    case 128: return BLOCK_128X128;
+    default: assert(0); return 0;
+  }
+}
+
+static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc,
+                                                  AV1_COMP *cpi, MACROBLOCK *x,
+                                                  const SPEED_FEATURES *sf,
+                                                  BLOCK_SIZE sb_size,
+                                                  int mi_row, int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+
+  sb_enc->max_partition_size =
+      AOMMIN(sf->part_sf.default_max_partition_size,
+             dim_to_size(cpi->oxcf.part_cfg.max_partition_size));
+  sb_enc->min_partition_size =
+      AOMMAX(sf->part_sf.default_min_partition_size,
+             dim_to_size(cpi->oxcf.part_cfg.min_partition_size));
+  sb_enc->max_partition_size =
+      AOMMIN(sb_enc->max_partition_size, cm->seq_params.sb_size);
+  sb_enc->min_partition_size =
+      AOMMIN(sb_enc->min_partition_size, cm->seq_params.sb_size);
+
+  if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+    float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+    av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+    sb_enc->max_partition_size =
+        AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features),
+                      sb_enc->max_partition_size),
+               sb_enc->min_partition_size);
+  }
+}
+
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                         int mi_col, int orig_rdmult);
+
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step);
+
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                          int mi_col, SuperBlockEnc *sb_enc);
+
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                   int mi_row, int mi_col);
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, MvCosts *const mv_costs,
+                         const BLOCK_SIZE bsize, const int mi_row,
+                         const int mi_col, int *const rdmult);
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, const int mi_row,
+                            const int mi_col, int orig_rdmult);
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+                      const PICK_MODE_CONTEXT *const ctx, int mi_row,
+                      int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+                                 PREDICTION_MODE mode, int16_t mode_context);
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+                         MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                         const MB_MODE_INFO *above_mi,
+                         const MB_MODE_INFO *left_mi, const int intraonly);
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         const int num_planes);
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                      int mi_row, int mi_col, BLOCK_SIZE bsize,
+                      const int num_planes);
+
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                MB_MODE_INFO **mib, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize);
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize);
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                            BLOCK_SIZE bsize);
+
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+                                       BLOCK_SIZE bsize, int mib_size,
+                                       int mi_row, int mi_col);
+
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+                         int wt_left, int wt_tr);
+
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int offset);
+
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+                    int mi_row, int mi_col);
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+                         ThreadData *td, const TileDataEnc *tile_data,
+                         int mi_row, int mi_col);
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+                          ThreadData *td, TileDataEnc *tile_data, int mi_row,
+                          int mi_col);
+
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+                           const TileInfo *const tile_info, const int mi_row,
+                           const int mi_col);
+
+// This function will compute the number of reference frames to be disabled
+// based on selective_ref_frame speed feature.
+static AOM_INLINE unsigned int get_num_refs_to_disable(
+    const AV1_COMP *cpi, const int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  unsigned int num_refs_to_disable = 0;
+  if (cpi->sf.inter_sf.selective_ref_frame >= 3) {
+    num_refs_to_disable++;
+    if (cpi->sf.inter_sf.selective_ref_frame >= 5 &&
+        *ref_frame_flags & av1_ref_frame_flag_list[LAST2_FRAME]) {
+      const int last2_frame_dist = av1_encoder_get_relative_dist(
+          ref_display_order_hint[LAST2_FRAME - LAST_FRAME],
+          cur_frame_display_index);
+      // Disable LAST2_FRAME if it is a temporally distant frame
+      if (abs(last2_frame_dist) > 2) {
+        num_refs_to_disable++;
+      }
+#if !CONFIG_REALTIME_ONLY
+      else if (is_stat_consumption_stage_twopass(cpi)) {
+        const FIRSTPASS_STATS *const this_frame_stats =
+            read_one_frame_stats(&cpi->twopass, cur_frame_display_index);
+        aom_clear_system_state();
+        const double coded_error_per_mb =
+            this_frame_stats->coded_error / cpi->frame_info.num_mbs;
+        // Disable LAST2_FRAME if the coded error of the current frame based on
+        // first pass stats is very low.
+        if (coded_error_per_mb < 100.0) num_refs_to_disable++;
+      }
+#endif  // CONFIG_REALTIME_ONLY
+    }
+  }
+  return num_refs_to_disable;
+}
+
+static INLINE int get_max_allowed_ref_frames(
+    const AV1_COMP *cpi, const int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  const unsigned int max_reference_frames =
+      cpi->oxcf.ref_frm_cfg.max_reference_frames;
+  const unsigned int num_refs_to_disable = get_num_refs_to_disable(
+      cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+  const unsigned int max_allowed_refs_for_given_speed =
+      INTER_REFS_PER_FRAME - num_refs_to_disable;
+  return AOMMIN(max_allowed_refs_for_given_speed, max_reference_frames);
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
+static AOM_INLINE void enforce_max_ref_frames(
+    AV1_COMP *cpi, int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  MV_REFERENCE_FRAME ref_frame;
+  int total_valid_refs = 0;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      total_valid_refs++;
+    }
+  }
+
+  const int max_allowed_refs = get_max_allowed_ref_frames(
+      cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+
+  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+
+    if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) {
+      continue;
+    }
+
+    switch (ref_frame_to_disable) {
+      case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+      case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+      case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+      case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break;
+      default: assert(0);
+    }
+    --total_valid_refs;
+  }
+  assert(total_valid_refs <= max_allowed_refs);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index ec33362..14ee354 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -84,8 +84,7 @@
 
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                    int block, TX_SIZE tx_size, TX_TYPE tx_type,
-                   const TXB_CTX *const txb_ctx, int fast_mode,
-                   int *rate_cost) {
+                   const TXB_CTX *const txb_ctx, int *rate_cost) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   const int eob = p->eobs[block];
@@ -93,12 +92,12 @@
 
   if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
       xd->lossless[segment_id]) {
-    *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
+    *rate_cost = av1_cost_skip_txb(&x->coeff_costs, txb_ctx, plane, tx_size);
     return eob;
   }
 
   return av1_optimize_txb_new(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
-                              rate_cost, cpi->oxcf.sharpness, fast_mode);
+                              rate_cost, cpi->oxcf.algo_cfg.sharpness);
 }
 
 // Hyper-parameters for dropout optimization, based on following logics.
@@ -133,11 +132,9 @@
 
 void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                         TX_TYPE tx_type, int qindex) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
   const struct macroblock_plane *const p = &mb->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
-  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
   const int tx_width = tx_size_wide[tx_size];
   const int tx_height = tx_size_high[tx_size];
   const int max_eob = av1_get_max_eob(tx_size);
@@ -226,7 +223,7 @@
   if (eob != p->eobs[block]) {
     p->eobs[block] = eob;
     p->txb_entropy_ctx[block] =
-        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+        av1_get_txb_entropy_context(qcoeff, scan_order, eob);
   }
 }
 
@@ -262,29 +259,53 @@
 };
 #endif
 
+// Computes the transform for DC only blocks
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean) {
+  assert(per_px_mean != INT64_MAX);
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+  memset(coeff, 0, sizeof(*coeff) * n_coeffs);
+  coeff[0] =
+      (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12);
+}
+
 void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
                      int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
                      QUANT_PARAM *qparam) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+  av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param);
+  av1_quant(x, plane, block, txfm_param, qparam);
+}
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) {
   const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const SCAN_ORDER *const scan_order =
-      get_scan(txfm_param->tx_size, txfm_param->tx_type);
   const int block_offset = BLOCK_OFFSET(block);
   tran_low_t *const coeff = p->coeff + block_offset;
-  tran_low_t *const qcoeff = p->qcoeff + block_offset;
-  tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
-  uint16_t *const eob = &p->eobs[block];
   const int diff_stride = block_size_wide[plane_bsize];
 
   const int src_offset = (blk_row * diff_stride + blk_col);
   const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
 
   av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+               QUANT_PARAM *qparam) {
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const SCAN_ORDER *const scan_order =
+      get_scan(txfm_param->tx_size, txfm_param->tx_type);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  tran_low_t *const qcoeff = p->qcoeff + block_offset;
+  tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+  uint16_t *const eob = &p->eobs[block];
 
   if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
     const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
-    if (LIKELY(!x->skip_block)) {
+    if (LIKELY(!x->seg_skip_block)) {
 #if CONFIG_AV1_HIGHBITDEPTH
       quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd](
           coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
@@ -302,9 +323,8 @@
     p->txb_entropy_ctx[block] = 0;
   } else {
     p->txb_entropy_ctx[block] =
-        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+        av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
   }
-  return;
 }
 
 void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
@@ -358,7 +378,7 @@
   MB_MODE_INFO *mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
   int dummy_rate_cost = 0;
@@ -370,7 +390,9 @@
   l = &args->tl[blk_row];
 
   TX_TYPE tx_type = DCT_DCT;
-  if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
+  if (!is_blk_skip(x->txfm_search_info.blk_skip, plane,
+                   blk_row * bw + blk_col) &&
+      !mbmi->skip_mode) {
     tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size,
                               cm->features.reduced_tx_set_used);
     TxfmParam txfm_param;
@@ -383,8 +405,8 @@
       quant_idx =
           USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
     av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
-    av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt,
-                    &quant_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx,
+                    cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
     av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
                       &quant_param);
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
@@ -400,7 +422,7 @@
       TXB_CTX txb_ctx;
       get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
       av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
-                     args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost);
+                     &dummy_rate_cost);
     }
     if (!quant_param.use_optimize_b && do_dropout) {
       av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
@@ -427,8 +449,8 @@
   // again.
   if (p->eobs[block] == 0 && plane == 0) {
 #if 0
-    if (args->cpi->oxcf.aq_mode == NO_AQ &&
-        args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+    if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+        args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
       // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
       // enable_optimize_b is true to detect potential RD bug.
       const uint8_t disable_txk_check = args->enable_optimize_b;
@@ -470,7 +492,7 @@
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
@@ -565,7 +587,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
 
   uint8_t *dst;
   dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
@@ -574,7 +596,7 @@
   QUANT_PARAM quant_param;
 
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
-  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
                   &quant_param);
   av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT,
                     &quant_param);
@@ -604,12 +626,12 @@
   assert(bsize < BLOCK_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  mbmi->skip = 1;
-  if (x->force_skip) return;
+  mbmi->skip_txfm = 1;
+  if (x->txfm_search_info.skip_txfm) return;
 
   struct optimize_ctx ctx;
   struct encode_b_args arg = {
-    cpi,  x,    &ctx,    &mbmi->skip,
+    cpi,  x,    &ctx,    &mbmi->skip_txfm,
     NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
   };
   const AV1_COMMON *const cm = &cpi->common;
@@ -683,7 +705,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
   PLANE_TYPE plane_type = get_plane_type(plane);
   uint16_t *eob = &p->eobs[block];
   const int dst_stride = pd->dst.stride;
@@ -694,7 +716,8 @@
 
   TX_TYPE tx_type = DCT_DCT;
   const int bw = mi_size_wide[plane_bsize];
-  if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) {
+  if (plane == 0 && is_blk_skip(x->txfm_search_info.blk_skip, plane,
+                                blk_row * bw + blk_col)) {
     *eob = 0;
     p->txb_entropy_ctx[block] = 0;
   } else {
@@ -716,8 +739,8 @@
           USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
 
     av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
-    av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt,
-                    &quant_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx,
+                    cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
     av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
                       &quant_param);
 
@@ -743,7 +766,7 @@
       TXB_CTX txb_ctx;
       get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
       av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
-                     args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost);
+                     &dummy_rate_cost);
     }
     if (do_dropout) {
       av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
@@ -764,8 +787,8 @@
   // again.
   if (*eob == 0 && plane == 0) {
 #if 0
-    if (args->cpi->oxcf.aq_mode == NO_AQ
-        && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+    if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ
+        && args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
       assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] ==
           DCT_DCT);
     }
@@ -794,7 +817,7 @@
   const int ss_y = pd->subsampling_y;
   ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
   ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
-  struct encode_b_args arg = { cpi, x,  NULL,    &(xd->mi[0]->skip),
+  struct encode_b_args arg = { cpi, x,  NULL,    &(xd->mi[0]->skip_txfm),
                                ta,  tl, dry_run, enable_optimize_b };
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
   if (enable_optimize_b) {
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index a337c83..fcd34a3 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -16,12 +16,37 @@
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/txb_common.h"
+#include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/block.h"
 #include "av1/encoder/tokenize.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+enum {
+  AV1_XFORM_QUANT_FP = 0,
+  AV1_XFORM_QUANT_B = 1,
+  AV1_XFORM_QUANT_DC = 2,
+  AV1_XFORM_QUANT_SKIP_QUANT,
+  AV1_XFORM_QUANT_TYPES,
+} UENUM1BYTE(AV1_XFORM_QUANT);
+
+// TODO(any): Merge OPT_TYPe and TRELLLIS_OPT_TYPE
+// Available optimization types to optimize the quantized coefficients.
+enum {
+  NONE_OPT = 0,            // No optimization.
+  TRELLIS_OPT = 1,         // Trellis optimization. See `av1_optimize_b()`.
+  DROPOUT_OPT = 2,         // Dropout optimization. See `av1_dropout_qcoeff()`.
+  TRELLIS_DROPOUT_OPT = 3  // Perform dropout after trellis optimization.
+} UENUM1BYTE(OPT_TYPE);
+
+enum {
+  NO_TRELLIS_OPT,          // No trellis optimization
+  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
+  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode pass
+  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
+
 struct optimize_ctx {
   ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
   ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
@@ -38,22 +63,6 @@
   TRELLIS_OPT_TYPE enable_optimize_b;
 };
 
-enum {
-  AV1_XFORM_QUANT_FP = 0,
-  AV1_XFORM_QUANT_B = 1,
-  AV1_XFORM_QUANT_DC = 2,
-  AV1_XFORM_QUANT_SKIP_QUANT,
-  AV1_XFORM_QUANT_TYPES,
-} UENUM1BYTE(AV1_XFORM_QUANT);
-
-// Available optimization types to optimize the quantized coefficients.
-enum {
-  NONE_OPT = 0,            // No optimization.
-  TRELLIS_OPT = 1,         // Trellis optimization. See `av1_optimize_b()`.
-  DROPOUT_OPT = 2,         // Dropout optimization. See `av1_dropout_qcoeff()`.
-  TRELLIS_DROPOUT_OPT = 3  // Perform dropout after trellis optimization.
-} UENUM1BYTE(OPT_TYPE);
-
 void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                    RUN_TYPE dry_run);
 
@@ -72,13 +81,22 @@
                        const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
                        TX_TYPE tx_type, QUANT_PARAM *qparam);
 
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean);
+
 void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
                      int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
                      QUANT_PARAM *qparam);
 
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param);
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+               QUANT_PARAM *qparam);
+
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
                    int block, TX_SIZE tx_size, TX_TYPE tx_type,
-                   const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
+                   const TXB_CTX *const txb_ctx, int *rate_cost);
 
 // This function can be used as (i) a further optimization to reduce the
 // redundancy of quantized coefficients (a.k.a., `qcoeff`) after trellis
@@ -138,6 +156,19 @@
     return false;
   return true;
 }
+
+// Scaling terms (precision of 12 bits) to perform tx-size specific
+// normalization that is used in DCT_DCT forward transform.
+// For transform blocks of 1:2 and 2:1       - sqrt(2) normalization is used
+// For transform blocks of 1:4 and 4:1       - factor of 2 is used
+// For transform blocks TX_8x8 and below     - an additional factor of 2 is used
+// For transform blocks max(width,height)=64 - currently not supported
+
+static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = {
+  1024, 2048, 4096, 4096, 0,    1448, 1448, 2896, 2896, 2896,
+  2896, 0,    0,    2048, 2048, 4096, 4096, 0,    0
+};
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 167e9c0..86c6156 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -253,7 +253,7 @@
     ref_mv_idx += 1;
   }
   return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
-                                   x->mbmi_ext);
+                                   &x->mbmi_ext);
 }
 
 void av1_find_best_ref_mvs_from_stack(int allow_hp,
diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h
index 0d13014..9f0d607 100644
--- a/av1/encoder/encodemv.h
+++ b/av1/encoder/encodemv.h
@@ -69,6 +69,37 @@
   return c;
 }
 
+static INLINE int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm,
+                                                MACROBLOCK *const x) {
+  (void)cm;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  if (this_mode == NEW_NEWMV) {
+    const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+    const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+    if (mbmi->mv[0].as_int == ref_mv_0.as_int ||
+        mbmi->mv[1].as_int == ref_mv_1.as_int) {
+      return 0;
+    }
+  } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+    const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+    if (mbmi->mv[1].as_int == ref_mv_1.as_int) {
+      return 0;
+    }
+  } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+    const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+    if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+      return 0;
+    }
+  } else if (this_mode == NEWMV) {
+    const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+    if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+      return 0;
+    }
+  }
+  return 1;
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 6f8774b9..cc681bc 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -16,11 +16,7 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/aom_scale_rtcd.h"
-#include "config/av1_rtcd.h"
 
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
 #if CONFIG_DENOISE
 #include "aom_dsp/grain_table.h"
 #include "aom_dsp/noise_util.h"
@@ -39,7 +35,6 @@
 #endif  // CONFIG_BITSTREAM_DEBUG
 
 #include "av1/common/alloccommon.h"
-#include "av1/common/cdef.h"
 #include "av1/common/filter.h"
 #include "av1/common/idct.h"
 #include "av1/common/reconinter.h"
@@ -47,7 +42,6 @@
 #include "av1/common/resize.h"
 #include "av1/common/tile_common.h"
 
-#include "av1/encoder/av1_multi_thread.h"
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
@@ -57,319 +51,42 @@
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encoder_utils.h"
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/firstpass.h"
-#include "av1/encoder/grain_test_vectors.h"
 #include "av1/encoder/hash_motion.h"
+#include "av1/encoder/intra_mode_search.h"
 #include "av1/encoder/mv_prec.h"
 #include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/pickcdef.h"
 #include "av1/encoder/picklpf.h"
 #include "av1/encoder/pickrst.h"
 #include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/speed_features.h"
+#include "av1/encoder/superres_scale.h"
 #include "av1/encoder/tpl_model.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/var_based_part.h"
 
-#if CONFIG_TUNE_VMAF
-#include "av1/encoder/tune_vmaf.h"
-#endif
-
 #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
 
 #if CONFIG_ENTROPY_STATS
 FRAME_COUNTS aggregate_fc;
 #endif  // CONFIG_ENTROPY_STATS
 
-#define AM_SEGMENT_ID_INACTIVE 7
-#define AM_SEGMENT_ID_ACTIVE 0
-
 // #define OUTPUT_YUV_REC
-#ifdef OUTPUT_YUV_SKINMAP
-FILE *yuv_skinmap_file = NULL;
-#endif
 #ifdef OUTPUT_YUV_REC
 FILE *yuv_rec_file;
 #define FILE_NAME_LEN 100
 #endif
 
-const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = {
-  { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 },
-    { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 },
-    { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 },
-    { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 },
-    { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 },
-    { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 },
-    { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-  { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 },
-    { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 },
-    { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 },
-    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 },
-    { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 },
-    { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 },
-    { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 },
-    { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 },
-    { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 },
-    { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 },
-    { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 },
-    { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
-  { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 },
-    { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 },
-    { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 },
-    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 },
-    { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 },
-    { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 },
-    { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 },
-    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
-    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 },
-    { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 },
-    { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 },
-    { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-  { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 },
-    { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 },
-    { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 },
-    { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 },
-    { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 },
-    { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 },
-    { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 },
-    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
-    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 },
-    { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 },
-    { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 },
-    { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
-  { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 },
-    { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 },
-    { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 },
-    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 },
-    { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 },
-    { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 },
-    { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 },
-    { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 },
-    { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 },
-    { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 },
-    { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 },
-    { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
-};
-
-const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0,  0,  0,  106, 90, 90, 97, 67, 59, 70, 28,
-    30, 38, 16, 16,  16, 0,  0,  44, 50, 26, 25 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0,  0,  0,  98, 93, 97, 68, 82, 85, 33, 30,
-    33, 16, 16, 16, 16, 0,  0,  43, 37, 26, 16 },
-  { 0,  0,  0,  91, 80, 76, 78, 55, 49, 24, 16,
-    16, 16, 16, 16, 16, 0,  0,  29, 45, 16, 38 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0,  0,  0,  103, 89, 89, 89, 62, 63, 76, 34,
-    35, 32, 19, 16,  16, 0,  0,  49, 55, 29, 19 }
-};
-
-const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64,
-                                                       64, 64, 64 };
-
-// TODO(yunqing): the default probs can be trained later from better
-// performance.
-const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
-                                         [SWITCHABLE_FILTER_CONTEXTS]
-                                         [SWITCHABLE_FILTERS] = {
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } }
-                                         };
-
 static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -384,6 +101,18 @@
       *hr = 3;
       *hs = 5;
       break;
+    case THREEFOUR:
+      *hr = 3;
+      *hs = 4;
+      break;
+    case ONEFOUR:
+      *hr = 1;
+      *hs = 4;
+      break;
+    case ONEEIGHT:
+      *hr = 1;
+      *hs = 8;
+      break;
     case ONETWO:
       *hr = 1;
       *hs = 2;
@@ -396,67 +125,6 @@
   }
 }
 
-// Mark all inactive blocks as active. Other segmentation features may be set
-// so memset cannot be used, instead only inactive blocks should be reset.
-static void suppress_active_map(AV1_COMP *cpi) {
-  unsigned char *const seg_map = cpi->enc_seg.map;
-  int i;
-  if (cpi->active_map.enabled || cpi->active_map.update)
-    for (i = 0;
-         i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; ++i)
-      if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
-        seg_map[i] = AM_SEGMENT_ID_ACTIVE;
-}
-
-static void apply_active_map(AV1_COMP *cpi) {
-  struct segmentation *const seg = &cpi->common.seg;
-  unsigned char *const seg_map = cpi->enc_seg.map;
-  const unsigned char *const active_map = cpi->active_map.map;
-  int i;
-
-  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
-
-  if (frame_is_intra_only(&cpi->common)) {
-    cpi->active_map.enabled = 0;
-    cpi->active_map.update = 1;
-  }
-
-  if (cpi->active_map.update) {
-    if (cpi->active_map.enabled) {
-      for (i = 0;
-           i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
-           ++i)
-        if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
-      av1_enable_segmentation(seg);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
-
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H,
-                      -MAX_LOOP_FILTER);
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V,
-                      -MAX_LOOP_FILTER);
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U,
-                      -MAX_LOOP_FILTER);
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
-                      -MAX_LOOP_FILTER);
-    } else {
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
-      if (seg->enabled) {
-        seg->update_data = 1;
-        seg->update_map = 1;
-      }
-    }
-    cpi->active_map.update = 0;
-  }
-}
-
 int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
                        int cols) {
   const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
@@ -516,195 +184,6 @@
   }
 }
 
-// Compute the horizontal frequency components' energy in a frame
-// by calculuating the 16x4 Horizontal DCT. This is to be used to
-// decide the superresolution parameters.
-static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
-  uint64_t freq_energy[16] = { 0 };
-  const YV12_BUFFER_CONFIG *buf = cpi->source;
-  const int bd = cpi->td.mb.e_mbd.bd;
-  const int width = buf->y_crop_width;
-  const int height = buf->y_crop_height;
-  DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
-  int n = 0;
-  memset(freq_energy, 0, sizeof(freq_energy));
-  if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
-    for (int i = 0; i < height - 4; i += 4) {
-      for (int j = 0; j < width - 16; j += 16) {
-        av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
-                            H_DCT, bd);
-        for (int k = 1; k < 16; ++k) {
-          const uint64_t this_energy =
-              ((int64_t)coeff[k] * coeff[k]) +
-              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
-              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
-              ((int64_t)coeff[k + 48] * coeff[k + 48]);
-          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
-        }
-        n++;
-      }
-    }
-  } else {
-    assert(bd == 8);
-    DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
-    for (int i = 0; i < height - 4; i += 4) {
-      for (int j = 0; j < width - 16; j += 16) {
-        for (int ii = 0; ii < 4; ++ii)
-          for (int jj = 0; jj < 16; ++jj)
-            src16[ii * 16 + jj] =
-                buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
-        av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
-        for (int k = 1; k < 16; ++k) {
-          const uint64_t this_energy =
-              ((int64_t)coeff[k] * coeff[k]) +
-              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
-              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
-              ((int64_t)coeff[k + 48] * coeff[k + 48]);
-          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
-        }
-        n++;
-      }
-    }
-  }
-  if (n) {
-    for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
-    // Convert to cumulative energy
-    for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
-  } else {
-    for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
-  }
-}
-
-static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-
-  if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
-    return BLOCK_64X64;
-  if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
-    return BLOCK_128X128;
-
-  assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
-
-  if (cpi->svc.number_spatial_layers > 1) {
-    // Use the configured size (top resolution) for spatial layers.
-    return AOMMIN(cpi->oxcf.width, cpi->oxcf.height) > 480 ? BLOCK_128X128
-                                                           : BLOCK_64X64;
-  }
-
-  // TODO(any): Possibly could improve this with a heuristic.
-  // When superres / resize is on, 'cm->width / height' can change between
-  // calls, so we don't apply this heuristic there.
-  // Things break if superblock size changes between the first pass and second
-  // pass encoding, which is why this heuristic is not configured as a
-  // speed-feature.
-  if (cpi->oxcf.superres_mode == SUPERRES_NONE &&
-      cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 1) {
-    return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64;
-  }
-
-  return BLOCK_128X128;
-}
-
-static void setup_frame(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  // Set up entropy context depending on frame type. The decoder mandates
-  // the use of the default context, index 0, for keyframes and inter
-  // frames where the error_resilient_mode or intra_only flag is set. For
-  // other inter-frames the encoder currently uses only two contexts;
-  // context 1 for ALTREF frames and context 0 for the others.
-
-  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-      cpi->ext_flags.use_primary_ref_none) {
-    av1_setup_past_independence(cm);
-  }
-
-  if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
-      frame_is_sframe(cm)) {
-    if (!cpi->seq_params_locked) {
-      set_sb_size(&cm->seq_params, select_sb_size(cpi));
-    }
-  } else {
-    const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
-    if (primary_ref_buf == NULL) {
-      av1_setup_past_independence(cm);
-      cm->seg.update_map = 1;
-      cm->seg.update_data = 1;
-    } else {
-      *cm->fc = primary_ref_buf->frame_context;
-    }
-  }
-
-  av1_zero(cm->cur_frame->interp_filter_selected);
-  cm->prev_frame = get_primary_ref_frame_buf(cm);
-  cpi->vaq_refresh = 0;
-}
-
-static void set_mb_mi(CommonModeInfoParams *mi_params, int width, int height) {
-  // Ensure that the decoded width and height are both multiples of
-  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
-  // subsampling is used).
-  // This simplifies the implementation of various experiments,
-  // eg. cdef, which operates on units of 8x8 luma pixels.
-  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
-  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
-
-  mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
-  mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
-  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
-
-  mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2;
-  mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
-  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
-
-  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
-  mi_params->mi_alloc_stride =
-      (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
-
-  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
-         mi_size_high[mi_params->mi_alloc_bsize]);
-
-#if CONFIG_LPF_MASK
-  av1_alloc_loop_filter_mask(mi_params);
-#endif
-}
-
-static void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width,
-                          int height) {
-  const int is_4k_or_larger = AOMMIN(width, height) >= 2160;
-  mi_params->mi_alloc_bsize = is_4k_or_larger ? BLOCK_8X8 : BLOCK_4X4;
-
-  set_mb_mi(mi_params, width, height);
-}
-
-static void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, int width,
-                                 int height) {
-  mi_params->mi_alloc_bsize = BLOCK_16X16;
-
-  set_mb_mi(mi_params, width, height);
-}
-
-static void enc_setup_mi(CommonModeInfoParams *mi_params) {
-  const int mi_grid_size =
-      mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
-  memset(mi_params->mi_alloc, 0,
-         mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc));
-  memset(mi_params->mi_grid_base, 0,
-         mi_grid_size * sizeof(*mi_params->mi_grid_base));
-  memset(mi_params->tx_type_map, 0,
-         mi_grid_size * sizeof(*mi_params->tx_type_map));
-}
-
-static void enc_free_mi(CommonModeInfoParams *mi_params) {
-  aom_free(mi_params->mi_alloc);
-  mi_params->mi_alloc = NULL;
-  aom_free(mi_params->mi_grid_base);
-  mi_params->mi_grid_base = NULL;
-  mi_params->mi_alloc_size = 0;
-  aom_free(mi_params->tx_type_map);
-  mi_params->tx_type_map = NULL;
-}
-
 void av1_initialize_enc(void) {
   av1_rtcd();
   aom_dsp_rtcd();
@@ -715,331 +194,6 @@
   av1_init_wedge_masks();
 }
 
-static void dealloc_context_buffers_ext(MBMIExtFrameBufferInfo *mbmi_ext_info) {
-  if (mbmi_ext_info->frame_base) {
-    aom_free(mbmi_ext_info->frame_base);
-    mbmi_ext_info->frame_base = NULL;
-    mbmi_ext_info->alloc_size = 0;
-  }
-}
-
-static void alloc_context_buffers_ext(AV1_COMMON *cm,
-                                      MBMIExtFrameBufferInfo *mbmi_ext_info) {
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-
-  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
-  const int mi_alloc_rows =
-      (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
-  const int mi_alloc_cols =
-      (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
-  const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols;
-
-  if (new_ext_mi_size > mbmi_ext_info->alloc_size) {
-    dealloc_context_buffers_ext(mbmi_ext_info);
-    CHECK_MEM_ERROR(
-        cm, mbmi_ext_info->frame_base,
-        aom_calloc(new_ext_mi_size, sizeof(*mbmi_ext_info->frame_base)));
-    mbmi_ext_info->alloc_size = new_ext_mi_size;
-  }
-  // The stride needs to be updated regardless of whether new allocation
-  // happened or not.
-  mbmi_ext_info->stride = mi_alloc_cols;
-}
-
-static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
-  pars->num_cr_points = 0;
-  pars->cr_mult = 0;
-  pars->cr_luma_mult = 0;
-  memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
-  memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
-  pars->num_cb_points = 0;
-  pars->cb_mult = 0;
-  pars->cb_luma_mult = 0;
-  pars->chroma_scaling_from_luma = 0;
-  memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
-  memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
-}
-
-static void update_film_grain_parameters(struct AV1_COMP *cpi,
-                                         const AV1EncoderConfig *oxcf) {
-  AV1_COMMON *const cm = &cpi->common;
-  cpi->oxcf = *oxcf;
-
-  if (cpi->film_grain_table) {
-    aom_film_grain_table_free(cpi->film_grain_table);
-    aom_free(cpi->film_grain_table);
-    cpi->film_grain_table = NULL;
-  }
-
-  if (oxcf->film_grain_test_vector) {
-    cm->seq_params.film_grain_params_present = 1;
-    if (cm->current_frame.frame_type == KEY_FRAME) {
-      memcpy(&cm->film_grain_params,
-             film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
-             sizeof(cm->film_grain_params));
-      if (oxcf->monochrome)
-        reset_film_grain_chroma_params(&cm->film_grain_params);
-      cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
-      if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
-        cm->film_grain_params.clip_to_restricted_range = 0;
-      }
-    }
-  } else if (oxcf->film_grain_table_filename) {
-    cm->seq_params.film_grain_params_present = 1;
-
-    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
-    memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
-
-    aom_film_grain_table_read(cpi->film_grain_table,
-                              oxcf->film_grain_table_filename, &cm->error);
-  } else {
-#if CONFIG_DENOISE
-    cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0);
-#else
-    cm->seq_params.film_grain_params_present = 0;
-#endif
-    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
-  }
-}
-
-static void dealloc_compressor_data(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-
-  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
-
-  aom_free(cpi->tile_data);
-  cpi->tile_data = NULL;
-
-  // Delete sementation map
-  aom_free(cpi->enc_seg.map);
-  cpi->enc_seg.map = NULL;
-
-  av1_cyclic_refresh_free(cpi->cyclic_refresh);
-  cpi->cyclic_refresh = NULL;
-
-  aom_free(cpi->active_map.map);
-  cpi->active_map.map = NULL;
-
-  aom_free(cpi->ssim_rdmult_scaling_factors);
-  cpi->ssim_rdmult_scaling_factors = NULL;
-
-  aom_free(cpi->tpl_rdmult_scaling_factors);
-  cpi->tpl_rdmult_scaling_factors = NULL;
-
-  aom_free(cpi->tpl_sb_rdmult_scaling_factors);
-  cpi->tpl_sb_rdmult_scaling_factors = NULL;
-
-#if CONFIG_TUNE_VMAF
-  aom_free(cpi->vmaf_rdmult_scaling_factors);
-  cpi->vmaf_rdmult_scaling_factors = NULL;
-#endif
-
-  aom_free(cpi->td.mb.above_pred_buf);
-  cpi->td.mb.above_pred_buf = NULL;
-
-  aom_free(cpi->td.mb.left_pred_buf);
-  cpi->td.mb.left_pred_buf = NULL;
-
-  aom_free(cpi->td.mb.wsrc_buf);
-  cpi->td.mb.wsrc_buf = NULL;
-
-  aom_free(cpi->td.mb.inter_modes_info);
-  cpi->td.mb.inter_modes_info = NULL;
-
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++) {
-      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
-      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
-    }
-  aom_free(cpi->td.mb.mask_buf);
-  cpi->td.mb.mask_buf = NULL;
-
-  aom_free(cm->tpl_mvs);
-  cm->tpl_mvs = NULL;
-
-  aom_free(cpi->td.mb.mbmi_ext);
-  cpi->td.mb.mbmi_ext = NULL;
-
-  if (cpi->td.vt64x64) {
-    aom_free(cpi->td.vt64x64);
-    cpi->td.vt64x64 = NULL;
-  }
-
-  av1_free_ref_frame_buffers(cm->buffer_pool);
-  av1_free_txb_buf(cpi);
-  av1_free_context_buffers(cm);
-
-  aom_free_frame_buffer(&cpi->last_frame_uf);
-  av1_free_restoration_buffers(cm);
-  aom_free_frame_buffer(&cpi->trial_frame_rst);
-  aom_free_frame_buffer(&cpi->scaled_source);
-  aom_free_frame_buffer(&cpi->scaled_last_source);
-  aom_free_frame_buffer(&cpi->alt_ref_buffer);
-  av1_lookahead_destroy(cpi->lookahead);
-
-  aom_free(cpi->tile_tok[0][0]);
-  cpi->tile_tok[0][0] = 0;
-
-  aom_free(cpi->tplist[0][0]);
-  cpi->tplist[0][0] = NULL;
-
-  av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
-
-  aom_free(cpi->td.mb.palette_buffer);
-  av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
-  aom_free(cpi->td.mb.tmp_conv_dst);
-  for (int j = 0; j < 2; ++j) {
-    aom_free(cpi->td.mb.tmp_obmc_bufs[j]);
-  }
-
-#if CONFIG_DENOISE
-  if (cpi->denoise_and_model) {
-    aom_denoise_and_model_free(cpi->denoise_and_model);
-    cpi->denoise_and_model = NULL;
-  }
-#endif
-  if (cpi->film_grain_table) {
-    aom_film_grain_table_free(cpi->film_grain_table);
-    cpi->film_grain_table = NULL;
-  }
-
-  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    aom_free(cpi->level_params.level_info[i]);
-  }
-
-  if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi);
-}
-
-static void configure_static_seg_features(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  const RATE_CONTROL *const rc = &cpi->rc;
-  struct segmentation *const seg = &cm->seg;
-
-  int high_q = (int)(rc->avg_q > 48.0);
-  int qi_delta;
-
-  // Disable and clear down for KF
-  if (cm->current_frame.frame_type == KEY_FRAME) {
-    // Clear down the global segmentation map
-    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-    seg->update_map = 0;
-    seg->update_data = 0;
-
-    // Disable segmentation
-    av1_disable_segmentation(seg);
-
-    // Clear down the segment features.
-    av1_clearall_segfeatures(seg);
-  } else if (cpi->refresh_alt_ref_frame) {
-    // If this is an alt ref frame
-    // Clear down the global segmentation map
-    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-    seg->update_map = 0;
-    seg->update_data = 0;
-
-    // Disable segmentation and individual segment features by default
-    av1_disable_segmentation(seg);
-    av1_clearall_segfeatures(seg);
-
-    // If segmentation was enabled set those features needed for the
-    // arf itself.
-    if (seg->enabled) {
-      seg->update_map = 1;
-      seg->update_data = 1;
-
-      qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
-                                    cm->seq_params.bit_depth);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
-
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
-
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-    }
-  } else if (seg->enabled) {
-    // All other frames if segmentation has been enabled
-
-    // First normal frame in a valid gf or alt ref group
-    if (rc->frames_since_golden == 0) {
-      // Set up segment features for normal frames in an arf group
-      if (rc->source_alt_ref_active) {
-        seg->update_map = 0;
-        seg->update_data = 1;
-
-        qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
-                                      cm->seq_params.bit_depth);
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
-
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
-
-        // Segment coding disabled for compred testing
-        if (high_q) {
-          av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-          av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
-          av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
-        }
-      } else {
-        // Disable segmentation and clear down features if alt ref
-        // is not active for this group
-
-        av1_disable_segmentation(seg);
-
-        memset(cpi->enc_seg.map, 0,
-               cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-
-        seg->update_map = 0;
-        seg->update_data = 0;
-
-        av1_clearall_segfeatures(seg);
-      }
-    } else if (rc->is_src_frame_alt_ref) {
-      // Special case where we are coding over the top of a previous
-      // alt ref frame.
-      // Segment coding disabled for compred testing
-
-      // Enable ref frame features for segment 0 as well
-      av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
-      av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
-
-      // All mbs should use ALTREF_FRAME
-      av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
-      av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-      av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
-      av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-
-      // Skip all MBs if high Q (0,0 mv and skip coeffs)
-      if (high_q) {
-        av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
-        av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
-      }
-      // Enable data update
-      seg->update_data = 1;
-    } else {
-      // All other frames.
-
-      // No updates.. leave things as they are.
-      seg->update_map = 0;
-      seg->update_data = 0;
-    }
-  }
-}
-
 static void update_reference_segmentation_map(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
@@ -1056,93 +210,6 @@
   }
 }
 
-static void alloc_altref_frame_buffer(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-
-  // TODO(agrange) Check if ARF is enabled and skip allocation if not.
-  if (aom_realloc_frame_buffer(
-          &cpi->alt_ref_buffer, oxcf->width, oxcf->height,
-          seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
-          cm->features.byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate altref buffer");
-}
-
-static void alloc_util_frame_buffers(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  const int byte_alignment = cm->features.byte_alignment;
-  if (aom_realloc_frame_buffer(
-          &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate last frame buffer");
-
-  if (aom_realloc_frame_buffer(
-          &cpi->trial_frame_rst, cm->superres_upscaled_width,
-          cm->superres_upscaled_height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate trial restored frame buffer");
-
-  if (aom_realloc_frame_buffer(
-          &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate scaled source buffer");
-
-  if (aom_realloc_frame_buffer(
-          &cpi->scaled_last_source, cm->width, cm->height,
-          seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
-          byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate scaled last source buffer");
-}
-
-static void alloc_compressor_data(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-
-  if (av1_alloc_context_buffers(cm, cm->width, cm->height)) {
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate context buffers");
-  }
-
-  int mi_rows_aligned_to_sb =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
-  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
-  if (!is_stat_generation_stage(cpi)) {
-    av1_alloc_txb_buf(cpi);
-
-    alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
-  }
-
-  aom_free(cpi->tile_tok[0][0]);
-  aom_free(cpi->tplist[0][0]);
-
-  if (!is_stat_generation_stage(cpi)) {
-    unsigned int tokens =
-        get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
-                        MAX_SB_SIZE_LOG2, num_planes);
-    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
-                    aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
-
-    CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
-                    aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
-                               sizeof(*cpi->tplist[0][0])));
-  }
-
-  av1_setup_pc_tree(cpi, &cpi->td);
-}
-
 void av1_new_framerate(AV1_COMP *cpi, double framerate) {
   cpi->framerate = framerate < 0.1 ? 30 : framerate;
   av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
@@ -1164,8 +231,8 @@
   return uncompressed_frame_size / (double)encoded_frame_size;
 }
 
-static void set_tile_info(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
+static void set_tile_info(AV1_COMMON *const cm,
+                          const TileConfig *const tile_cfg) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const SequenceHeader *const seq_params = &cm->seq_params;
   CommonTileParams *const tiles = &cm->tiles;
@@ -1174,9 +241,9 @@
   av1_get_tile_limits(cm);
 
   // configure tile columns
-  if (cpi->oxcf.tile_width_count == 0 || cpi->oxcf.tile_height_count == 0) {
+  if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) {
     tiles->uniform_spacing = 1;
-    tiles->log2_cols = AOMMAX(cpi->oxcf.tile_columns, tiles->min_log2_cols);
+    tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols);
     tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols);
   } else {
     int mi_cols =
@@ -1186,8 +253,8 @@
     tiles->uniform_spacing = 0;
     for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
       tiles->col_start_sb[i] = start_sb;
-      size_sb = cpi->oxcf.tile_widths[j++];
-      if (j >= cpi->oxcf.tile_width_count) j = 0;
+      size_sb = tile_cfg->tile_widths[j++];
+      if (j >= tile_cfg->tile_width_count) j = 0;
       start_sb += AOMMIN(size_sb, tiles->max_width_sb);
     }
     tiles->cols = i;
@@ -1198,7 +265,7 @@
 
   // configure tile rows
   if (tiles->uniform_spacing) {
-    tiles->log2_rows = AOMMAX(cpi->oxcf.tile_rows, tiles->min_log2_rows);
+    tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows);
     tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows);
   } else {
     int mi_rows =
@@ -1207,8 +274,8 @@
     int size_sb, j = 0;
     for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
       tiles->row_start_sb[i] = start_sb;
-      size_sb = cpi->oxcf.tile_heights[j++];
-      if (j >= cpi->oxcf.tile_height_count) j = 0;
+      size_sb = tile_cfg->tile_heights[j++];
+      if (j >= tile_cfg->tile_height_count) j = 0;
       start_sb += AOMMIN(size_sb, tiles->max_height_sb);
     }
     tiles->rows = i;
@@ -1228,20 +295,15 @@
   }
   av1_init_mi_buffers(&cm->mi_params);
 
-  av1_init_macroblockd(cm, xd, NULL);
+  av1_init_macroblockd(cm, xd);
 
   if (!is_stat_generation_stage(cpi))
     alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
-  set_tile_info(cpi);
-}
 
-static void init_buffer_indices(ForceIntegerMVInfo *const force_intpel_info,
-                                int *const remapped_ref_idx) {
-  int fb_idx;
-  for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
-    remapped_ref_idx[fb_idx] = fb_idx;
-  force_intpel_info->rate_index = 0;
-  force_intpel_info->rate_size = 0;
+  if (!cpi->seq_params_locked)
+    set_sb_size(&cm->seq_params, av1_select_sb_size(cpi));
+
+  set_tile_info(cm, &cpi->oxcf.tile_cfg);
 }
 
 static INLINE int does_level_match(int width, int height, double fps,
@@ -1258,47 +320,47 @@
 }
 
 static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
-                                     const AV1EncoderConfig *oxcf) {
+                                     int width, int height,
+                                     double init_framerate) {
   // TODO(any): This is a placeholder function that only addresses dimensions
   // and max display sample rates.
   // Need to add checks for max bit rate, max decoded luma sample rate, header
   // rate, etc. that are not covered by this function.
   AV1_LEVEL level = SEQ_LEVEL_MAX;
-  if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512,
-                       288, 30.0, 4)) {
+  if (does_level_match(width, height, init_framerate, 512, 288, 30.0, 4)) {
     level = SEQ_LEVEL_2_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              704, 396, 30.0, 4)) {
+  } else if (does_level_match(width, height, init_framerate, 704, 396, 30.0,
+                              4)) {
     level = SEQ_LEVEL_2_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              1088, 612, 30.0, 4)) {
+  } else if (does_level_match(width, height, init_framerate, 1088, 612, 30.0,
+                              4)) {
     level = SEQ_LEVEL_3_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              1376, 774, 30.0, 4)) {
+  } else if (does_level_match(width, height, init_framerate, 1376, 774, 30.0,
+                              4)) {
     level = SEQ_LEVEL_3_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              2048, 1152, 30.0, 3)) {
+  } else if (does_level_match(width, height, init_framerate, 2048, 1152, 30.0,
+                              3)) {
     level = SEQ_LEVEL_4_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              2048, 1152, 60.0, 3)) {
+  } else if (does_level_match(width, height, init_framerate, 2048, 1152, 60.0,
+                              3)) {
     level = SEQ_LEVEL_4_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              4096, 2176, 30.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 4096, 2176, 30.0,
+                              2)) {
     level = SEQ_LEVEL_5_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              4096, 2176, 60.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 4096, 2176, 60.0,
+                              2)) {
     level = SEQ_LEVEL_5_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              4096, 2176, 120.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 4096, 2176, 120.0,
+                              2)) {
     level = SEQ_LEVEL_5_2;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              8192, 4352, 30.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 8192, 4352, 30.0,
+                              2)) {
     level = SEQ_LEVEL_6_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              8192, 4352, 60.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 8192, 4352, 60.0,
+                              2)) {
     level = SEQ_LEVEL_6_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              8192, 4352, 120.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 8192, 4352, 120.0,
+                              2)) {
     level = SEQ_LEVEL_6_2;
   }
 
@@ -1320,18 +382,23 @@
   }
 }
 
-static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
-                                  const AV1EncoderConfig *oxcf, int use_svc) {
-  seq->still_picture = (oxcf->force_video_mode == 0) && (oxcf->limit == 1);
-  seq->reduced_still_picture_hdr = seq->still_picture;
-  seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr;
+void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
+                               const AV1EncoderConfig *oxcf, int use_svc) {
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  const ToolCfg *const tool_cfg = &oxcf->tool_cfg;
+
+  seq->still_picture =
+      !tool_cfg->force_video_mode && (oxcf->input_cfg.limit == 1);
+  seq->reduced_still_picture_hdr =
+      seq->still_picture && !tool_cfg->full_still_picture_hdr;
   seq->force_screen_content_tools = (oxcf->mode == REALTIME) ? 0 : 2;
   seq->force_integer_mv = 2;
-  seq->order_hint_info.enable_order_hint = oxcf->enable_order_hint;
+  seq->order_hint_info.enable_order_hint = tool_cfg->enable_order_hint;
   seq->frame_id_numbers_present_flag =
-      !(seq->still_picture && seq->reduced_still_picture_hdr) &&
-      !oxcf->large_scale_tile && oxcf->error_resilient_mode && !use_svc;
-  if (seq->still_picture && seq->reduced_still_picture_hdr) {
+      !seq->reduced_still_picture_hdr &&
+      !oxcf->tile_cfg.enable_large_scale_tile &&
+      tool_cfg->error_resilient_mode && !use_svc;
+  if (seq->reduced_still_picture_hdr) {
     seq->order_hint_info.enable_order_hint = 0;
     seq->force_screen_content_tools = 2;
     seq->force_integer_mv = 2;
@@ -1341,11 +408,12 @@
           ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1
           : -1;
 
-  seq->max_frame_width =
-      oxcf->forced_max_frame_width ? oxcf->forced_max_frame_width : oxcf->width;
-  seq->max_frame_height = oxcf->forced_max_frame_height
-                              ? oxcf->forced_max_frame_height
-                              : oxcf->height;
+  seq->max_frame_width = frm_dim_cfg->forced_max_frame_width
+                             ? frm_dim_cfg->forced_max_frame_width
+                             : frm_dim_cfg->width;
+  seq->max_frame_height = frm_dim_cfg->forced_max_frame_height
+                              ? frm_dim_cfg->forced_max_frame_height
+                              : frm_dim_cfg->height;
   seq->num_bits_width =
       (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1;
   seq->num_bits_height =
@@ -1356,23 +424,25 @@
   seq->frame_id_length = FRAME_ID_LENGTH;
   seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
 
-  seq->enable_dual_filter = oxcf->enable_dual_filter;
-  seq->order_hint_info.enable_dist_wtd_comp = oxcf->enable_dist_wtd_comp;
+  seq->enable_dual_filter = tool_cfg->enable_dual_filter;
+  seq->order_hint_info.enable_dist_wtd_comp =
+      oxcf->comp_type_cfg.enable_dist_wtd_comp;
   seq->order_hint_info.enable_dist_wtd_comp &=
       seq->order_hint_info.enable_order_hint;
-  seq->order_hint_info.enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs;
+  seq->order_hint_info.enable_ref_frame_mvs = tool_cfg->ref_frame_mvs_present;
   seq->order_hint_info.enable_ref_frame_mvs &=
       seq->order_hint_info.enable_order_hint;
-  seq->enable_superres = oxcf->enable_superres;
-  seq->enable_cdef = oxcf->enable_cdef;
-  seq->enable_restoration = oxcf->enable_restoration;
-  seq->enable_warped_motion = oxcf->enable_warped_motion;
-  seq->enable_interintra_compound = oxcf->enable_interintra_comp;
-  seq->enable_masked_compound = oxcf->enable_masked_comp;
-  seq->enable_intra_edge_filter = oxcf->enable_intra_edge_filter;
-  seq->enable_filter_intra = oxcf->enable_filter_intra;
+  seq->enable_superres = oxcf->superres_cfg.enable_superres;
+  seq->enable_cdef = tool_cfg->enable_cdef;
+  seq->enable_restoration = tool_cfg->enable_restoration;
+  seq->enable_warped_motion = oxcf->motion_mode_cfg.enable_warped_motion;
+  seq->enable_interintra_compound = tool_cfg->enable_interintra_comp;
+  seq->enable_masked_compound = oxcf->comp_type_cfg.enable_masked_comp;
+  seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter;
+  seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra;
 
-  set_bitstream_level_tier(seq, cm, oxcf);
+  set_bitstream_level_tier(seq, cm, frm_dim_cfg->width, frm_dim_cfg->height,
+                           oxcf->input_cfg.init_framerate);
 
   if (seq->operating_points_cnt_minus_1 == 0) {
     seq->operating_point_idc[0] = 0;
@@ -1399,36 +469,37 @@
   AV1_COMMON *const cm = &cpi->common;
   SequenceHeader *const seq_params = &cm->seq_params;
   ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
-
+  const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+  const ColorCfg *const color_cfg = &oxcf->color_cfg;
   cpi->oxcf = *oxcf;
-  cpi->framerate = oxcf->init_framerate;
+  cpi->framerate = oxcf->input_cfg.init_framerate;
 
   seq_params->profile = oxcf->profile;
-  seq_params->bit_depth = oxcf->bit_depth;
+  seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
   seq_params->use_highbitdepth = oxcf->use_highbitdepth;
-  seq_params->color_primaries = oxcf->color_primaries;
-  seq_params->transfer_characteristics = oxcf->transfer_characteristics;
-  seq_params->matrix_coefficients = oxcf->matrix_coefficients;
-  seq_params->monochrome = oxcf->monochrome;
-  seq_params->chroma_sample_position = oxcf->chroma_sample_position;
-  seq_params->color_range = oxcf->color_range;
-  seq_params->timing_info_present = oxcf->timing_info_present;
+  seq_params->color_primaries = color_cfg->color_primaries;
+  seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+  seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+  seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+  seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+  seq_params->color_range = color_cfg->color_range;
+  seq_params->timing_info_present = dec_model_cfg->timing_info_present;
   seq_params->timing_info.num_units_in_display_tick =
-      oxcf->timing_info.num_units_in_display_tick;
-  seq_params->timing_info.time_scale = oxcf->timing_info.time_scale;
+      dec_model_cfg->timing_info.num_units_in_display_tick;
+  seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
   seq_params->timing_info.equal_picture_interval =
-      oxcf->timing_info.equal_picture_interval;
+      dec_model_cfg->timing_info.equal_picture_interval;
   seq_params->timing_info.num_ticks_per_picture =
-      oxcf->timing_info.num_ticks_per_picture;
+      dec_model_cfg->timing_info.num_ticks_per_picture;
 
   seq_params->display_model_info_present_flag =
-      oxcf->display_model_info_present_flag;
+      dec_model_cfg->display_model_info_present_flag;
   seq_params->decoder_model_info_present_flag =
-      oxcf->decoder_model_info_present_flag;
-  if (oxcf->decoder_model_info_present_flag) {
+      dec_model_cfg->decoder_model_info_present_flag;
+  if (dec_model_cfg->decoder_model_info_present_flag) {
     // set the decoder model parameters in schedule mode
     seq_params->decoder_model_info.num_units_in_decoding_tick =
-        oxcf->buffer_model.num_units_in_decoding_tick;
+        dec_model_cfg->num_units_in_decoding_tick;
     cm->buffer_removal_time_present = 1;
     av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
     av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
@@ -1459,8 +530,8 @@
       seq_params->subsampling_y = 0;
     } else {
       if (seq_params->bit_depth == AOM_BITS_12) {
-        seq_params->subsampling_x = oxcf->chroma_subsampling_x;
-        seq_params->subsampling_y = oxcf->chroma_subsampling_y;
+        seq_params->subsampling_x = oxcf->input_cfg.chroma_subsampling_x;
+        seq_params->subsampling_y = oxcf->input_cfg.chroma_subsampling_y;
       } else {
         seq_params->subsampling_x = 1;
         seq_params->subsampling_y = 0;
@@ -1468,13 +539,13 @@
     }
   }
 
-  cm->width = oxcf->width;
-  cm->height = oxcf->height;
+  cm->width = oxcf->frm_dim_cfg.width;
+  cm->height = oxcf->frm_dim_cfg.height;
   set_sb_size(seq_params,
-              select_sb_size(cpi));  // set sb size before allocations
+              av1_select_sb_size(cpi));  // set sb size before allocations
   alloc_compressor_data(cpi);
 
-  update_film_grain_parameters(cpi, oxcf);
+  av1_update_film_grain_parameters(cpi, oxcf);
 
   // Single thread case: use counts in common.
   cpi->td.counts = &cpi->counts;
@@ -1500,1321 +571,59 @@
   resize_pending_params->height = 0;
 
   init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx);
-}
 
-static void set_rc_buffer_sizes(RATE_CONTROL *rc,
-                                const AV1EncoderConfig *oxcf) {
-  const int64_t bandwidth = oxcf->target_bandwidth;
-  const int64_t starting = oxcf->starting_buffer_level_ms;
-  const int64_t optimal = oxcf->optimal_buffer_level_ms;
-  const int64_t maximum = oxcf->maximum_buffer_size_ms;
-
-  rc->starting_buffer_level = starting * bandwidth / 1000;
-  rc->optimal_buffer_level =
-      (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
-  rc->maximum_buffer_size =
-      (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
-}
-
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
-  cpi->fn_ptr[BT].sdf = SDF;                                           \
-  cpi->fn_ptr[BT].sdaf = SDAF;                                         \
-  cpi->fn_ptr[BT].vf = VF;                                             \
-  cpi->fn_ptr[BT].svf = SVF;                                           \
-  cpi->fn_ptr[BT].svaf = SVAF;                                         \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                                     \
-  cpi->fn_ptr[BT].jsdaf = JSDAF;                                       \
-  cpi->fn_ptr[BT].jsvaf = JSVAF;
-
-#define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
-  static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
-                                     int source_stride,                        \
-                                     const uint8_t *ref_ptr, int ref_stride) { \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride);                \
-  }                                                                            \
-  static unsigned int fnname##_bits10(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride) {                                                        \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2;           \
-  }                                                                            \
-  static unsigned int fnname##_bits12(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride) {                                                        \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4;           \
-  }
-
-#define MAKE_BFP_SADAVG_WRAPPER(fnname)                                        \
-  static unsigned int fnname##_bits8(                                          \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *second_pred) {                            \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred);   \
-  }                                                                            \
-  static unsigned int fnname##_bits10(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *second_pred) {                            \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
-           2;                                                                  \
-  }                                                                            \
-  static unsigned int fnname##_bits12(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *second_pred) {                            \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
-           4;                                                                  \
-  }
-
-#define MAKE_BFP_SAD4D_WRAPPER(fnname)                                        \
-  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
-                             const uint8_t *const ref_ptr[], int ref_stride,  \
-                             unsigned int *sad_array) {                       \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
-  }                                                                           \
-  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride,      \
-                              const uint8_t *const ref_ptr[], int ref_stride, \
-                              unsigned int *sad_array) {                      \
-    int i;                                                                    \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
-    for (i = 0; i < 4; i++) sad_array[i] >>= 2;                               \
-  }                                                                           \
-  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride,      \
-                              const uint8_t *const ref_ptr[], int ref_stride, \
-                              unsigned int *sad_array) {                      \
-    int i;                                                                    \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
-    for (i = 0; i < 4; i++) sad_array[i] >>= 4;                               \
-  }
-
-#define MAKE_BFP_JSADAVG_WRAPPER(fnname)                                    \
-  static unsigned int fnname##_bits8(                                       \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
-      int ref_stride, const uint8_t *second_pred,                           \
-      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
-                  jcp_param);                                               \
-  }                                                                         \
-  static unsigned int fnname##_bits10(                                      \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
-      int ref_stride, const uint8_t *second_pred,                           \
-      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
-                  jcp_param) >>                                             \
-           2;                                                               \
-  }                                                                         \
-  static unsigned int fnname##_bits12(                                      \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
-      int ref_stride, const uint8_t *second_pred,                           \
-      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
-                  jcp_param) >>                                             \
-           4;                                                               \
-  }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
-
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
-
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
-  cpi->fn_ptr[BT].msdf = MCSDF;       \
-  cpi->fn_ptr[BT].msvf = MCSVF;
-
-#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname)                           \
-  static unsigned int fnname##_bits8(                                    \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
-      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
-      int m_stride, int invert_mask) {                                   \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
-                  second_pred_ptr, m, m_stride, invert_mask);            \
-  }                                                                      \
-  static unsigned int fnname##_bits10(                                   \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
-      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
-      int m_stride, int invert_mask) {                                   \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
-                  second_pred_ptr, m, m_stride, invert_mask) >>          \
-           2;                                                            \
-  }                                                                      \
-  static unsigned int fnname##_bits12(                                   \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
-      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
-      int m_stride, int invert_mask) {                                   \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
-                  second_pred_ptr, m, m_stride, invert_mask) >>          \
-           4;                                                            \
-  }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
-#endif
-
-#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
-  cpi->fn_ptr[BT].osdf = OSDF;           \
-  cpi->fn_ptr[BT].ovf = OVF;             \
-  cpi->fn_ptr[BT].osvf = OSVF;
-
-#define MAKE_OBFP_SAD_WRAPPER(fnname)                                     \
-  static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride,  \
-                                     const int32_t *wsrc,                 \
-                                     const int32_t *msk) {                \
-    return fnname(ref, ref_stride, wsrc, msk);                            \
-  }                                                                       \
-  static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
-                                      const int32_t *wsrc,                \
-                                      const int32_t *msk) {               \
-    return fnname(ref, ref_stride, wsrc, msk) >> 2;                       \
-  }                                                                       \
-  static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
-                                      const int32_t *wsrc,                \
-                                      const int32_t *msk) {               \
-    return fnname(ref, ref_stride, wsrc, msk) >> 4;                       \
-  }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
-
-static void highbd_set_var_fns(AV1_COMP *const cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  if (cm->seq_params.use_highbitdepth) {
-    switch (cm->seq_params.bit_depth) {
-      case AOM_BITS_8:
-        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8,
-                   aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16,
-                   aom_highbd_8_sub_pixel_variance64x16,
-                   aom_highbd_8_sub_pixel_avg_variance64x16,
-                   aom_highbd_sad64x16x4d_bits8,
-                   aom_highbd_dist_wtd_sad64x16_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16)
-
-        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8,
-                   aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64,
-                   aom_highbd_8_sub_pixel_variance16x64,
-                   aom_highbd_8_sub_pixel_avg_variance16x64,
-                   aom_highbd_sad16x64x4d_bits8,
-                   aom_highbd_dist_wtd_sad16x64_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64)
-
-        HIGHBD_BFP(
-            BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8,
-            aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8,
-            aom_highbd_8_sub_pixel_avg_variance32x8,
-            aom_highbd_sad32x8x4d_bits8, aom_highbd_dist_wtd_sad32x8_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8)
-
-        HIGHBD_BFP(
-            BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8,
-            aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32,
-            aom_highbd_8_sub_pixel_avg_variance8x32,
-            aom_highbd_sad8x32x4d_bits8, aom_highbd_dist_wtd_sad8x32_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32)
-
-        HIGHBD_BFP(
-            BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8,
-            aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4,
-            aom_highbd_8_sub_pixel_avg_variance16x4,
-            aom_highbd_sad16x4x4d_bits8, aom_highbd_dist_wtd_sad16x4_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4)
-
-        HIGHBD_BFP(
-            BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8,
-            aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16,
-            aom_highbd_8_sub_pixel_avg_variance4x16,
-            aom_highbd_sad4x16x4d_bits8, aom_highbd_dist_wtd_sad4x16_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16)
-
-        HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
-                   aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
-                   aom_highbd_8_sub_pixel_variance32x16,
-                   aom_highbd_8_sub_pixel_avg_variance32x16,
-                   aom_highbd_sad32x16x4d_bits8,
-                   aom_highbd_dist_wtd_sad32x16_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16)
-
-        HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
-                   aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
-                   aom_highbd_8_sub_pixel_variance16x32,
-                   aom_highbd_8_sub_pixel_avg_variance16x32,
-                   aom_highbd_sad16x32x4d_bits8,
-                   aom_highbd_dist_wtd_sad16x32_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32)
-
-        HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
-                   aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
-                   aom_highbd_8_sub_pixel_variance64x32,
-                   aom_highbd_8_sub_pixel_avg_variance64x32,
-                   aom_highbd_sad64x32x4d_bits8,
-                   aom_highbd_dist_wtd_sad64x32_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32)
-
-        HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
-                   aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
-                   aom_highbd_8_sub_pixel_variance32x64,
-                   aom_highbd_8_sub_pixel_avg_variance32x64,
-                   aom_highbd_sad32x64x4d_bits8,
-                   aom_highbd_dist_wtd_sad32x64_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64)
-
-        HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
-                   aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
-                   aom_highbd_8_sub_pixel_variance32x32,
-                   aom_highbd_8_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x4d_bits8,
-                   aom_highbd_dist_wtd_sad32x32_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32)
-
-        HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
-                   aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
-                   aom_highbd_8_sub_pixel_variance64x64,
-                   aom_highbd_8_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x4d_bits8,
-                   aom_highbd_dist_wtd_sad64x64_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64)
-
-        HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
-                   aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
-                   aom_highbd_8_sub_pixel_variance16x16,
-                   aom_highbd_8_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x4d_bits8,
-                   aom_highbd_dist_wtd_sad16x16_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16)
-
-        HIGHBD_BFP(
-            BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
-            aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
-            aom_highbd_8_sub_pixel_avg_variance16x8,
-            aom_highbd_sad16x8x4d_bits8, aom_highbd_dist_wtd_sad16x8_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8)
-
-        HIGHBD_BFP(
-            BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
-            aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
-            aom_highbd_8_sub_pixel_avg_variance8x16,
-            aom_highbd_sad8x16x4d_bits8, aom_highbd_dist_wtd_sad8x16_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16)
-
-        HIGHBD_BFP(
-            BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8,
-            aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8,
-            aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x4d_bits8,
-            aom_highbd_dist_wtd_sad8x8_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8)
-
-        HIGHBD_BFP(
-            BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8,
-            aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4,
-            aom_highbd_8_sub_pixel_avg_variance8x4, aom_highbd_sad8x4x4d_bits8,
-            aom_highbd_dist_wtd_sad8x4_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4)
-
-        HIGHBD_BFP(
-            BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8,
-            aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8,
-            aom_highbd_8_sub_pixel_avg_variance4x8, aom_highbd_sad4x8x4d_bits8,
-            aom_highbd_dist_wtd_sad4x8_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8)
-
-        HIGHBD_BFP(
-            BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8,
-            aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4,
-            aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x4d_bits8,
-            aom_highbd_dist_wtd_sad4x4_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4)
-
-        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8,
-                   aom_highbd_sad128x128_avg_bits8,
-                   aom_highbd_8_variance128x128,
-                   aom_highbd_8_sub_pixel_variance128x128,
-                   aom_highbd_8_sub_pixel_avg_variance128x128,
-                   aom_highbd_sad128x128x4d_bits8,
-                   aom_highbd_dist_wtd_sad128x128_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128)
-
-        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
-                   aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
-                   aom_highbd_8_sub_pixel_variance128x64,
-                   aom_highbd_8_sub_pixel_avg_variance128x64,
-                   aom_highbd_sad128x64x4d_bits8,
-                   aom_highbd_dist_wtd_sad128x64_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64)
-
-        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
-                   aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
-                   aom_highbd_8_sub_pixel_variance64x128,
-                   aom_highbd_8_sub_pixel_avg_variance64x128,
-                   aom_highbd_sad64x128x4d_bits8,
-                   aom_highbd_dist_wtd_sad64x128_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128)
-
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance64x128)
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance4x4)
-        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance64x16)
-        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x64)
-        HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x8)
-        HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance8x32)
-        HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x4)
-        HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance4x16)
-        HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
-                    aom_highbd_obmc_variance128x128,
-                    aom_highbd_obmc_sub_pixel_variance128x128)
-        HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8,
-                    aom_highbd_obmc_variance128x64,
-                    aom_highbd_obmc_sub_pixel_variance128x64)
-        HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8,
-                    aom_highbd_obmc_variance64x128,
-                    aom_highbd_obmc_sub_pixel_variance64x128)
-        HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8,
-                    aom_highbd_obmc_variance64x64,
-                    aom_highbd_obmc_sub_pixel_variance64x64)
-        HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8,
-                    aom_highbd_obmc_variance64x32,
-                    aom_highbd_obmc_sub_pixel_variance64x32)
-        HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8,
-                    aom_highbd_obmc_variance32x64,
-                    aom_highbd_obmc_sub_pixel_variance32x64)
-        HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8,
-                    aom_highbd_obmc_variance32x32,
-                    aom_highbd_obmc_sub_pixel_variance32x32)
-        HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8,
-                    aom_highbd_obmc_variance32x16,
-                    aom_highbd_obmc_sub_pixel_variance32x16)
-        HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8,
-                    aom_highbd_obmc_variance16x32,
-                    aom_highbd_obmc_sub_pixel_variance16x32)
-        HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8,
-                    aom_highbd_obmc_variance16x16,
-                    aom_highbd_obmc_sub_pixel_variance16x16)
-        HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8,
-                    aom_highbd_obmc_variance8x16,
-                    aom_highbd_obmc_sub_pixel_variance8x16)
-        HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8,
-                    aom_highbd_obmc_variance16x8,
-                    aom_highbd_obmc_sub_pixel_variance16x8)
-        HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8,
-                    aom_highbd_obmc_variance8x8,
-                    aom_highbd_obmc_sub_pixel_variance8x8)
-        HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8,
-                    aom_highbd_obmc_variance4x8,
-                    aom_highbd_obmc_sub_pixel_variance4x8)
-        HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8,
-                    aom_highbd_obmc_variance8x4,
-                    aom_highbd_obmc_sub_pixel_variance8x4)
-        HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
-                    aom_highbd_obmc_variance4x4,
-                    aom_highbd_obmc_sub_pixel_variance4x4)
-        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8,
-                    aom_highbd_obmc_variance64x16,
-                    aom_highbd_obmc_sub_pixel_variance64x16)
-        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8,
-                    aom_highbd_obmc_variance16x64,
-                    aom_highbd_obmc_sub_pixel_variance16x64)
-        HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8,
-                    aom_highbd_obmc_variance32x8,
-                    aom_highbd_obmc_sub_pixel_variance32x8)
-        HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8,
-                    aom_highbd_obmc_variance8x32,
-                    aom_highbd_obmc_sub_pixel_variance8x32)
-        HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8,
-                    aom_highbd_obmc_variance16x4,
-                    aom_highbd_obmc_sub_pixel_variance16x4)
-        HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8,
-                    aom_highbd_obmc_variance4x16,
-                    aom_highbd_obmc_sub_pixel_variance4x16)
-        break;
-
-      case AOM_BITS_10:
-        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10,
-                   aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16,
-                   aom_highbd_10_sub_pixel_variance64x16,
-                   aom_highbd_10_sub_pixel_avg_variance64x16,
-                   aom_highbd_sad64x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad64x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16);
-
-        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
-                   aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
-                   aom_highbd_10_sub_pixel_variance16x64,
-                   aom_highbd_10_sub_pixel_avg_variance16x64,
-                   aom_highbd_sad16x64x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x64_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64);
-
-        HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
-                   aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
-                   aom_highbd_10_sub_pixel_variance32x8,
-                   aom_highbd_10_sub_pixel_avg_variance32x8,
-                   aom_highbd_sad32x8x4d_bits10,
-                   aom_highbd_dist_wtd_sad32x8_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8);
-
-        HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
-                   aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
-                   aom_highbd_10_sub_pixel_variance8x32,
-                   aom_highbd_10_sub_pixel_avg_variance8x32,
-                   aom_highbd_sad8x32x4d_bits10,
-                   aom_highbd_dist_wtd_sad8x32_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32);
-
-        HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
-                   aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
-                   aom_highbd_10_sub_pixel_variance16x4,
-                   aom_highbd_10_sub_pixel_avg_variance16x4,
-                   aom_highbd_sad16x4x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x4_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4);
-
-        HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
-                   aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
-                   aom_highbd_10_sub_pixel_variance4x16,
-                   aom_highbd_10_sub_pixel_avg_variance4x16,
-                   aom_highbd_sad4x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad4x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16);
-
-        HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
-                   aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
-                   aom_highbd_10_sub_pixel_variance32x16,
-                   aom_highbd_10_sub_pixel_avg_variance32x16,
-                   aom_highbd_sad32x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad32x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16);
-
-        HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
-                   aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
-                   aom_highbd_10_sub_pixel_variance16x32,
-                   aom_highbd_10_sub_pixel_avg_variance16x32,
-                   aom_highbd_sad16x32x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x32_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32);
-
-        HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
-                   aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
-                   aom_highbd_10_sub_pixel_variance64x32,
-                   aom_highbd_10_sub_pixel_avg_variance64x32,
-                   aom_highbd_sad64x32x4d_bits10,
-                   aom_highbd_dist_wtd_sad64x32_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32);
-
-        HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
-                   aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
-                   aom_highbd_10_sub_pixel_variance32x64,
-                   aom_highbd_10_sub_pixel_avg_variance32x64,
-                   aom_highbd_sad32x64x4d_bits10,
-                   aom_highbd_dist_wtd_sad32x64_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64);
-
-        HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
-                   aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
-                   aom_highbd_10_sub_pixel_variance32x32,
-                   aom_highbd_10_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x4d_bits10,
-                   aom_highbd_dist_wtd_sad32x32_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32);
-
-        HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
-                   aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
-                   aom_highbd_10_sub_pixel_variance64x64,
-                   aom_highbd_10_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x4d_bits10,
-                   aom_highbd_dist_wtd_sad64x64_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64);
-
-        HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
-                   aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
-                   aom_highbd_10_sub_pixel_variance16x16,
-                   aom_highbd_10_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16);
-
-        HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
-                   aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
-                   aom_highbd_10_sub_pixel_variance16x8,
-                   aom_highbd_10_sub_pixel_avg_variance16x8,
-                   aom_highbd_sad16x8x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x8_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8);
-
-        HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
-                   aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
-                   aom_highbd_10_sub_pixel_variance8x16,
-                   aom_highbd_10_sub_pixel_avg_variance8x16,
-                   aom_highbd_sad8x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad8x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16);
-
-        HIGHBD_BFP(
-            BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
-            aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
-            aom_highbd_10_sub_pixel_avg_variance8x8,
-            aom_highbd_sad8x8x4d_bits10, aom_highbd_dist_wtd_sad8x8_avg_bits10,
-            aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8);
-
-        HIGHBD_BFP(
-            BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
-            aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
-            aom_highbd_10_sub_pixel_avg_variance8x4,
-            aom_highbd_sad8x4x4d_bits10, aom_highbd_dist_wtd_sad8x4_avg_bits10,
-            aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4);
-
-        HIGHBD_BFP(
-            BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
-            aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
-            aom_highbd_10_sub_pixel_avg_variance4x8,
-            aom_highbd_sad4x8x4d_bits10, aom_highbd_dist_wtd_sad4x8_avg_bits10,
-            aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8);
-
-        HIGHBD_BFP(
-            BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
-            aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
-            aom_highbd_10_sub_pixel_avg_variance4x4,
-            aom_highbd_sad4x4x4d_bits10, aom_highbd_dist_wtd_sad4x4_avg_bits10,
-            aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4);
-
-        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
-                   aom_highbd_sad128x128_avg_bits10,
-                   aom_highbd_10_variance128x128,
-                   aom_highbd_10_sub_pixel_variance128x128,
-                   aom_highbd_10_sub_pixel_avg_variance128x128,
-                   aom_highbd_sad128x128x4d_bits10,
-                   aom_highbd_dist_wtd_sad128x128_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128);
-
-        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
-                   aom_highbd_sad128x64_avg_bits10,
-                   aom_highbd_10_variance128x64,
-                   aom_highbd_10_sub_pixel_variance128x64,
-                   aom_highbd_10_sub_pixel_avg_variance128x64,
-                   aom_highbd_sad128x64x4d_bits10,
-                   aom_highbd_dist_wtd_sad128x64_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64);
-
-        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
-                   aom_highbd_sad64x128_avg_bits10,
-                   aom_highbd_10_variance64x128,
-                   aom_highbd_10_sub_pixel_variance64x128,
-                   aom_highbd_10_sub_pixel_avg_variance64x128,
-                   aom_highbd_sad64x128x4d_bits10,
-                   aom_highbd_dist_wtd_sad64x128_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128);
-
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance64x128)
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance4x4)
-        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance64x16)
-        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x64)
-        HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x8)
-        HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance8x32)
-        HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x4)
-        HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance4x16)
-        HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
-                    aom_highbd_10_obmc_variance128x128,
-                    aom_highbd_10_obmc_sub_pixel_variance128x128)
-        HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
-                    aom_highbd_10_obmc_variance128x64,
-                    aom_highbd_10_obmc_sub_pixel_variance128x64)
-        HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
-                    aom_highbd_10_obmc_variance64x128,
-                    aom_highbd_10_obmc_sub_pixel_variance64x128)
-        HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
-                    aom_highbd_10_obmc_variance64x64,
-                    aom_highbd_10_obmc_sub_pixel_variance64x64)
-        HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10,
-                    aom_highbd_10_obmc_variance64x32,
-                    aom_highbd_10_obmc_sub_pixel_variance64x32)
-        HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10,
-                    aom_highbd_10_obmc_variance32x64,
-                    aom_highbd_10_obmc_sub_pixel_variance32x64)
-        HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10,
-                    aom_highbd_10_obmc_variance32x32,
-                    aom_highbd_10_obmc_sub_pixel_variance32x32)
-        HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10,
-                    aom_highbd_10_obmc_variance32x16,
-                    aom_highbd_10_obmc_sub_pixel_variance32x16)
-        HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10,
-                    aom_highbd_10_obmc_variance16x32,
-                    aom_highbd_10_obmc_sub_pixel_variance16x32)
-        HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10,
-                    aom_highbd_10_obmc_variance16x16,
-                    aom_highbd_10_obmc_sub_pixel_variance16x16)
-        HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10,
-                    aom_highbd_10_obmc_variance8x16,
-                    aom_highbd_10_obmc_sub_pixel_variance8x16)
-        HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10,
-                    aom_highbd_10_obmc_variance16x8,
-                    aom_highbd_10_obmc_sub_pixel_variance16x8)
-        HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10,
-                    aom_highbd_10_obmc_variance8x8,
-                    aom_highbd_10_obmc_sub_pixel_variance8x8)
-        HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10,
-                    aom_highbd_10_obmc_variance4x8,
-                    aom_highbd_10_obmc_sub_pixel_variance4x8)
-        HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10,
-                    aom_highbd_10_obmc_variance8x4,
-                    aom_highbd_10_obmc_sub_pixel_variance8x4)
-        HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
-                    aom_highbd_10_obmc_variance4x4,
-                    aom_highbd_10_obmc_sub_pixel_variance4x4)
-
-        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10,
-                    aom_highbd_10_obmc_variance64x16,
-                    aom_highbd_10_obmc_sub_pixel_variance64x16)
-
-        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits10,
-                    aom_highbd_10_obmc_variance16x64,
-                    aom_highbd_10_obmc_sub_pixel_variance16x64)
-
-        HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits10,
-                    aom_highbd_10_obmc_variance32x8,
-                    aom_highbd_10_obmc_sub_pixel_variance32x8)
-
-        HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits10,
-                    aom_highbd_10_obmc_variance8x32,
-                    aom_highbd_10_obmc_sub_pixel_variance8x32)
-
-        HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits10,
-                    aom_highbd_10_obmc_variance16x4,
-                    aom_highbd_10_obmc_sub_pixel_variance16x4)
-
-        HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10,
-                    aom_highbd_10_obmc_variance4x16,
-                    aom_highbd_10_obmc_sub_pixel_variance4x16)
-        break;
-
-      case AOM_BITS_12:
-        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12,
-                   aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16,
-                   aom_highbd_12_sub_pixel_variance64x16,
-                   aom_highbd_12_sub_pixel_avg_variance64x16,
-                   aom_highbd_sad64x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad64x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16);
-
-        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
-                   aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
-                   aom_highbd_12_sub_pixel_variance16x64,
-                   aom_highbd_12_sub_pixel_avg_variance16x64,
-                   aom_highbd_sad16x64x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x64_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64);
-
-        HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
-                   aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
-                   aom_highbd_12_sub_pixel_variance32x8,
-                   aom_highbd_12_sub_pixel_avg_variance32x8,
-                   aom_highbd_sad32x8x4d_bits12,
-                   aom_highbd_dist_wtd_sad32x8_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8);
-
-        HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
-                   aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
-                   aom_highbd_12_sub_pixel_variance8x32,
-                   aom_highbd_12_sub_pixel_avg_variance8x32,
-                   aom_highbd_sad8x32x4d_bits12,
-                   aom_highbd_dist_wtd_sad8x32_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32);
-
-        HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
-                   aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
-                   aom_highbd_12_sub_pixel_variance16x4,
-                   aom_highbd_12_sub_pixel_avg_variance16x4,
-                   aom_highbd_sad16x4x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x4_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4);
-
-        HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
-                   aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
-                   aom_highbd_12_sub_pixel_variance4x16,
-                   aom_highbd_12_sub_pixel_avg_variance4x16,
-                   aom_highbd_sad4x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad4x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16);
-
-        HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
-                   aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
-                   aom_highbd_12_sub_pixel_variance32x16,
-                   aom_highbd_12_sub_pixel_avg_variance32x16,
-                   aom_highbd_sad32x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad32x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16);
-
-        HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
-                   aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
-                   aom_highbd_12_sub_pixel_variance16x32,
-                   aom_highbd_12_sub_pixel_avg_variance16x32,
-                   aom_highbd_sad16x32x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x32_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32);
-
-        HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
-                   aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
-                   aom_highbd_12_sub_pixel_variance64x32,
-                   aom_highbd_12_sub_pixel_avg_variance64x32,
-                   aom_highbd_sad64x32x4d_bits12,
-                   aom_highbd_dist_wtd_sad64x32_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32);
-
-        HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
-                   aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
-                   aom_highbd_12_sub_pixel_variance32x64,
-                   aom_highbd_12_sub_pixel_avg_variance32x64,
-                   aom_highbd_sad32x64x4d_bits12,
-                   aom_highbd_dist_wtd_sad32x64_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64);
-
-        HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
-                   aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
-                   aom_highbd_12_sub_pixel_variance32x32,
-                   aom_highbd_12_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x4d_bits12,
-                   aom_highbd_dist_wtd_sad32x32_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32);
-
-        HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
-                   aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
-                   aom_highbd_12_sub_pixel_variance64x64,
-                   aom_highbd_12_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x4d_bits12,
-                   aom_highbd_dist_wtd_sad64x64_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64);
-
-        HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
-                   aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
-                   aom_highbd_12_sub_pixel_variance16x16,
-                   aom_highbd_12_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16);
-
-        HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
-                   aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
-                   aom_highbd_12_sub_pixel_variance16x8,
-                   aom_highbd_12_sub_pixel_avg_variance16x8,
-                   aom_highbd_sad16x8x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x8_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8);
-
-        HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
-                   aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
-                   aom_highbd_12_sub_pixel_variance8x16,
-                   aom_highbd_12_sub_pixel_avg_variance8x16,
-                   aom_highbd_sad8x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad8x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16);
-
-        HIGHBD_BFP(
-            BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
-            aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
-            aom_highbd_12_sub_pixel_avg_variance8x8,
-            aom_highbd_sad8x8x4d_bits12, aom_highbd_dist_wtd_sad8x8_avg_bits12,
-            aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8);
-
-        HIGHBD_BFP(
-            BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
-            aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
-            aom_highbd_12_sub_pixel_avg_variance8x4,
-            aom_highbd_sad8x4x4d_bits12, aom_highbd_dist_wtd_sad8x4_avg_bits12,
-            aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4);
-
-        HIGHBD_BFP(
-            BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
-            aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
-            aom_highbd_12_sub_pixel_avg_variance4x8,
-            aom_highbd_sad4x8x4d_bits12, aom_highbd_dist_wtd_sad4x8_avg_bits12,
-            aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8);
-
-        HIGHBD_BFP(
-            BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
-            aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
-            aom_highbd_12_sub_pixel_avg_variance4x4,
-            aom_highbd_sad4x4x4d_bits12, aom_highbd_dist_wtd_sad4x4_avg_bits12,
-            aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4);
-
-        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
-                   aom_highbd_sad128x128_avg_bits12,
-                   aom_highbd_12_variance128x128,
-                   aom_highbd_12_sub_pixel_variance128x128,
-                   aom_highbd_12_sub_pixel_avg_variance128x128,
-                   aom_highbd_sad128x128x4d_bits12,
-                   aom_highbd_dist_wtd_sad128x128_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128);
-
-        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
-                   aom_highbd_sad128x64_avg_bits12,
-                   aom_highbd_12_variance128x64,
-                   aom_highbd_12_sub_pixel_variance128x64,
-                   aom_highbd_12_sub_pixel_avg_variance128x64,
-                   aom_highbd_sad128x64x4d_bits12,
-                   aom_highbd_dist_wtd_sad128x64_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64);
-
-        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
-                   aom_highbd_sad64x128_avg_bits12,
-                   aom_highbd_12_variance64x128,
-                   aom_highbd_12_sub_pixel_variance64x128,
-                   aom_highbd_12_sub_pixel_avg_variance64x128,
-                   aom_highbd_sad64x128x4d_bits12,
-                   aom_highbd_dist_wtd_sad64x128_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128);
-
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance64x128)
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance4x4)
-        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance64x16)
-        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x64)
-        HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x8)
-        HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance8x32)
-        HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x4)
-        HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance4x16)
-        HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
-                    aom_highbd_12_obmc_variance128x128,
-                    aom_highbd_12_obmc_sub_pixel_variance128x128)
-        HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
-                    aom_highbd_12_obmc_variance128x64,
-                    aom_highbd_12_obmc_sub_pixel_variance128x64)
-        HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
-                    aom_highbd_12_obmc_variance64x128,
-                    aom_highbd_12_obmc_sub_pixel_variance64x128)
-        HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
-                    aom_highbd_12_obmc_variance64x64,
-                    aom_highbd_12_obmc_sub_pixel_variance64x64)
-        HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12,
-                    aom_highbd_12_obmc_variance64x32,
-                    aom_highbd_12_obmc_sub_pixel_variance64x32)
-        HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12,
-                    aom_highbd_12_obmc_variance32x64,
-                    aom_highbd_12_obmc_sub_pixel_variance32x64)
-        HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12,
-                    aom_highbd_12_obmc_variance32x32,
-                    aom_highbd_12_obmc_sub_pixel_variance32x32)
-        HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12,
-                    aom_highbd_12_obmc_variance32x16,
-                    aom_highbd_12_obmc_sub_pixel_variance32x16)
-        HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12,
-                    aom_highbd_12_obmc_variance16x32,
-                    aom_highbd_12_obmc_sub_pixel_variance16x32)
-        HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12,
-                    aom_highbd_12_obmc_variance16x16,
-                    aom_highbd_12_obmc_sub_pixel_variance16x16)
-        HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12,
-                    aom_highbd_12_obmc_variance8x16,
-                    aom_highbd_12_obmc_sub_pixel_variance8x16)
-        HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12,
-                    aom_highbd_12_obmc_variance16x8,
-                    aom_highbd_12_obmc_sub_pixel_variance16x8)
-        HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12,
-                    aom_highbd_12_obmc_variance8x8,
-                    aom_highbd_12_obmc_sub_pixel_variance8x8)
-        HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12,
-                    aom_highbd_12_obmc_variance4x8,
-                    aom_highbd_12_obmc_sub_pixel_variance4x8)
-        HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12,
-                    aom_highbd_12_obmc_variance8x4,
-                    aom_highbd_12_obmc_sub_pixel_variance8x4)
-        HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
-                    aom_highbd_12_obmc_variance4x4,
-                    aom_highbd_12_obmc_sub_pixel_variance4x4)
-        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12,
-                    aom_highbd_12_obmc_variance64x16,
-                    aom_highbd_12_obmc_sub_pixel_variance64x16)
-        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12,
-                    aom_highbd_12_obmc_variance16x64,
-                    aom_highbd_12_obmc_sub_pixel_variance16x64)
-        HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12,
-                    aom_highbd_12_obmc_variance32x8,
-                    aom_highbd_12_obmc_sub_pixel_variance32x8)
-        HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12,
-                    aom_highbd_12_obmc_variance8x32,
-                    aom_highbd_12_obmc_sub_pixel_variance8x32)
-        HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12,
-                    aom_highbd_12_obmc_variance16x4,
-                    aom_highbd_12_obmc_sub_pixel_variance16x4)
-        HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12,
-                    aom_highbd_12_obmc_variance4x16,
-                    aom_highbd_12_obmc_sub_pixel_variance4x16)
-        break;
-
-      default:
-        assert(0 &&
-               "cm->seq_params.bit_depth should be AOM_BITS_8, "
-               "AOM_BITS_10 or AOM_BITS_12");
-    }
-  }
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-static void realloc_segmentation_maps(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  CommonModeInfoParams *const mi_params = &cm->mi_params;
-
-  // Create the encoder segmentation map and set all entries to 0
-  aom_free(cpi->enc_seg.map);
-  CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
-                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
-
-  // Create a map used for cyclic background refresh.
-  if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
-  CHECK_MEM_ERROR(
-      cm, cpi->cyclic_refresh,
-      av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
-
-  // Create a map used to mark inactive areas.
-  aom_free(cpi->active_map.map);
-  CHECK_MEM_ERROR(cm, cpi->active_map.map,
-                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
-}
-
-static AOM_INLINE void set_tpl_stats_block_size(int width, int height,
-                                                uint8_t *block_mis_log2) {
-  const int is_720p_or_larger = AOMMIN(width, height) >= 720;
-
-  // 0: 4x4, 1: 8x8, 2: 16x16
-  *block_mis_log2 = is_720p_or_larger ? 2 : 1;
-}
-
-void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
-                                        CompoundTypeRdBuffers *const bufs) {
-  CHECK_MEM_ERROR(
-      cm, bufs->pred0,
-      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
-  CHECK_MEM_ERROR(
-      cm, bufs->pred1,
-      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
-  CHECK_MEM_ERROR(
-      cm, bufs->residual1,
-      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
-  CHECK_MEM_ERROR(
-      cm, bufs->diff10,
-      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
-  CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
-                  (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
-                                        sizeof(*bufs->tmp_best_mask_buf)));
-}
-
-void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs) {
-  aom_free(bufs->pred0);
-  aom_free(bufs->pred1);
-  aom_free(bufs->residual1);
-  aom_free(bufs->diff10);
-  aom_free(bufs->tmp_best_mask_buf);
-  av1_zero(*bufs);  // Set all pointers to NULL for safety.
-}
-
-static void config_target_level(AV1_COMP *const cpi, AV1_LEVEL target_level,
-                                int tier) {
-  aom_clear_system_state();
-
-  AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  SequenceHeader *const seq_params = &cpi->common.seq_params;
-
-  // Adjust target bitrate to be no larger than 70% of level limit.
-  const BITSTREAM_PROFILE profile = seq_params->profile;
-  const double level_bitrate_limit =
-      av1_get_max_bitrate_for_level(target_level, tier, profile);
-  const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
-  oxcf->target_bandwidth = AOMMIN(oxcf->target_bandwidth, max_bitrate);
-  // Also need to update cpi->twopass.bits_left.
-  TWO_PASS *const twopass = &cpi->twopass;
-  FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
-  if (stats != NULL)
-    cpi->twopass.bits_left =
-        (int64_t)(stats->duration * cpi->oxcf.target_bandwidth / 10000000.0);
-
-  // Adjust max over-shoot percentage.
-  oxcf->over_shoot_pct = 0;
-
-  // Adjust max quantizer.
-  oxcf->worst_allowed_q = 255;
-
-  // Adjust number of tiles and tile columns to be under level limit.
-  int max_tiles, max_tile_cols;
-  av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
-  while (oxcf->tile_columns > 0 && (1 << oxcf->tile_columns) > max_tile_cols) {
-    --oxcf->tile_columns;
-  }
-  const int tile_cols = (1 << oxcf->tile_columns);
-  while (oxcf->tile_rows > 0 &&
-         tile_cols * (1 << oxcf->tile_rows) > max_tiles) {
-    --oxcf->tile_rows;
-  }
-
-  // Adjust min compression ratio.
-  const int still_picture = seq_params->still_picture;
-  const double min_cr =
-      av1_get_min_cr_for_level(target_level, tier, still_picture);
-  oxcf->min_cr = AOMMAX(oxcf->min_cr, (unsigned int)(min_cr * 100));
+  av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
 }
 
 void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
   SequenceHeader *const seq_params = &cm->seq_params;
-  const int num_planes = av1_num_planes(cm);
   RATE_CONTROL *const rc = &cpi->rc;
   MACROBLOCK *const x = &cpi->td.mb;
   AV1LevelParams *const level_params = &cpi->level_params;
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
+  RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
+  const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+  const ColorCfg *const color_cfg = &oxcf->color_cfg;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+  // in case of LAP, lag in frames is set according to number of lap buffers
+  // calculated at init time. This stores and restores LAP's lag in frames to
+  // prevent override by new cfg.
+  int lap_lag_in_frames = -1;
+  if (cpi->lap_enabled && cpi->compressor_stage == LAP_STAGE) {
+    lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames;
+  }
 
   if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
-  seq_params->bit_depth = oxcf->bit_depth;
-  seq_params->color_primaries = oxcf->color_primaries;
-  seq_params->transfer_characteristics = oxcf->transfer_characteristics;
-  seq_params->matrix_coefficients = oxcf->matrix_coefficients;
-  seq_params->monochrome = oxcf->monochrome;
-  seq_params->chroma_sample_position = oxcf->chroma_sample_position;
-  seq_params->color_range = oxcf->color_range;
+  seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
+  seq_params->color_primaries = color_cfg->color_primaries;
+  seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+  seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+  seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+  seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+  seq_params->color_range = color_cfg->color_range;
 
   assert(IMPLIES(seq_params->profile <= PROFILE_1,
                  seq_params->bit_depth <= AOM_BITS_10));
 
-  seq_params->timing_info_present = oxcf->timing_info_present;
+  seq_params->timing_info_present = dec_model_cfg->timing_info_present;
   seq_params->timing_info.num_units_in_display_tick =
-      oxcf->timing_info.num_units_in_display_tick;
-  seq_params->timing_info.time_scale = oxcf->timing_info.time_scale;
+      dec_model_cfg->timing_info.num_units_in_display_tick;
+  seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
   seq_params->timing_info.equal_picture_interval =
-      oxcf->timing_info.equal_picture_interval;
+      dec_model_cfg->timing_info.equal_picture_interval;
   seq_params->timing_info.num_ticks_per_picture =
-      oxcf->timing_info.num_ticks_per_picture;
+      dec_model_cfg->timing_info.num_ticks_per_picture;
 
   seq_params->display_model_info_present_flag =
-      oxcf->display_model_info_present_flag;
+      dec_model_cfg->display_model_info_present_flag;
   seq_params->decoder_model_info_present_flag =
-      oxcf->decoder_model_info_present_flag;
-  if (oxcf->decoder_model_info_present_flag) {
+      dec_model_cfg->decoder_model_info_present_flag;
+  if (dec_model_cfg->decoder_model_info_present_flag) {
     // set the decoder model parameters in schedule mode
     seq_params->decoder_model_info.num_units_in_decoding_tick =
-        oxcf->buffer_model.num_units_in_decoding_tick;
+        dec_model_cfg->num_units_in_decoding_tick;
     cm->buffer_removal_time_present = 1;
     av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
     av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
@@ -2828,10 +637,17 @@
         10;  // Default value (not signaled)
   }
 
-  update_film_grain_parameters(cpi, oxcf);
+  av1_update_film_grain_parameters(cpi, oxcf);
 
   cpi->oxcf = *oxcf;
-  cpi->superres_mode = oxcf->superres_mode;  // default
+  // When user provides superres_mode = AOM_SUPERRES_AUTO, we still initialize
+  // superres mode for current encoding = AOM_SUPERRES_NONE. This is to ensure
+  // that any analysis (e.g. TPL) happening outside the main encoding loop still
+  // happens at full resolution.
+  // This value will later be set appropriately just before main encoding loop.
+  cpi->superres_mode = oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO
+                           ? AOM_SUPERRES_NONE
+                           : oxcf->superres_cfg.superres_mode;  // default
   x->e_mbd.bd = (int)seq_params->bit_depth;
   x->e_mbd.global_motion = cm->global_motion;
 
@@ -2856,19 +672,20 @@
                         seq_params->tier[0]);
   }
 
-  if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
+  if ((has_no_stats_stage(cpi)) && (rc_cfg->mode == AOM_Q)) {
     rc->baseline_gf_interval = FIXED_GF_INTERVAL;
   } else {
     rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
   }
 
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_bwd_ref_frame = 0;
+  refresh_frame_flags->golden_frame = false;
+  refresh_frame_flags->bwd_ref_frame = false;
 
-  cm->features.refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
-                                           ? REFRESH_FRAME_CONTEXT_DISABLED
-                                           : REFRESH_FRAME_CONTEXT_BACKWARD;
-  if (oxcf->large_scale_tile)
+  cm->features.refresh_frame_context =
+      (oxcf->tool_cfg.frame_parallel_decoding_mode)
+          ? REFRESH_FRAME_CONTEXT_DISABLED
+          : REFRESH_FRAME_CONTEXT_BACKWARD;
+  if (oxcf->tile_cfg.enable_large_scale_tile)
     cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
   if (x->palette_buffer == NULL) {
@@ -2877,7 +694,7 @@
   }
 
   if (x->comp_rd_buffer.pred0 == NULL) {
-    av1_alloc_compound_type_rd_buffers(cm, &x->comp_rd_buffer);
+    alloc_compound_type_rd_buffers(cm, &x->comp_rd_buffer);
   }
 
   if (x->tmp_conv_dst == NULL) {
@@ -2887,11 +704,11 @@
     x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
   }
   for (int i = 0; i < 2; ++i) {
-    if (x->tmp_obmc_bufs[i] == NULL) {
-      CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
+    if (x->tmp_pred_bufs[i] == NULL) {
+      CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
                       aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                           sizeof(*x->tmp_obmc_bufs[i])));
-      x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i];
+                                           sizeof(*x->tmp_pred_bufs[i])));
+      x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i];
     }
   }
 
@@ -2899,7 +716,7 @@
 
   av1_set_high_precision_mv(cpi, 1, 0);
 
-  set_rc_buffer_sizes(rc, &cpi->oxcf);
+  set_rc_buffer_sizes(rc, rc_cfg);
 
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
@@ -2910,49 +727,53 @@
   av1_new_framerate(cpi, cpi->framerate);
 
   // Set absolute upper and lower quality limits
-  rc->worst_quality = cpi->oxcf.worst_allowed_q;
-  rc->best_quality = cpi->oxcf.best_allowed_q;
+  rc->worst_quality = rc_cfg->worst_allowed_q;
+  rc->best_quality = rc_cfg->best_allowed_q;
 
   cm->features.interp_filter =
-      oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+      oxcf->tile_cfg.enable_large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
   cm->features.switchable_motion_mode = 1;
 
-  if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
-    cm->render_width = cpi->oxcf.render_width;
-    cm->render_height = cpi->oxcf.render_height;
+  if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) {
+    cm->render_width = frm_dim_cfg->render_width;
+    cm->render_height = frm_dim_cfg->render_height;
   } else {
-    cm->render_width = cpi->oxcf.width;
-    cm->render_height = cpi->oxcf.height;
+    cm->render_width = frm_dim_cfg->width;
+    cm->render_height = frm_dim_cfg->height;
   }
-  cm->width = cpi->oxcf.width;
-  cm->height = cpi->oxcf.height;
+  cm->width = frm_dim_cfg->width;
+  cm->height = frm_dim_cfg->height;
 
   int sb_size = seq_params->sb_size;
   // Superblock size should not be updated after the first key frame.
   if (!cpi->seq_params_locked) {
-    set_sb_size(&cm->seq_params, select_sb_size(cpi));
+    set_sb_size(&cm->seq_params, av1_select_sb_size(cpi));
     for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
       seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
   }
 
-  if (cpi->initial_width || sb_size != seq_params->sb_size) {
-    if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
+  if (initial_dimensions->width || sb_size != seq_params->sb_size) {
+    if (cm->width > initial_dimensions->width ||
+        cm->height > initial_dimensions->height ||
         seq_params->sb_size != sb_size) {
       av1_free_context_buffers(cm);
-      av1_free_pc_tree(cpi, &cpi->td, num_planes, (BLOCK_SIZE)sb_size);
+      av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+      av1_free_sms_tree(&cpi->td);
+      av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+      cpi->td.firstpass_ctx = NULL;
       alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
-      cpi->initial_width = cpi->initial_height = 0;
+      initial_dimensions->width = initial_dimensions->height = 0;
     }
   }
   update_frame_size(cpi);
 
   rc->is_src_frame_alt_ref = 0;
 
-  set_tile_info(cpi);
+  set_tile_info(cm, &cpi->oxcf.tile_cfg);
 
   if (!cpi->svc.external_ref_frame_config)
-    cpi->ext_flags.refresh_frame_flags_pending = 0;
+    cpi->ext_flags.refresh_frame.update_pending = 0;
   cpi->ext_flags.refresh_frame_context_pending = 0;
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -2966,51 +787,16 @@
         (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1)
             ? cm->number_spatial_layers * cm->number_temporal_layers - 1
             : 0;
-    init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc);
+    av1_init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc);
   }
 
   if (cpi->use_svc)
-    av1_update_layer_context_change_config(cpi, oxcf->target_bandwidth);
-}
+    av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth);
 
-static INLINE void setup_tpl_buffers(AV1_COMMON *const cm,
-                                     TplParams *const tpl_data) {
-  CommonModeInfoParams *const mi_params = &cm->mi_params;
-  set_tpl_stats_block_size(cm->width, cm->height,
-                           &tpl_data->tpl_stats_block_mis_log2);
-  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
-
-  for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
-    const int mi_cols =
-        ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
-    const int mi_rows =
-        ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
-
-    tpl_data->tpl_stats_buffer[frame].is_valid = 0;
-    tpl_data->tpl_stats_buffer[frame].width = mi_cols >> block_mis_log2;
-    tpl_data->tpl_stats_buffer[frame].height = mi_rows >> block_mis_log2;
-    tpl_data->tpl_stats_buffer[frame].stride =
-        tpl_data->tpl_stats_buffer[frame].width;
-    tpl_data->tpl_stats_buffer[frame].mi_rows = mi_params->mi_rows;
-    tpl_data->tpl_stats_buffer[frame].mi_cols = mi_params->mi_cols;
+  // restore the value of lag_in_frame for LAP stage.
+  if (lap_lag_in_frames != -1) {
+    cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
   }
-
-  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
-    CHECK_MEM_ERROR(
-        cm, tpl_data->tpl_stats_pool[frame],
-        aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
-                       tpl_data->tpl_stats_buffer[frame].height,
-                   sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
-    if (aom_alloc_frame_buffer(
-            &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
-            cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-            cm->seq_params.use_highbitdepth, AOM_ENC_NO_SCALE_BORDER,
-            cm->features.byte_alignment))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                         "Failed to allocate frame buffer");
-  }
-
-  tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
 }
 
 static INLINE void init_frame_info(FRAME_INFO *frame_info,
@@ -3075,15 +861,29 @@
 
   init_config(cpi, oxcf);
   if (cpi->compressor_stage == LAP_STAGE) {
-    cpi->oxcf.lag_in_frames = lap_lag_in_frames;
+    cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
   }
 
+  cpi->frames_left = cpi->oxcf.input_cfg.limit;
+
   av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
-  cpi->rc.enable_scenecut_detection = 1;
-  if (cpi->lap_enabled &&
-      (num_lap_buffers < (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)))
-    cpi->rc.enable_scenecut_detection = 0;
+  // For two pass and lag_in_frames > 33 in LAP.
+  cpi->rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2;
+  if (cpi->lap_enabled) {
+    if ((num_lap_buffers <
+         (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) &&
+        num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) {
+      /*
+       * For lag in frames >= 19 and <33, enable scenecut
+       * with limited future frame prediction.
+       */
+      cpi->rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1;
+    } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) {
+      // Disable scenecut when lag_in_frames < 19.
+      cpi->rc.enable_scenecut_detection = DISABLE_SCENECUT;
+    }
+  }
   init_frame_info(&cpi->frame_info, cm);
 
   cm->current_frame.frame_number = 0;
@@ -3094,28 +894,35 @@
   cpi->last_show_frame_buf = NULL;
   realloc_segmentation_maps(cpi);
 
-  cpi->refresh_alt_ref_frame = 0;
+  cpi->refresh_frame.alt_ref_frame = false;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_blockiness = 1;
   cpi->b_calculate_consistency = 1;
   cpi->total_inconsistency = 0;
-  cpi->psnr.worst = 100.0;
+  cpi->psnr[0].worst = 100.0;
+  cpi->psnr[1].worst = 100.0;
   cpi->worst_ssim = 100.0;
+  cpi->worst_ssim_hbd = 100.0;
 
-  cpi->count = 0;
+  cpi->count[0] = 0;
+  cpi->count[1] = 0;
   cpi->bytes = 0;
 #if CONFIG_SPEED_STATS
   cpi->tx_search_count = 0;
 #endif  // CONFIG_SPEED_STATS
 
   if (cpi->b_calculate_psnr) {
-    cpi->total_sq_error = 0;
-    cpi->total_samples = 0;
+    cpi->total_sq_error[0] = 0;
+    cpi->total_samples[0] = 0;
+    cpi->total_sq_error[1] = 0;
+    cpi->total_samples[1] = 0;
     cpi->tot_recode_hits = 0;
     cpi->summed_quality = 0;
     cpi->summed_weights = 0;
+    cpi->summed_quality_hbd = 0;
+    cpi->summed_weights_hbd = 0;
   }
 
   cpi->fastssim.worst = 100.0;
@@ -3138,11 +945,8 @@
   av1_zero(aggregate_fc);
 #endif  // CONFIG_ENTROPY_STATS
 
-  cpi->time_stamps.first_ever = INT64_MAX;
+  cpi->time_stamps.first_ts_start = INT64_MAX;
 
-#ifdef OUTPUT_YUV_SKINMAP
-  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
-#endif
 #ifdef OUTPUT_YUV_REC
   yuv_rec_file = fopen("rec.yuv", "wb");
 #endif
@@ -3158,12 +962,12 @@
 #if !CONFIG_REALTIME_ONLY
   if (is_stat_consumption_stage(cpi)) {
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+    const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz);
 
     if (!cpi->lap_enabled) {
       /*Re-initialize to stats buffer, populated by application in the case of
        * two pass*/
-      cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->two_pass_stats_in.buf;
+      cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->twopass_stats_in.buf;
       cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
       cpi->twopass.stats_buf_ctx->stats_in_end =
           &cpi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
@@ -3175,20 +979,7 @@
   }
 #endif
 
-  int sb_mi_size = av1_get_sb_mi_size(cm);
-
-  CHECK_MEM_ERROR(
-      cm, cpi->td.mb.above_pred_buf,
-      (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                      sizeof(*cpi->td.mb.above_pred_buf)));
-  CHECK_MEM_ERROR(
-      cm, cpi->td.mb.left_pred_buf,
-      (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                      sizeof(*cpi->td.mb.left_pred_buf)));
-
-  CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
-                  (int32_t *)aom_memalign(
-                      16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+  alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm);
 
   CHECK_MEM_ERROR(
       cm, cpi->td.mb.inter_modes_info,
@@ -3204,16 +995,13 @@
 
   cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
 
-  CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
-                  (int32_t *)aom_memalign(
-                      16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
-
-  CHECK_MEM_ERROR(cm, cpi->td.mb.mbmi_ext,
-                  aom_calloc(sb_mi_size, sizeof(*cpi->td.mb.mbmi_ext)));
-
   av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
   av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
 
+  CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+                  aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2,
+                             sizeof(*cpi->consec_zero_mv)));
+
   {
     const int bsize = BLOCK_16X16;
     const int w = mi_size_wide[bsize];
@@ -3246,15 +1034,24 @@
     const int h = mi_size_high[bsize];
     const int num_cols = (mi_params->mi_cols + w - 1) / w;
     const int num_rows = (mi_params->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->vmaf_rdmult_scaling_factors,
+    CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors,
                     aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->vmaf_rdmult_scaling_factors)));
-    cpi->last_frame_unsharp_amount = 0.0;
+                               sizeof(*cpi->vmaf_info.rdmult_scaling_factors)));
+    for (int i = 0; i < MAX_ARF_LAYERS; i++) {
+      cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0;
+      cpi->vmaf_info.last_frame_ysse[i] = -1.0;
+      cpi->vmaf_info.last_frame_vmaf[i] = -1.0;
+    }
+    cpi->vmaf_info.original_qindex = -1;
+
+#if CONFIG_USE_VMAF_RC
+    cpi->vmaf_info.vmaf_model = NULL;
+#endif
   }
 #endif
 
   if (!is_stat_generation_stage(cpi)) {
-    setup_tpl_buffers(cm, &cpi->tpl_data);
+    setup_tpl_buffers(cm, &cpi->tpl_data, cpi->oxcf.gf_cfg.lag_in_frames);
   }
 
 #if CONFIG_COLLECT_PARTITION_STATS == 2
@@ -3271,6 +1068,8 @@
   cpi->fn_ptr[BT].jsdaf = JSDAF;                                \
   cpi->fn_ptr[BT].jsvaf = JSVAF;
 
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
   BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
       aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
       aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
@@ -3300,6 +1099,7 @@
       aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
       aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
       aom_dist_wtd_sub_pixel_avg_variance64x16)
+#endif  // !CONFIG_REALTIME_ONLY
 
   BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
       aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
@@ -3377,6 +1177,7 @@
       aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
       aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
 
+#if !CONFIG_REALTIME_ONLY
 #define OBFP(BT, OSDF, OVF, OSVF) \
   cpi->fn_ptr[BT].osdf = OSDF;    \
   cpi->fn_ptr[BT].ovf = OVF;      \
@@ -3426,6 +1227,7 @@
        aom_obmc_sub_pixel_variance16x64)
   OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
        aom_obmc_sub_pixel_variance64x16)
+#endif  // !CONFIG_REALTIME_ONLY
 
 #define MBFP(BT, MCSDF, MCSVF)  \
   cpi->fn_ptr[BT].msdf = MCSDF; \
@@ -3449,17 +1251,45 @@
   MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
   MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
 
+#if !CONFIG_REALTIME_ONLY
   MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
-
   MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
-
   MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
-
   MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
-
   MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
-
   MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+#endif
+
+#define SDSFP(BT, SDSF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdsf = SDSF;   \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+  SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d);
+  SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d);
+  SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d);
+  SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d);
+  SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d);
+
+  SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d);
+  SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d);
+  SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d);
+
+  SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d);
+  SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d);
+  SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d);
+  SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d);
+  SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d);
+
+  SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d);
+
+#if !CONFIG_REALTIME_ONLY
+  SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d);
+  SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d);
+  SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d);
+  SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d);
+  SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d);
+#endif
+#undef SDSFP
 
 #if CONFIG_AV1_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
@@ -3476,10 +1306,11 @@
 
   av1_loop_filter_init(cm);
   cm->superres_scale_denominator = SCALE_NUMERATOR;
-  cm->superres_upscaled_width = oxcf->width;
-  cm->superres_upscaled_height = oxcf->height;
+  cm->superres_upscaled_width = oxcf->frm_dim_cfg.width;
+  cm->superres_upscaled_height = oxcf->frm_dim_cfg.height;
+#if !CONFIG_REALTIME_ONLY
   av1_loop_restoration_precal();
-
+#endif
   cm->error.setjmp = 0;
 
   return cpi;
@@ -3492,16 +1323,54 @@
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
 #endif  // CONFIG_INTERNAL_STATS
 
-void av1_remove_compressor(AV1_COMP *cpi) {
-  AV1_COMMON *cm;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  int t;
+// This function will change the state and free the mutex of corresponding
+// workers and terminate the object. The object can not be re-used unless a call
+// to reset() is made.
+static AOM_INLINE void terminate_worker_data(AV1_COMP *cpi) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int t = mt_info->num_workers - 1; t >= 0; --t) {
+    AVxWorker *const worker = &mt_info->workers[t];
+    aom_get_worker_interface()->end(worker);
+  }
+}
 
+// Deallocate allocated thread_data.
+static AOM_INLINE void free_thread_data(AV1_COMP *cpi) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1_COMMON *cm = &cpi->common;
+  for (int t = 0; t < mt_info->num_workers; ++t) {
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[t];
+    aom_free(thread_data->td->tctx);
+    if (t == 0) continue;
+    aom_free(thread_data->td->palette_buffer);
+    aom_free(thread_data->td->tmp_conv_dst);
+    release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
+    for (int j = 0; j < 2; ++j) {
+      aom_free(thread_data->td->tmp_pred_bufs[j]);
+    }
+    release_obmc_buffers(&thread_data->td->obmc_buffer);
+    aom_free(thread_data->td->vt64x64);
+
+    aom_free(thread_data->td->inter_modes_info);
+    for (int x = 0; x < 2; x++) {
+      for (int y = 0; y < 2; y++) {
+        aom_free(thread_data->td->hash_value_buffer[x][y]);
+        thread_data->td->hash_value_buffer[x][y] = NULL;
+      }
+    }
+    aom_free(thread_data->td->counts);
+    av1_free_pmc(thread_data->td->firstpass_ctx, av1_num_planes(cm));
+    thread_data->td->firstpass_ctx = NULL;
+    av1_free_shared_coeff_buffer(&thread_data->td->shared_coeff_buf);
+    av1_free_sms_tree(thread_data->td);
+    aom_free(thread_data->td);
+  }
+}
+
+void av1_remove_compressor(AV1_COMP *cpi) {
   if (!cpi) return;
 
-  cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-
+  AV1_COMMON *cm = &cpi->common;
   if (cm->current_frame.frame_number > 0) {
 #if CONFIG_ENTROPY_STATS
     if (!is_stat_generation_stage(cpi)) {
@@ -3519,19 +1388,22 @@
       char results[512] = { 0 };
       FILE *f = fopen("opsnr.stt", "a");
       double time_encoded =
-          (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
+          (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) /
           10000000.000;
       double total_encode_time =
           (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
       const double dr =
           (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
-      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
-      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+      const double peak =
+          (double)((1 << cpi->oxcf.input_cfg.input_bit_depth) - 1);
+      const double target_rate =
+          (double)cpi->oxcf.rc_cfg.target_bandwidth / 1000;
       const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
 
       if (cpi->b_calculate_psnr) {
-        const double total_psnr = aom_sse_to_psnr(
-            (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
+        const double total_psnr =
+            aom_sse_to_psnr((double)cpi->total_samples[0], peak,
+                            (double)cpi->total_sq_error[0]);
         const double total_ssim =
             100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
         snprintf(headings, sizeof(headings),
@@ -3544,24 +1416,25 @@
                  "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
                  "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
                  "%7.3f\t%7.3f\t%7.3f",
-                 dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr,
-                 cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim,
-                 total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count,
-                 cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst,
-                 cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
-                 cpi->psnr.stat[STAT_Y] / cpi->count,
-                 cpi->psnr.stat[STAT_U] / cpi->count,
-                 cpi->psnr.stat[STAT_V] / cpi->count);
+                 dr, cpi->psnr[0].stat[STAT_ALL] / cpi->count[0], total_psnr,
+                 cpi->psnr[0].stat[STAT_ALL] / cpi->count[0], total_psnr,
+                 total_ssim, total_ssim,
+                 cpi->fastssim.stat[STAT_ALL] / cpi->count[0],
+                 cpi->psnrhvs.stat[STAT_ALL] / cpi->count[0],
+                 cpi->psnr[0].worst, cpi->worst_ssim, cpi->fastssim.worst,
+                 cpi->psnrhvs.worst, cpi->psnr[0].stat[STAT_Y] / cpi->count[0],
+                 cpi->psnr[0].stat[STAT_U] / cpi->count[0],
+                 cpi->psnr[0].stat[STAT_V] / cpi->count[0]);
 
         if (cpi->b_calculate_blockiness) {
           SNPRINT(headings, "\t  Block\tWstBlck");
-          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count[0]);
           SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
         }
 
         if (cpi->b_calculate_consistency) {
           double consistency =
-              aom_sse_to_psnr((double)cpi->total_samples, peak,
+              aom_sse_to_psnr((double)cpi->total_samples[0], peak,
                               (double)cpi->total_inconsistency);
 
           SNPRINT(headings, "\tConsist\tWstCons");
@@ -3569,16 +1442,52 @@
           SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
         }
 
-        SNPRINT(headings, "\t    Time\tRcErr\tAbsErr");
+        SNPRINT(headings, "\t   Time\tRcErr\tAbsErr");
         SNPRINT2(results, "\t%8.0f", total_encode_time);
-        SNPRINT2(results, "\t%7.2f", rate_err);
-        SNPRINT2(results, "\t%7.2f", fabs(rate_err));
+        SNPRINT2(results, " %7.2f", rate_err);
+        SNPRINT2(results, " %7.2f", fabs(rate_err));
 
-        fprintf(f, "%s\tAPsnr611\n", headings);
-        fprintf(f, "%s\t%7.3f\n", results,
-                (6 * cpi->psnr.stat[STAT_Y] + cpi->psnr.stat[STAT_U] +
-                 cpi->psnr.stat[STAT_V]) /
-                    (cpi->count * 8));
+        SNPRINT(headings, "\tAPsnr611");
+        SNPRINT2(results, " %7.3f",
+                 (6 * cpi->psnr[0].stat[STAT_Y] + cpi->psnr[0].stat[STAT_U] +
+                  cpi->psnr[0].stat[STAT_V]) /
+                     (cpi->count[0] * 8));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+        const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+        const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+        if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+            (in_bit_depth < bit_depth)) {
+          const double peak_hbd = (double)((1 << bit_depth) - 1);
+          const double total_psnr_hbd =
+              aom_sse_to_psnr((double)cpi->total_samples[1], peak_hbd,
+                              (double)cpi->total_sq_error[1]);
+          const double total_ssim_hbd =
+              100 * pow(cpi->summed_quality_hbd / cpi->summed_weights_hbd, 8.0);
+          SNPRINT(headings,
+                  "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH"
+                  " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH"
+                  " AOMSSIMH VPSSIMPH WstSsimH");
+          SNPRINT2(results, "\t%7.3f",
+                   cpi->psnr[1].stat[STAT_ALL] / cpi->count[1]);
+          SNPRINT2(results, "  %7.3f", total_psnr_hbd);
+          SNPRINT2(results, "  %7.3f",
+                   cpi->psnr[1].stat[STAT_ALL] / cpi->count[1]);
+          SNPRINT2(results, "  %7.3f", total_psnr_hbd);
+          SNPRINT2(results, "  %7.3f",
+                   cpi->psnr[1].stat[STAT_Y] / cpi->count[1]);
+          SNPRINT2(results, "  %7.3f",
+                   cpi->psnr[1].stat[STAT_U] / cpi->count[1]);
+          SNPRINT2(results, "  %7.3f",
+                   cpi->psnr[1].stat[STAT_V] / cpi->count[1]);
+          SNPRINT2(results, "  %7.3f", cpi->psnr[1].worst);
+          SNPRINT2(results, "  %7.3f", total_ssim_hbd);
+          SNPRINT2(results, "  %7.3f", total_ssim_hbd);
+          SNPRINT2(results, "  %7.3f", cpi->worst_ssim_hbd);
+        }
+#endif
+        fprintf(f, "%s\n", headings);
+        fprintf(f, "%s\n", results);
       }
 
       fclose(f);
@@ -3597,60 +1506,46 @@
 #endif
   }
 
+  TplParams *const tpl_data = &cpi->tpl_data;
   for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
     aom_free(tpl_data->tpl_stats_pool[frame]);
     aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
   }
 
-  for (t = cpi->num_workers - 1; t >= 0; --t) {
-    AVxWorker *const worker = &cpi->workers[t];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
-
-    // Deallocate allocated threads.
-    aom_get_worker_interface()->end(worker);
-
-    // Deallocate allocated thread data.
-    aom_free(thread_data->td->tctx);
-    if (t > 0) {
-      aom_free(thread_data->td->palette_buffer);
-      aom_free(thread_data->td->tmp_conv_dst);
-      av1_release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
-      for (int j = 0; j < 2; ++j) {
-        aom_free(thread_data->td->tmp_obmc_bufs[j]);
-      }
-      aom_free(thread_data->td->above_pred_buf);
-      aom_free(thread_data->td->left_pred_buf);
-      aom_free(thread_data->td->wsrc_buf);
-      aom_free(thread_data->td->vt64x64);
-
-      aom_free(thread_data->td->inter_modes_info);
-      for (int x = 0; x < 2; x++) {
-        for (int y = 0; y < 2; y++) {
-          aom_free(thread_data->td->hash_value_buffer[x][y]);
-          thread_data->td->hash_value_buffer[x][y] = NULL;
-        }
-      }
-      aom_free(thread_data->td->mask_buf);
-      aom_free(thread_data->td->counts);
-      av1_free_pc_tree(cpi, thread_data->td, num_planes,
-                       cm->seq_params.sb_size);
-      aom_free(thread_data->td->mbmi_ext);
-      aom_free(thread_data->td);
-    }
+  if (cpi->compressor_stage != LAP_STAGE) {
+    terminate_worker_data(cpi);
+    free_thread_data(cpi);
   }
+
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
 #if CONFIG_MULTITHREAD
-  if (cpi->row_mt_mutex_ != NULL) {
-    pthread_mutex_destroy(cpi->row_mt_mutex_);
-    aom_free(cpi->row_mt_mutex_);
+  pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
+  pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+  if (enc_row_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(enc_row_mt_mutex_);
+    aom_free(enc_row_mt_mutex_);
+  }
+  if (gm_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(gm_mt_mutex_);
+    aom_free(gm_mt_mutex_);
   }
 #endif
   av1_row_mt_mem_dealloc(cpi);
-  aom_free(cpi->tile_thr_data);
-  aom_free(cpi->workers);
+  if (cpi->compressor_stage != LAP_STAGE) {
+    aom_free(mt_info->tile_thr_data);
+    aom_free(mt_info->workers);
+  }
 
-  if (cpi->num_workers > 1) {
-    av1_loop_filter_dealloc(&cpi->lf_row_sync);
-    av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers);
+#if !CONFIG_REALTIME_ONLY
+  av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
+#endif
+  if (mt_info->num_workers > 1) {
+    av1_loop_filter_dealloc(&mt_info->lf_row_sync);
+#if !CONFIG_REALTIME_ONLY
+    av1_loop_restoration_dealloc(&mt_info->lr_row_sync, mt_info->num_workers);
+    av1_gm_dealloc(&mt_info->gm_sync);
+    av1_tf_mt_dealloc(&mt_info->tf_sync);
+#endif
   }
 
   dealloc_compressor_data(cpi);
@@ -3661,16 +1556,10 @@
 #endif  // CONFIG_INTERNAL_STATS
 
   av1_remove_common(cm);
-#if CONFIG_HTB_TRELLIS
-  if (cpi->sf.use_hash_based_trellis) hbt_destroy();
-#endif  // CONFIG_HTB_TRELLIS
   av1_free_ref_frame_buffers(cm->buffer_pool);
 
   aom_free(cpi);
 
-#ifdef OUTPUT_YUV_SKINMAP
-  fclose(yuv_skinmap_file);
-#endif
 #ifdef OUTPUT_YUV_REC
   fclose(yuv_rec_file);
 #endif
@@ -3681,7 +1570,7 @@
   int i;
   PSNR_STATS psnr;
 #if CONFIG_AV1_HIGHBITDEPTH
-  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
   const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
   aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
                        bit_depth, in_bit_depth);
@@ -3694,6 +1583,18 @@
     pkt.data.psnr.sse[i] = psnr.sse[i];
     pkt.data.psnr.psnr[i] = psnr.psnr[i];
   }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+      (in_bit_depth < bit_depth)) {
+    for (i = 0; i < 4; ++i) {
+      pkt.data.psnr.samples_hbd[i] = psnr.samples_hbd[i];
+      pkt.data.psnr.sse_hbd[i] = psnr.sse_hbd[i];
+      pkt.data.psnr.psnr_hbd[i] = psnr.psnr_hbd[i];
+    }
+  }
+#endif
+
   pkt.kind = AOM_CODEC_PSNR_PKT;
   aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
@@ -3729,45 +1630,6 @@
   }
 }
 
-int av1_update_entropy(bool *ext_refresh_frame_context,
-                       bool *ext_refresh_frame_context_pending, bool update) {
-  *ext_refresh_frame_context = update;
-  *ext_refresh_frame_context_pending = 1;
-  return 0;
-}
-
-#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
-// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
-// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
-// not denoise the UV channels at this time. If ever we implement UV channel
-// denoising we will have to modify this.
-void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
-  uint8_t *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1, f);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, f);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, f);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
 #ifdef OUTPUT_YUV_REC
 void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
   uint8_t *src = s->y_buffer;
@@ -3826,144 +1688,6 @@
 }
 #endif  // OUTPUT_YUV_REC
 
-#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
-static int recode_loop_test_global_motion(
-    WarpedMotionParams *const global_motion,
-    const int *const global_motion_used, int *const gm_params_cost) {
-  int i;
-  int recode = 0;
-  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-    if (global_motion[i].wmtype != IDENTITY &&
-        global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
-            gm_params_cost[i]) {
-      global_motion[i] = default_warp_params;
-      assert(global_motion[i].wmtype == IDENTITY);
-      gm_params_cost[i] = 0;
-      recode = 1;
-      // TODO(sarahparker): The earlier condition for recoding here was:
-      // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something
-      // similar to that back to speed up global motion?
-    }
-  }
-  return recode;
-}
-
-// Function to test for conditions that indicate we should loop
-// back and recode a frame.
-static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q,
-                            int maxq, int minq) {
-  const RATE_CONTROL *const rc = &cpi->rc;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
-  int force_recode = 0;
-
-  if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
-      (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) ||
-      (frame_is_kfgfarf &&
-       (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
-    // TODO(agrange) high_limit could be greater than the scale-down threshold.
-    if ((rc->projected_frame_size > high_limit && q < maxq) ||
-        (rc->projected_frame_size < low_limit && q > minq)) {
-      force_recode = 1;
-    } else if (cpi->oxcf.rc_mode == AOM_CQ) {
-      // Deal with frame undershoot and whether or not we are
-      // below the automatically set cq level.
-      if (q > oxcf->cq_level &&
-          rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
-        force_recode = 1;
-      }
-    }
-  }
-  return force_recode;
-}
-
-static void scale_references(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MV_REFERENCE_FRAME ref_frame;
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
-    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
-      BufferPool *const pool = cm->buffer_pool;
-      const YV12_BUFFER_CONFIG *const ref =
-          get_ref_frame_yv12_buf(cm, ref_frame);
-
-      if (ref == NULL) {
-        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
-        continue;
-      }
-
-      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
-        // Replace the reference buffer with a copy having a thicker border,
-        // if the reference buffer is higher resolution than the current
-        // frame, and the border is thin.
-        if ((ref->y_crop_width > cm->width ||
-             ref->y_crop_height > cm->height) &&
-            ref->border < AOM_BORDER_IN_PIXELS) {
-          RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
-          if (aom_yv12_realloc_with_new_border(
-                  &ref_fb->buf, AOM_BORDER_IN_PIXELS,
-                  cm->features.byte_alignment, num_planes) != 0) {
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                               "Failed to allocate frame buffer");
-          }
-        }
-        int force_scaling = 0;
-        RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
-        if (new_fb == NULL) {
-          const int new_fb_idx = get_free_fb(cm);
-          if (new_fb_idx == INVALID_IDX) {
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                               "Unable to find free frame buffer");
-          }
-          force_scaling = 1;
-          new_fb = &pool->frame_bufs[new_fb_idx];
-        }
-
-        if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
-            new_fb->buf.y_crop_height != cm->height) {
-          if (aom_realloc_frame_buffer(
-                  &new_fb->buf, cm->width, cm->height,
-                  cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-                  cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                  cm->features.byte_alignment, NULL, NULL, NULL)) {
-            if (force_scaling) {
-              // Release the reference acquired in the get_free_fb() call above.
-              --new_fb->ref_count;
-            }
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                               "Failed to allocate frame buffer");
-          }
-          av1_resize_and_extend_frame(
-              ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
-          cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
-          alloc_frame_mvs(cm, new_fb);
-        }
-      } else {
-        RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
-        buf->buf.y_crop_width = ref->y_crop_width;
-        buf->buf.y_crop_height = ref->y_crop_height;
-        cpi->scaled_ref_buf[ref_frame - 1] = buf;
-        ++buf->ref_count;
-      }
-    } else {
-      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
-    }
-  }
-}
-
-static void release_scaled_references(AV1_COMP *cpi) {
-  // TODO(isbs): only refresh the necessary frames, rather than all of them
-  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
-    if (buf != NULL) {
-      --buf->ref_count;
-      cpi->scaled_ref_buf[i] = NULL;
-    }
-  }
-}
-
 static void set_mv_search_params(AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
   MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
@@ -3991,8 +1715,7 @@
   }
 }
 
-void av1_set_screen_content_options(const AV1_COMP *cpi,
-                                    FeatureFlags *features) {
+void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {
   const AV1_COMMON *const cm = &cpi->common;
 
   if (cm->seq_params.force_screen_content_tools != 2) {
@@ -4007,7 +1730,7 @@
     return;
   }
 
-  if (cpi->oxcf.content == AOM_CONTENT_SCREEN) {
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
     features->allow_screen_content_tools = features->allow_intrabc = 1;
     return;
   }
@@ -4034,12 +1757,14 @@
 
   for (int r = 0; r + blk_h <= height; r += blk_h) {
     for (int c = 0; c + blk_w <= width; c += blk_w) {
-      int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+      int count_buf[1 << 8];  // Maximum (1 << 8) bins for hbd path.
       const uint8_t *const this_src = src + r * stride + c;
-      const int n_colors =
-          use_hbd ? av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd,
-                                            count_buf)
-                  : av1_count_colors(this_src, stride, blk_w, blk_h, count_buf);
+      int n_colors;
+      if (use_hbd)
+        av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL,
+                                count_buf, &n_colors, NULL);
+      else
+        av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors);
       if (n_colors > 1 && n_colors <= color_thresh) {
         ++counts_1;
         struct buf_2d buf;
@@ -4061,219 +1786,68 @@
   // requires that the block has high variance.
   features->allow_intrabc = features->allow_screen_content_tools &&
                             counts_2 * blk_h * blk_w * 12 > width * height;
+  cpi->use_screen_content_tools = features->allow_screen_content_tools;
+  cpi->is_screen_content_type =
+      features->allow_intrabc ||
+      (counts_1 * blk_h * blk_w * 10 > width * height * 4 &&
+       counts_2 * blk_h * blk_w * 30 > width * height);
 }
 
-static void set_size_independent_vars(AV1_COMP *cpi) {
-  int i;
-  AV1_COMMON *const cm = &cpi->common;
-  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-    cm->global_motion[i] = default_warp_params;
-  }
-  cpi->gm_info.search_done = 0;
+// Function pointer to search site config initialization
+// of different search method functions.
+typedef void (*av1_init_search_site_config)(search_site_config *cfg, int stride,
+                                            int level);
 
-  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
-  av1_set_rd_speed_thresholds(cpi);
-  cm->features.interp_filter = SWITCHABLE;
-  cm->features.switchable_motion_mode = 1;
-}
-
-#if !CONFIG_REALTIME_ONLY
-double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor,
-                                           int frame_count) {
-  double factor = sqrt((double)frame_count);
-  factor = AOMMIN(factor, max_factor);
-  factor = AOMMAX(factor, min_factor);
-  factor = (200.0 + 10.0 * factor);
-  return factor;
-}
-
-static int get_gfu_boost_from_r0_lap(double min_factor, double max_factor,
-                                     double r0, int frames_to_key) {
-  double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
-                                                      frames_to_key);
-  const int boost = (int)rint(factor / r0);
-  return boost;
-}
-
-double av1_get_kf_boost_projection_factor(int frame_count) {
-  double factor = sqrt((double)frame_count);
-  factor = AOMMIN(factor, 10.0);
-  factor = AOMMAX(factor, 4.0);
-  factor = (75.0 + 14.0 * factor);
-  return factor;
-}
-
-static int get_kf_boost_from_r0(double r0, int frames_to_key) {
-  double factor = av1_get_kf_boost_projection_factor(frames_to_key);
-  const int boost = (int)rint(factor / r0);
-  return boost;
-}
-#endif
-
-#define MIN_BOOST_COMBINE_FACTOR 4.0
-#define MAX_BOOST_COMBINE_FACTOR 12.0
-int combine_prior_with_tpl_boost(double min_factor, double max_factor,
-                                 int prior_boost, int tpl_boost,
-                                 int frames_to_key) {
-  double factor = sqrt((double)frames_to_key);
-  double range = max_factor - min_factor;
-  factor = AOMMIN(factor, max_factor);
-  factor = AOMMAX(factor, min_factor);
-  factor -= min_factor;
-  int boost =
-      (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
-  return boost;
-}
-
-#if !CONFIG_REALTIME_ONLY
-static void process_tpl_stats_frame(AV1_COMP *cpi) {
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  AV1_COMMON *const cm = &cpi->common;
-
-  assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size));
-
-  const int tpl_idx = gf_group->index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-
-  if (tpl_frame->is_valid) {
-    int tpl_stride = tpl_frame->stride;
-    int64_t intra_cost_base = 0;
-    int64_t mc_dep_cost_base = 0;
-    int64_t mc_saved_base = 0;
-    int64_t mc_count_base = 0;
-    const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
-    const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-
-    for (int row = 0; row < cm->mi_params.mi_rows; row += step) {
-      for (int col = 0; col < mi_cols_sr; col += step) {
-        TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
-            row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
-        int64_t mc_dep_delta =
-            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
-                   this_stats->mc_dep_dist);
-        intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS);
-        mc_dep_cost_base +=
-            (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
-        mc_count_base += this_stats->mc_count;
-        mc_saved_base += this_stats->mc_saved;
-      }
-    }
-
-    if (mc_dep_cost_base == 0) {
-      tpl_frame->is_valid = 0;
-    } else {
-      aom_clear_system_state();
-      cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
-      if (is_frame_arf_and_tpl_eligible(gf_group)) {
-        cpi->rd.arf_r0 = cpi->rd.r0;
-        if (cpi->lap_enabled) {
-          double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval);
-          const int gfu_boost = get_gfu_boost_from_r0_lap(
-              min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.arf_r0,
-              cpi->rc.num_stats_required_for_gfu_boost);
-          // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
-          //        gfu_boost);
-          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
-              min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost,
-              gfu_boost, cpi->rc.num_stats_used_for_gfu_boost);
-        } else {
-          const int gfu_boost = (int)(200.0 / cpi->rd.r0);
-          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
-              MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
-              cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
-        }
-      } else if (frame_is_intra_only(cm)) {
-        // TODO(debargha): Turn off q adjustment for kf temporarily to
-        // reduce impact on speed of encoding. Need to investigate how
-        // to mitigate the issue.
-        if (cpi->oxcf.rc_mode == AOM_Q) {
-          const int kf_boost =
-              get_kf_boost_from_r0(cpi->rd.r0, cpi->rc.frames_to_key);
-          if (cpi->lap_enabled) {
-            cpi->rc.kf_boost = combine_prior_with_tpl_boost(
-                MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
-                cpi->rc.kf_boost, kf_boost,
-                cpi->rc.num_stats_used_for_kf_boost);
-          } else {
-            cpi->rc.kf_boost = combine_prior_with_tpl_boost(
-                MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
-                cpi->rc.kf_boost, kf_boost, cpi->rc.frames_to_key);
-          }
-        }
-      }
-      cpi->rd.mc_count_base = (double)mc_count_base /
-                              (cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-      cpi->rd.mc_saved_base = (double)mc_saved_base /
-                              (cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-      aom_clear_system_state();
-    }
-  }
-}
-#endif  // !CONFIG_REALTIME_ONLY
-
-static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
-                                    int *top_index) {
-  AV1_COMMON *const cm = &cpi->common;
-
-  // Setup variables that depend on the dimensions of the frame.
-  av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
-
-#if !CONFIG_REALTIME_ONLY
-  if (cpi->oxcf.enable_tpl_model && is_frame_tpl_eligible(cpi)) {
-    process_tpl_stats_frame(cpi);
-    av1_tpl_rdmult_setup(cpi);
-  }
-#endif
-
-  // Decide q and q bounds.
-  *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height,
-                                cpi->gf_group.index, bottom_index, top_index);
-
-  // Configure experimental use of segmentation for enhanced coding of
-  // static regions if indicated.
-  // Only allowed in the second pass of a two pass encode, as it requires
-  // lagged coding, and if the relevant speed feature flag is set.
-  if (is_stat_consumption_stage_twopass(cpi) &&
-      cpi->sf.hl_sf.static_segmentation)
-    configure_static_seg_features(cpi);
-}
+av1_init_search_site_config
+    av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = {
+      av1_init_dsmotion_compensation,     av1_init_motion_compensation_nstep,
+      av1_init_motion_compensation_nstep, av1_init_dsmotion_compensation,
+      av1_init_motion_compensation_hex,   av1_init_motion_compensation_bigdia,
+      av1_init_motion_compensation_square
+    };
 
 static void init_motion_estimation(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
   const int y_stride = cpi->scaled_source.y_stride;
-  const int y_stride_src =
-      ((cpi->oxcf.width != cm->width || cpi->oxcf.height != cm->height) ||
-       av1_superres_scaled(cm))
-          ? y_stride
-          : cpi->lookahead->buf->img.y_stride;
+  const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width ||
+                             cpi->oxcf.frm_dim_cfg.height != cm->height) ||
+                            av1_superres_scaled(cm))
+                               ? y_stride
+                               : cpi->lookahead->buf->img.y_stride;
   int fpf_y_stride = cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride
                                            : cpi->scaled_source.y_stride;
 
-  // Update if ss_cfg is uninitialized or the current frame has a new stride
+  // Update if search_site_cfg is uninitialized or the current frame has a new
+  // stride
   const int should_update =
-      !mv_search_params->ss_cfg[SS_CFG_SRC].stride ||
-      !mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD].stride ||
-      (y_stride != mv_search_params->ss_cfg[SS_CFG_SRC].stride);
+      !mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride ||
+      !mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][DIAMOND].stride ||
+      (y_stride !=
+       mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride);
 
   if (!should_update) {
     return;
   }
 
-  if (cpi->sf.mv_sf.search_method == DIAMOND) {
-    av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC],
-                                   y_stride);
-    av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD],
-                                   y_stride_src);
-  } else {
-    av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC],
-                                  y_stride);
-    av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD],
-                                  y_stride_src);
+  // Initialization of search_site_cfg for NUM_DISTINCT_SEARCH_METHODS.
+  for (SEARCH_METHODS i = DIAMOND; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+    const int level = ((i == NSTEP_8PT) || (i == CLAMPED_DIAMOND)) ? 1 : 0;
+    av1_init_motion_compensation[i](
+        &mv_search_params->search_site_cfg[SS_CFG_SRC][i], y_stride, level);
+    av1_init_motion_compensation[i](
+        &mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][i], y_stride_src,
+        level);
   }
-  av1_init_motion_fpf(&mv_search_params->ss_cfg[SS_CFG_FPF], fpf_y_stride);
+
+  // First pass search site config initialization.
+  av1_init_motion_fpf(&mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+                      fpf_y_stride);
+  for (SEARCH_METHODS i = NSTEP; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+    memcpy(&mv_search_params->search_site_cfg[SS_CFG_FPF][i],
+           &mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+           sizeof(search_site_config));
+  }
 }
 
 #define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
@@ -4314,8 +1888,10 @@
                              int subsampling_x, int subsampling_y) {
   AV1_COMMON *const cm = &cpi->common;
   SequenceHeader *const seq_params = &cm->seq_params;
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
 
-  if (!cpi->initial_width || seq_params->use_highbitdepth != use_highbitdepth ||
+  if (!initial_dimensions->width ||
+      seq_params->use_highbitdepth != use_highbitdepth ||
       seq_params->subsampling_x != subsampling_x ||
       seq_params->subsampling_y != subsampling_y) {
     seq_params->subsampling_x = subsampling_x;
@@ -4333,8 +1909,8 @@
 
     init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
 
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
+    initial_dimensions->width = cm->width;
+    initial_dimensions->height = cm->height;
     cpi->initial_mbs = cm->mi_params.MBs;
   }
 }
@@ -4342,7 +1918,7 @@
 // Returns 1 if the assigned width or height was <= 0.
 int av1_set_size_literal(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
   av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth,
                           cm->seq_params.subsampling_x,
                           cm->seq_params.subsampling_y);
@@ -4352,13 +1928,17 @@
   cm->width = width;
   cm->height = height;
 
-  if (cpi->initial_width && cpi->initial_height &&
-      (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
+  if (initial_dimensions->width && initial_dimensions->height &&
+      (cm->width > initial_dimensions->width ||
+       cm->height > initial_dimensions->height)) {
     av1_free_context_buffers(cm);
-    av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
+    av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+    av1_free_sms_tree(&cpi->td);
+    av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+    cpi->td.firstpass_ctx = NULL;
     alloc_compressor_data(cpi);
     realloc_segmentation_maps(cpi);
-    cpi->initial_width = cpi->initial_height = 0;
+    initial_dimensions->width = initial_dimensions->height = 0;
   }
   update_frame_size(cpi);
 
@@ -4378,6 +1958,8 @@
     // Recalculate 'all_lossless' in case super-resolution was (un)selected.
     cm->features.all_lossless =
         cm->features.coded_lossless && !av1_superres_scaled(cm);
+
+    av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
   }
   set_mv_search_params(cpi);
 
@@ -4417,7 +1999,9 @@
   for (int i = 0; i < num_planes; ++i)
     cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
 
+#if !CONFIG_REALTIME_ONLY
   av1_alloc_restoration_buffers(cm);
+#endif
   if (!is_stat_generation_stage(cpi)) alloc_util_frame_buffers(cpi);
   init_motion_estimation(cpi);
 
@@ -4438,371 +2022,19 @@
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 }
 
-static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
-  // Choose an arbitrary random number
-  static unsigned int seed = 56789;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
-  uint8_t new_denom = SCALE_NUMERATOR;
-
-  if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
-  switch (oxcf->resize_mode) {
-    case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
-    case RESIZE_FIXED:
-      if (cpi->common.current_frame.frame_type == KEY_FRAME)
-        new_denom = oxcf->resize_kf_scale_denominator;
-      else
-        new_denom = oxcf->resize_scale_denominator;
-      break;
-    case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
-    default: assert(0);
-  }
-  return new_denom;
-}
-
-#if CONFIG_SUPERRES_IN_RECODE
-static int superres_in_recode_allowed(const AV1_COMP *const cpi) {
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  // Empirically found to not be beneficial for AOM_Q mode and images coding.
-  return oxcf->superres_mode == SUPERRES_AUTO &&
-         (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) &&
-         cpi->rc.frames_to_key > 1;
-}
-#endif  // CONFIG_SUPERRES_IN_RECODE
-
-#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
-#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
-#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
-#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
-
-static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
-                                      const RATE_CONTROL *rc) {
-  // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
-  // level.
-  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
-    return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME;
-  } else if (gf_group->update_type[gf_group->index] == KF_UPDATE) {
-    if (rc->frames_to_key <= 1)
-      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO;
-    else
-      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME;
-  } else {
-    assert(0);
-  }
-  return 0;
-}
-
-static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
-                                                     double threshq,
-                                                     double threshp) {
-  const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
-  const double tq = threshq * q * q;
-  const double tp = threshp * energy[1];
-  const double thresh = AOMMIN(tq, tp);
-  int k;
-  for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) {
-    if (energy[k - 1] > thresh) break;
-  }
-  return 3 * SCALE_NUMERATOR - k;
-}
-
-static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
-                                             int sr_kf, int sr_arf) {
-  // Use superres for Key-frames and Alt-ref frames only.
-  const GF_GROUP *gf_group = &cpi->gf_group;
-  if (gf_group->update_type[gf_group->index] != KF_UPDATE &&
-      gf_group->update_type[gf_group->index] != ARF_UPDATE) {
-    return SCALE_NUMERATOR;
-  }
-  if (gf_group->update_type[gf_group->index] == KF_UPDATE && !sr_kf) {
-    return SCALE_NUMERATOR;
-  }
-  if (gf_group->update_type[gf_group->index] == ARF_UPDATE && !sr_arf) {
-    return SCALE_NUMERATOR;
-  }
-
-  double energy[16];
-  analyze_hor_freq(cpi, energy);
-
-  const double energy_by_q2_thresh =
-      get_energy_by_q2_thresh(gf_group, &cpi->rc);
-  int denom = get_superres_denom_from_qindex_energy(
-      qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH);
-  /*
-  printf("\nenergy = [");
-  for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
-  printf("]\n");
-  printf("boost = %d\n",
-         (gf_group->update_type[gf_group->index] == KF_UPDATE)
-             ? cpi->rc.kf_boost
-             : cpi->rc.gfu_boost);
-  printf("denom = %d\n", denom);
-  */
-#if CONFIG_SUPERRES_IN_RECODE
-  if (superres_in_recode_allowed(cpi)) {
-    assert(cpi->superres_mode != SUPERRES_NONE);
-    // Force superres to be tried in the recode loop, as full-res is also going
-    // to be tried anyway.
-    denom = AOMMAX(denom, SCALE_NUMERATOR + 1);
-  }
-#endif  // CONFIG_SUPERRES_IN_RECODE
-  return denom;
-}
-
-// If true, SUPERRES_AUTO mode will exhaustively search over all superres
-// denominators for all frames (except overlay and internal overlay frames).
-#define SUPERRES_RECODE_ALL_RATIOS 0
-
-static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
-  // Choose an arbitrary random number
-  static unsigned int seed = 34567;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
-  uint8_t new_denom = SCALE_NUMERATOR;
-
-  // Make sure that superres mode of the frame is consistent with the
-  // sequence-level flag.
-  assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE,
-                 cpi->common.seq_params.enable_superres));
-  assert(IMPLIES(!cpi->common.seq_params.enable_superres,
-                 oxcf->superres_mode == SUPERRES_NONE));
-  // Make sure that superres mode for current encoding is consistent with user
-  // provided superres mode.
-  assert(IMPLIES(oxcf->superres_mode != SUPERRES_AUTO,
-                 cpi->superres_mode == oxcf->superres_mode));
-
-  // Note: we must look at the current superres_mode to be tried in 'cpi' here,
-  // not the user given mode in 'oxcf'.
-  switch (cpi->superres_mode) {
-    case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
-    case SUPERRES_FIXED:
-      if (cpi->common.current_frame.frame_type == KEY_FRAME)
-        new_denom = oxcf->superres_kf_scale_denominator;
-      else
-        new_denom = oxcf->superres_scale_denominator;
-      break;
-    case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
-    case SUPERRES_QTHRESH: {
-      // Do not use superres when screen content tools are used.
-      if (cpi->common.features.allow_screen_content_tools) break;
-      if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ)
-        av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
-
-      // Now decide the use of superres based on 'q'.
-      int bottom_index, top_index;
-      const int q = av1_rc_pick_q_and_bounds(
-          cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index,
-          &bottom_index, &top_index);
-
-      const int qthresh = (frame_is_intra_only(&cpi->common))
-                              ? oxcf->superres_kf_qthresh
-                              : oxcf->superres_qthresh;
-      if (q <= qthresh) {
-        new_denom = SCALE_NUMERATOR;
-      } else {
-        new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
-      }
-      break;
-    }
-    case SUPERRES_AUTO: {
-      // Do not use superres when screen content tools are used.
-      if (cpi->common.features.allow_screen_content_tools) break;
-      if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ)
-        av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
-
-      // Now decide the use of superres based on 'q'.
-      int bottom_index, top_index;
-      const int q = av1_rc_pick_q_and_bounds(
-          cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index,
-          &bottom_index, &top_index);
-
-      const int qthresh = 128;
-      if (q <= qthresh) {
-        new_denom = SCALE_NUMERATOR;
-      } else {
-#if SUPERRES_RECODE_ALL_RATIOS
-        if (cpi->common.current_frame.frame_type == KEY_FRAME)
-          new_denom = oxcf->superres_kf_scale_denominator;
-        else
-          new_denom = oxcf->superres_scale_denominator;
-#else
-        new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-      }
-      break;
-    }
-    default: assert(0);
-  }
-  return new_denom;
-}
-
-static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
-  return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
-}
-
-static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
-  // Only need to check the width, as scaling is horizontal only.
-  (void)oheight;
-  return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom);
-}
-
-static int validate_size_scales(RESIZE_MODE resize_mode,
-                                SUPERRES_MODE superres_mode, int owidth,
-                                int oheight, size_params_type *rsz) {
-  if (dimensions_are_ok(owidth, oheight, rsz)) {  // Nothing to do.
-    return 1;
-  }
-
-  // Calculate current resize scale.
-  int resize_denom =
-      AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
-             DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
-
-  if (resize_mode != RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
-    // Alter superres scale as needed to enforce conformity.
-    rsz->superres_denom =
-        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
-    if (!dimensions_are_ok(owidth, oheight, rsz)) {
-      if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
-    }
-  } else if (resize_mode == RESIZE_RANDOM && superres_mode != SUPERRES_RANDOM) {
-    // Alter resize scale as needed to enforce conformity.
-    resize_denom =
-        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
-    rsz->resize_width = owidth;
-    rsz->resize_height = oheight;
-    av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
-                              resize_denom);
-    if (!dimensions_are_ok(owidth, oheight, rsz)) {
-      if (resize_denom > SCALE_NUMERATOR) {
-        --resize_denom;
-        rsz->resize_width = owidth;
-        rsz->resize_height = oheight;
-        av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
-                                  resize_denom);
-      }
-    }
-  } else if (resize_mode == RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
-    // Alter both resize and superres scales as needed to enforce conformity.
-    do {
-      if (resize_denom > rsz->superres_denom)
-        --resize_denom;
-      else
-        --rsz->superres_denom;
-      rsz->resize_width = owidth;
-      rsz->resize_height = oheight;
-      av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
-                                resize_denom);
-    } while (!dimensions_are_ok(owidth, oheight, rsz) &&
-             (resize_denom > SCALE_NUMERATOR ||
-              rsz->superres_denom > SCALE_NUMERATOR));
-  } else {  // We are allowed to alter neither resize scale nor superres
-            // scale.
-    return 0;
-  }
-  return dimensions_are_ok(owidth, oheight, rsz);
-}
-
-// Calculates resize and superres params for next frame
-static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
-  size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR };
-  int resize_denom = SCALE_NUMERATOR;
-  if (has_no_stats_stage(cpi) && cpi->use_svc &&
-      cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) {
-    rsz.resize_width = cpi->common.width;
-    rsz.resize_height = cpi->common.height;
-    return rsz;
-  }
-  if (is_stat_generation_stage(cpi)) return rsz;
-  if (resize_pending_params->width && resize_pending_params->height) {
-    rsz.resize_width = resize_pending_params->width;
-    rsz.resize_height = resize_pending_params->height;
-    resize_pending_params->width = resize_pending_params->height = 0;
-  } else {
-    resize_denom = calculate_next_resize_scale(cpi);
-    rsz.resize_width = oxcf->width;
-    rsz.resize_height = oxcf->height;
-    av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
-                              resize_denom);
-  }
-  rsz.superres_denom = calculate_next_superres_scale(cpi);
-  if (!validate_size_scales(oxcf->resize_mode, cpi->superres_mode, oxcf->width,
-                            oxcf->height, &rsz))
-    assert(0 && "Invalid scale parameters");
-  return rsz;
-}
-
-static void setup_frame_size_from_params(AV1_COMP *cpi,
-                                         const size_params_type *rsz) {
-  int encode_width = rsz->resize_width;
-  int encode_height = rsz->resize_height;
-
-  AV1_COMMON *cm = &cpi->common;
-  cm->superres_upscaled_width = encode_width;
-  cm->superres_upscaled_height = encode_height;
-  cm->superres_scale_denominator = rsz->superres_denom;
-  av1_calculate_scaled_superres_size(&encode_width, &encode_height,
-                                     rsz->superres_denom);
-  av1_set_frame_size(cpi, encode_width, encode_height);
-}
-
-void av1_setup_frame_size(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  // Reset superres params from previous frame.
-  cm->superres_scale_denominator = SCALE_NUMERATOR;
-  const size_params_type rsz = calculate_next_size_params(cpi);
-  setup_frame_size_from_params(cpi, &rsz);
-
-  assert(av1_is_min_tile_width_satisfied(cm));
-}
-
-static void superres_post_encode(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-
-  if (!av1_superres_scaled(cm)) return;
-
-  assert(cpi->oxcf.enable_superres);
-  assert(!is_lossless_requested(&cpi->oxcf));
-  assert(!cm->features.all_lossless);
-
-  av1_superres_upscale(cm, NULL);
-
-  // If regular resizing is occurring the source will need to be downscaled to
-  // match the upscaled superres resolution. Otherwise the original source is
-  // used.
-  if (!av1_resize_scaled(cm)) {
-    cpi->source = cpi->unscaled_source;
-    if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
-  } else {
-    assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
-    assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
-    // Do downscale. cm->(width|height) has been updated by
-    // av1_superres_upscale
-    if (aom_realloc_frame_buffer(
-            &cpi->scaled_source, cm->superres_upscaled_width,
-            cm->superres_upscaled_height, cm->seq_params.subsampling_x,
-            cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL,
-            NULL))
-      aom_internal_error(
-          &cm->error, AOM_CODEC_MEM_ERROR,
-          "Failed to reallocate scaled source buffer for superres");
-    assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width);
-    assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height);
-    av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
-                                (int)cm->seq_params.bit_depth, num_planes);
-    cpi->source = &cpi->scaled_source;
-  }
-}
-
+/*!\brief Select and apply cdef filters and switchable restoration filters
+ *
+ * \ingroup high_level_algo
+ */
 static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
                                    MACROBLOCKD *xd, int use_restoration,
                                    int use_cdef) {
+#if !CONFIG_REALTIME_ONLY
   if (use_restoration)
     av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
+#else
+  (void)use_restoration;
+#endif
 
   if (use_cdef) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -4813,7 +2045,8 @@
                     cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult);
 
     // Apply the filter
-    av1_cdef_frame(&cm->cur_frame->buf, cm, xd);
+    if (!cpi->sf.rt_sf.skip_loopfilter_non_reference)
+      av1_cdef_frame(&cm->cur_frame->buf, cm, xd);
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, cdef_time);
 #endif
@@ -4824,21 +2057,24 @@
     cm->cdef_info.cdef_uv_strengths[0] = 0;
   }
 
-  superres_post_encode(cpi);
+  av1_superres_post_encode(cpi);
 
+#if !CONFIG_REALTIME_ONLY
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, loop_restoration_time);
 #endif
   if (use_restoration) {
+    MultiThreadInfo *const mt_info = &cpi->mt_info;
+    const int num_workers = mt_info->num_workers;
     av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
     av1_pick_filter_restoration(cpi->source, cpi);
     if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-      if (cpi->num_workers > 1)
-        av1_loop_restoration_filter_frame_mt(&cm->cur_frame->buf, cm, 0,
-                                             cpi->workers, cpi->num_workers,
-                                             &cpi->lr_row_sync, &cpi->lr_ctxt);
+      if (num_workers > 1)
+        av1_loop_restoration_filter_frame_mt(
+            &cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers,
+            &mt_info->lr_row_sync, &cpi->lr_ctxt);
       else
         av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0,
                                           &cpi->lr_ctxt);
@@ -4851,13 +2087,21 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, loop_restoration_time);
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 }
 
+/*!\brief Select and apply in-loop deblocking filters, cdef filters, and
+ * restoration filters
+ *
+ * \ingroup high_level_algo
+ */
 static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  const int num_workers = mt_info->num_workers;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
 
-  assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
+  assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg),
                  cm->features.coded_lossless && cm->features.all_lossless));
 
   const int use_loopfilter =
@@ -4881,14 +2125,15 @@
     lf->filter_level[1] = 0;
   }
 
-  if (lf->filter_level[0] || lf->filter_level[1]) {
-    if (cpi->num_workers > 1)
+  if ((lf->filter_level[0] || lf->filter_level[1]) &&
+      !cpi->sf.rt_sf.skip_loopfilter_non_reference) {
+    if (num_workers > 1)
       av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
 #if CONFIG_LPF_MASK
                                0,
 #endif
-                               cpi->workers, cpi->num_workers,
-                               &cpi->lf_row_sync);
+                               mt_info->workers, num_workers,
+                               &mt_info->lf_row_sync);
     else
       av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
 #if CONFIG_LPF_MASK
@@ -4903,446 +2148,148 @@
   cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef);
 }
 
-static void fix_interp_filter(InterpFilter *const interp_filter,
-                              const FRAME_COUNTS *const counts) {
-  if (*interp_filter == SWITCHABLE) {
-    // Check to see if only one of the filters is actually used
-    int count[SWITCHABLE_FILTERS] = { 0 };
-    int num_filters_used = 0;
-    for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
-      for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
-        count[i] += counts->switchable_interp[j][i];
-      num_filters_used += (count[i] > 0);
-    }
-    if (num_filters_used == 1) {
-      // Only one filter is used. So set the filter at frame level
-      for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
-        if (count[i]) {
-          if (i == EIGHTTAP_REGULAR) *interp_filter = i;
-          break;
-        }
-      }
-    }
-  }
-}
-
-static void finalize_encoded_frame(AV1_COMP *const cpi) {
+/*!\brief Encode a frame without the recode loop, usually used in one-pass
+ * encoding and realtime coding.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_without_recode(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  CurrentFrame *const current_frame = &cm->current_frame;
+  const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
+  SVC *const svc = &cpi->svc;
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  const int resize_pending =
+      (resize_pending_params->width && resize_pending_params->height &&
+       (cpi->common.width != resize_pending_params->width ||
+        cpi->common.height != resize_pending_params->height));
 
-  if (!cm->seq_params.reduced_still_picture_hdr &&
-      encode_show_existing_frame(cm)) {
-    RefCntBuffer *const frame_to_show =
-        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+  int top_index = 0, bottom_index = 0, q = 0;
+  YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
+  InterpFilter filter_scaler =
+      cpi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
+                   : EIGHTTAP_SMOOTH;
+  int phase_scaler =
+      cpi->use_svc ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0;
 
-    if (frame_to_show == NULL) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                         "Buffer does not contain a reconstructed frame");
-    }
-    assert(frame_to_show->ref_count > 0);
-    assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
-  }
+  set_size_independent_vars(cpi);
+  av1_setup_frame_size(cpi);
+  av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
-  if (!encode_show_existing_frame(cm) &&
-      cm->seq_params.film_grain_params_present &&
-      (cm->show_frame || cm->showable_frame)) {
-    // Copy the current frame's film grain params to the its corresponding
-    // RefCntBuffer slot.
-    cm->cur_frame->film_grain_params = cm->film_grain_params;
-
-    // We must update the parameters if this is not an INTER_FRAME
-    if (current_frame->frame_type != INTER_FRAME)
-      cm->cur_frame->film_grain_params.update_parameters = 1;
-
-    // Iterate the random seed for the next frame.
-    cm->film_grain_params.random_seed += 3381;
-    if (cm->film_grain_params.random_seed == 0)
-      cm->film_grain_params.random_seed = 7391;
-  }
-
-  // Initialise all tiles' contexts from the global frame context
-  for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
-    for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
-      const int tile_idx = tile_row * cm->tiles.cols + tile_col;
-      cpi->tile_data[tile_idx].tctx = *cm->fc;
+  if (!cpi->use_svc) {
+    phase_scaler = 8;
+    // 2:1 scaling.
+    if ((cm->width << 1) == unscaled->y_crop_width &&
+        (cm->height << 1) == unscaled->y_crop_height) {
+      filter_scaler = BILINEAR;
+      // For lower resolutions use eighttap_smooth.
+      if (cm->width * cm->height <= 320 * 180) filter_scaler = EIGHTTAP_SMOOTH;
+    } else if ((cm->width << 2) == unscaled->y_crop_width &&
+               (cm->height << 2) == unscaled->y_crop_height) {
+      // 4:1 scaling.
+      filter_scaler = EIGHTTAP_SMOOTH;
+    } else if ((cm->width << 2) == 3 * unscaled->y_crop_width &&
+               (cm->height << 2) == 3 * unscaled->y_crop_height) {
+      // 4:3 scaling.
+      filter_scaler = EIGHTTAP_REGULAR;
     }
   }
 
-  fix_interp_filter(&cm->features.interp_filter, cpi->td.counts);
-}
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+    variance_partition_alloc(cpi);
 
-static int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low, int q_high,
-                                     int top_index, int bottom_index) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const RATE_CONTROL *const rc = &cpi->rc;
+  if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
 
-  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-
-  int q_regulated =
-      av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                        AOMMAX(q_high, top_index), cm->width, cm->height);
-
-  int retries = 0;
-  while (q_regulated < q_low && retries < 10) {
-    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-    q_regulated =
-        av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                          AOMMAX(q_high, top_index), cm->width, cm->height);
-    retries++;
-  }
-  return q_regulated;
-}
-
-static int get_regulated_q_undershoot(AV1_COMP *const cpi, int q_high,
-                                      int top_index, int bottom_index) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const RATE_CONTROL *const rc = &cpi->rc;
-
-  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-  int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                      top_index, cm->width, cm->height);
-
-  int retries = 0;
-  while (q_regulated > q_high && retries < 10) {
-    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-    q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                    top_index, cm->width, cm->height);
-    retries++;
-  }
-  return q_regulated;
-}
-
-// Called after encode_with_recode_loop() has just encoded a frame and packed
-// its bitstream.  This function works out whether we under- or over-shot
-// our bitrate target and adjusts q as appropriate.  Also decides whether
-// or not we should do another recode loop, indicated by *loop
-static void recode_loop_update_q(
-    AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
-    int *const q_high, const int top_index, const int bottom_index,
-    int *const undershoot_seen, int *const overshoot_seen,
-    int *const low_cr_seen, const int loop_at_this_size) {
-  AV1_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
-  *loop = 0;
-
-  const int min_cr = cpi->oxcf.min_cr;
-  if (min_cr > 0) {
-    aom_clear_system_state();
-    const double compression_ratio =
-        av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
-    const double target_cr = min_cr / 100.0;
-    if (compression_ratio < target_cr) {
-      *low_cr_seen = 1;
-      if (*q < rc->worst_quality) {
-        const double cr_ratio = target_cr / compression_ratio;
-        const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
-        *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
-        *q_low = AOMMAX(*q, *q_low);
-        *q_high = AOMMAX(*q, *q_high);
-        *loop = 1;
-      }
-    }
-    if (*low_cr_seen) return;
-  }
-
-  if (cpi->oxcf.rc_mode == AOM_Q) return;
-
-  const int last_q = *q;
-  int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
-  av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
-                                   &frame_under_shoot_limit,
-                                   &frame_over_shoot_limit);
-  if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
-
-  if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced &&
-      rc->projected_frame_size < rc->max_frame_bandwidth) {
-    int64_t kf_err;
-    const int64_t high_err_target = cpi->ambient_err;
-    const int64_t low_err_target = cpi->ambient_err >> 1;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth) {
-      kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    } else {
-      kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    }
-#else
-    kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  printf("\n Encoding a frame:");
 #endif
-    // Prevent possible divide by zero error below for perfect KF
-    kf_err += !kf_err;
 
-    // The key frame is not good enough or we can afford
-    // to make it better without undue risk of popping.
-    if ((kf_err > high_err_target &&
-         rc->projected_frame_size <= frame_over_shoot_limit) ||
-        (kf_err > low_err_target &&
-         rc->projected_frame_size <= frame_under_shoot_limit)) {
-      // Lower q_high
-      *q_high = AOMMAX(*q - 1, *q_low);
-
-      // Adjust Q
-      *q = (int)((*q * high_err_target) / kf_err);
-      *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
-    } else if (kf_err < low_err_target &&
-               rc->projected_frame_size >= frame_under_shoot_limit) {
-      // The key frame is much better than the previous frame
-      // Raise q_low
-      *q_low = AOMMIN(*q + 1, *q_high);
-
-      // Adjust Q
-      *q = (int)((*q * low_err_target) / kf_err);
-      *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
-    }
-
-    // Clamp Q to upper and lower limits:
-    *q = clamp(*q, *q_low, *q_high);
-    *loop = (*q != last_q);
-    return;
-  }
-
-  if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q,
-                       AOMMAX(*q_high, top_index), bottom_index)) {
-    // Is the projected frame size out of range and are we allowed
-    // to attempt to recode.
-
-    // Frame size out of permitted range:
-    // Update correction factor & compute new Q to try...
-    // Frame is too large
-    if (rc->projected_frame_size > rc->this_frame_target) {
-      // Special case if the projected size is > the max allowed.
-      if (*q == *q_high &&
-          rc->projected_frame_size >= rc->max_frame_bandwidth) {
-        const double q_val_high_current =
-            av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth);
-        const double q_val_high_new =
-            q_val_high_current *
-            ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
-        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth,
-                                  rc->best_quality, rc->worst_quality);
-      }
-
-      // Raise Qlow as to at least the current value
-      *q_low = AOMMIN(*q + 1, *q_high);
-
-      if (*undershoot_seen || loop_at_this_size > 2 ||
-          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
-        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-
-        *q = (*q_high + *q_low + 1) / 2;
-      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
-        const int q_mid = (*q_high + *q_low + 1) / 2;
-        const int q_regulated = get_regulated_q_overshoot(
-            cpi, *q_low, *q_high, top_index, bottom_index);
-        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
-        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
-        *q = (q_mid + q_regulated + 1) / 2;
-      } else {
-        *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index,
-                                       bottom_index);
-      }
-
-      *overshoot_seen = 1;
-    } else {
-      // Frame is too small
-      *q_high = AOMMAX(*q - 1, *q_low);
-
-      if (*overshoot_seen || loop_at_this_size > 2 ||
-          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
-        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-        *q = (*q_high + *q_low) / 2;
-      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
-        const int q_mid = (*q_high + *q_low) / 2;
-        const int q_regulated =
-            get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
-        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
-        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
-        *q = (q_mid + q_regulated) / 2;
-
-        // Special case reset for qlow for constrained quality.
-        // This should only trigger where there is very substantial
-        // undershoot on a frame and the auto cq level is above
-        // the user passsed in value.
-        if (cpi->oxcf.rc_mode == AOM_CQ && q_regulated < *q_low) {
-          *q_low = *q;
-        }
-      } else {
-        *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
-
-        // Special case reset for qlow for constrained quality.
-        // This should only trigger where there is very substantial
-        // undershoot on a frame and the auto cq level is above
-        // the user passsed in value.
-        if (cpi->oxcf.rc_mode == AOM_CQ && *q < *q_low) {
-          *q_low = *q;
-        }
-      }
-
-      *undershoot_seen = 1;
-    }
-
-    // Clamp Q to upper and lower limits:
-    *q = clamp(*q, *q_low, *q_high);
-  }
-
-  *loop = (*q != last_q);
-}
-
-static int get_interp_filter_selected(const AV1_COMMON *const cm,
-                                      MV_REFERENCE_FRAME ref,
-                                      InterpFilter ifilter) {
-  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
-  if (buf == NULL) return 0;
-  return buf->interp_filter_selected[ifilter];
-}
-
-static uint16_t setup_interp_filter_search_mask(AV1_COMP *cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-  int ref_total[REF_FRAMES] = { 0 };
-  uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;
-
-  if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
-    return mask;
-
-  for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
-    for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
-         ++ifilter) {
-      ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
-    }
-  }
-  int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
-                         ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
-                         ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
-
-  for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
-       ++ifilter) {
-    int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
-    if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
-      int filter_score =
-          get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
-          get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
-          get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
-      if (filter_score < ref_total_total) {
-        DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter;
-        reset_interp_filter_allowed_mask(&mask, filt_type);
-      }
-    }
-  }
-  return mask;
-}
-
-#if !CONFIG_REALTIME_ONLY
-#define STRICT_PSNR_DIFF_THRESH 0.9
-// Encode key frame with/without screen content tools to determine whether
-// screen content tools should be enabled for this key frame group or not.
-// The first encoding is without screen content tools.
-// The second encoding is with screen content tools.
-// We compare the psnr and frame size to make the decision.
-static void screen_content_tools_determination(
-    AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
-    const int allow_intrabc_orig_decision,
-    const int is_screen_content_type_orig_decision, const int pass,
-    int *projected_size_pass, PSNR_STATS *psnr) {
-  AV1_COMMON *const cm = &cpi->common;
-  FeatureFlags *const features = &cm->features;
-  projected_size_pass[pass] = cpi->rc.projected_frame_size;
-#if CONFIG_AV1_HIGHBITDEPTH
-  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
-  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
-  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
-                       bit_depth, in_bit_depth);
-#else
-  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
-#endif
-  if (pass != 1) return;
-
-  const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
-  const int is_sc_encoding_much_better = psnr_diff > STRICT_PSNR_DIFF_THRESH;
-  if (is_sc_encoding_much_better) {
-    // Use screen content tools, if we get coding gain.
-    features->allow_screen_content_tools = 1;
-    features->allow_intrabc = cpi->intrabc_used;
-    cpi->is_screen_content_type = 1;
-  } else {
-    // Use original screen content decision.
-    features->allow_screen_content_tools =
-        allow_screen_content_tools_orig_decision;
-    features->allow_intrabc = allow_intrabc_orig_decision;
-    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
-  }
-}
-
-// Set some encoding parameters to make the encoding process fast.
-// A fixed block partition size, and a large q is used.
-static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
-                                                   const int pass) {
-  AV1_COMMON *const cm = &cpi->common;
-  if (pass == 0) {
-    // In the first pass, encode without screen content tools.
-    // Use a high q, and a fixed block size for fast encoding.
-    cm->features.allow_screen_content_tools = 0;
-    cm->features.allow_intrabc = 0;
-    cpi->is_screen_content_type = 0;
-    cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
-    cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
-    return;
-  }
-  assert(pass == 1);
-  // In the second pass, encode with screen content tools.
-  // Use a high q, and a fixed block size for fast encoding.
-  cm->features.allow_screen_content_tools = 1;
-  // TODO(chengchen): turn intrabc on could lead to data race issue.
-  // cm->allow_intrabc = 1;
-  cpi->is_screen_content_type = 1;
-  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
-  cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
-}
-
-// Determines whether to use screen content tools for the key frame group.
-// This function modifies "cm->features.allow_screen_content_tools",
-// "cm->features.allow_intrabc" and "cpi->is_screen_content_type".
-static void determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
-  AV1_COMMON *const cm = &cpi->common;
-  // Variables to help determine if we should allow screen content tools.
-  int projected_size_pass[3] = { 0 };
-  PSNR_STATS psnr[3];
-  const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
-  const int allow_screen_content_tools_orig_decision =
-      cm->features.allow_screen_content_tools;
-  const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
-  const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
-  // Turn off the encoding trial for forward key frame and superres.
-  if (cpi->sf.rt_sf.use_nonrd_pick_mode || cpi->oxcf.fwd_kf_enabled ||
-      cpi->superres_mode != SUPERRES_NONE || cpi->oxcf.mode == REALTIME ||
-      is_screen_content_type_orig_decision || !is_key_frame) {
-    return;
-  }
-
-  // TODO(chengchen): multiple encoding for the lossless mode is time consuming.
-  // Find a better way to determine whether screen content tools should be used
-  // for lossless coding.
-  // Use a high q and a fixed partition to do quick encoding.
-  const int q_for_screen_content_quick_run =
-      is_lossless_requested(&cpi->oxcf) ? q_orig : AOMMAX(q_orig, 244);
-  const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
-  const BLOCK_SIZE fixed_partition_block_size_orig =
-      cpi->sf.part_sf.always_this_block_size;
-
-  // Setup necessary params for encoding, including frame source, etc.
   aom_clear_system_state();
 
-  cpi->source =
-      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
-  if (cpi->unscaled_last_source != NULL) {
-    cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
+  cpi->source = av1_scale_if_required(cm, unscaled, &cpi->scaled_source,
+                                      filter_scaler, phase_scaler, true, false);
+  if (frame_is_intra_only(cm) || resize_pending != 0) {
+    memset(cpi->consec_zero_mv, 0,
+           ((cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2) *
+               sizeof(*cpi->consec_zero_mv));
   }
 
-  setup_frame(cpi);
+  if (cpi->unscaled_last_source != NULL) {
+    cpi->last_source = av1_scale_if_required(
+        cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler,
+        phase_scaler, true, false);
+  }
 
+  if (cpi->sf.rt_sf.use_temporal_noise_estimate) {
+    av1_update_noise_estimate(cpi);
+  }
+
+  // For 1 spatial layer encoding: if the (non-LAST) reference has different
+  // resolution from the source then disable that reference. This is to avoid
+  // significant increase in encode time from scaling the references in
+  // av1_scale_references. Note GOLDEN is forced to update on the (first/tigger)
+  // resized frame and ALTREF will be refreshed ~4 frames later, so both
+  // references become available again after few frames.
+  if (svc->number_spatial_layers == 1) {
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+        cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+    }
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) {
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+        cpi->ref_frame_flags ^= AOM_ALT_FLAG;
+    }
+  }
+
+  // For SVC the inter-layer/spatial prediction is not done for newmv
+  // (zero_mode is forced), and since the scaled references are only
+  // use for newmv search, we can avoid scaling here.
+  if (!frame_is_intra_only(cm) &&
+      !(cpi->use_svc && cpi->svc.force_zero_mode_spatial_ref))
+    av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+
+  av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+                    q_cfg->enable_chroma_deltaq);
+  av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+  if (q_cfg->deltaq_mode != NO_DELTA_Q)
+    av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                       cm->seq_params.bit_depth);
+  av1_set_variance_partition_thresholds(cpi, q, 0);
+  av1_setup_frame(cpi);
+
+  // Check if this high_source_sad (scene/slide change) frame should be
+  // encoded at high/max QP, and if so, set the q and adjust some rate
+  // control parameters.
+  if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
+      cpi->rc.high_source_sad) {
+    if (av1_encodedframe_overshoot_cbr(cpi, &q)) {
+      av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+                        q_cfg->enable_chroma_deltaq);
+      av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+      if (q_cfg->deltaq_mode != NO_DELTA_Q)
+        av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                           cm->seq_params.bit_depth);
+      av1_set_variance_partition_thresholds(cpi, q, 0);
+      if (frame_is_intra_only(cm) || cm->features.error_resilient_mode)
+        av1_setup_frame(cpi);
+    }
+  }
+
+  if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
+    suppress_active_map(cpi);
+    av1_cyclic_refresh_setup(cpi);
+    av1_apply_active_map(cpi);
+  }
   if (cm->seg.enabled) {
     if (!cm->seg.update_data && cm->prev_frame) {
       segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
@@ -5356,128 +2303,106 @@
   segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
   cm->cur_frame->seg.enabled = cm->seg.enabled;
 
-  // The two encoding passes aim to help determine whether to use screen
-  // content tools, with a high q and fixed partition.
-  for (int pass = 0; pass < 2; ++pass) {
-    set_encoding_params_for_screen_content(cpi, pass);
-#if CONFIG_TUNE_VMAF
-    if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
-        cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-        cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
-      av1_set_quantizer(
-          cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
-          av1_get_vmaf_base_qindex(cpi, q_for_screen_content_quick_run));
-    } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_encode_frame_time);
 #endif
-      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
-                        q_for_screen_content_quick_run);
-#if CONFIG_TUNE_VMAF
-    }
+
+  // Set the motion vector precision based on mv stats from the last coded
+  // frame.
+  if (!frame_is_intra_only(cm)) av1_pick_and_set_high_precision_mv(cpi, q);
+
+  // transform / motion compensation build reconstruction frame
+  av1_encode_frame(cpi);
+
+  // Update some stats from cyclic refresh.
+  if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ && !frame_is_intra_only(cm))
+    av1_cyclic_refresh_postencode(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_encode_frame_time);
 #endif
-    av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
-    if (cpi->oxcf.deltaq_mode != NO_DELTA_Q)
-      av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                         cm->seq_params.bit_depth);
+#if CONFIG_INTERNAL_STATS
+  ++cpi->tot_recode_hits;
+#endif
 
-    av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
-                                          0);
-    // transform / motion compensation build reconstruction frame
-    av1_encode_frame(cpi);
-    // Screen content decision
-    screen_content_tools_determination(
-        cpi, allow_screen_content_tools_orig_decision,
-        allow_intrabc_orig_decision, is_screen_content_type_orig_decision, pass,
-        projected_size_pass, psnr);
-  }
+  aom_clear_system_state();
 
-  // Set partition speed feature back.
-  cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
-  cpi->sf.part_sf.always_this_block_size = fixed_partition_block_size_orig;
+  return AOM_CODEC_OK;
 }
-#endif  // CONFIG_REALTIME_ONLY
 
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Recode loop for encoding one frame. the purpose of encoding one frame
+ * for multiple times can be approaching a target bitrate or adjusting the usage
+ * of global motions.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    size            Bitstream size
+ * \param[in]    dest            Bitstream output
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
 static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
   const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE);
   // Must allow recode if minimum compression ratio is set.
-  assert(IMPLIES(cpi->oxcf.min_cr > 0, allow_recode));
+  assert(IMPLIES(oxcf->rc_cfg.min_cr > 0, allow_recode));
 
   set_size_independent_vars(cpi);
   if (is_stat_consumption_stage_twopass(cpi) &&
       cpi->sf.interp_sf.adaptive_interp_filter_search)
     cpi->interp_search_flags.interp_filter_search_mask =
-        setup_interp_filter_search_mask(cpi);
+        av1_setup_interp_filter_search_mask(cpi);
   cpi->source->buf_8bit_valid = 0;
 
   av1_setup_frame_size(cpi);
 
-#if CONFIG_SUPERRES_IN_RECODE
-  if (superres_in_recode_allowed(cpi) && cpi->superres_mode != SUPERRES_NONE &&
+  if (av1_superres_in_recode_allowed(cpi) &&
+      cpi->superres_mode != AOM_SUPERRES_NONE &&
       cm->superres_scale_denominator == SCALE_NUMERATOR) {
     // Superres mode is currently enabled, but the denominator selected will
     // disable superres. So no need to continue, as we will go through another
     // recode loop for full-resolution after this anyway.
     return -1;
   }
-#endif  // CONFIG_SUPERRES_IN_RECODE
 
   int top_index = 0, bottom_index = 0;
   int q = 0, q_low = 0, q_high = 0;
-  set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+  av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
   q_low = bottom_index;
   q_high = top_index;
-  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
-    const int num_64x64_blocks =
-        (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
-    if (cpi->td.vt64x64) {
-      if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
-        aom_free(cpi->td.vt64x64);
-        cpi->td.vt64x64 = NULL;
-      }
-    }
-    if (!cpi->td.vt64x64) {
-      CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
-                      aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
-      cpi->td.num_64x64_blocks = num_64x64_blocks;
-    }
-  }
 
-  if (cm->current_frame.frame_type == KEY_FRAME) {
-    FrameProbInfo *const frame_probs = &cpi->frame_probs;
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+    variance_partition_alloc(cpi);
 
-    if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
-      av1_copy(frame_probs->tx_type_probs, default_tx_type_probs);
-    }
-
-    if (!cpi->sf.inter_sf.disable_obmc &&
-        cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) {
-      av1_copy(frame_probs->obmc_probs, default_obmc_probs);
-    }
-
-    if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
-      av1_copy(frame_probs->warped_probs, default_warped_probs);
-    }
-
-    if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
-      av1_copy(frame_probs->switchable_interp_probs,
-               default_switchable_interp_probs);
-    }
-  }
-#if !CONFIG_REALTIME_ONLY
-  // Determine whether to use screen content tools using two fast encoding.
-  determine_sc_tools_with_encoding(cpi, q);
-#endif  // CONFIG_REALTIME_ONLY
+  if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   printf("\n Encoding a frame:");
 #endif
 
+  // Determine whether to use screen content tools using two fast encoding.
+  if (!cpi->sf.hl_sf.disable_extra_sc_testing)
+    av1_determine_sc_tools_with_encoding(cpi, q);
+
+#if CONFIG_USE_VMAF_RC
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source);
+  }
+#endif
+
   // Loop variables
   int loop = 0;
   int loop_count = 0;
-  int loop_at_this_size = 0;
   int overshoot_seen = 0;
   int undershoot_seen = 0;
   int low_cr_seen = 0;
@@ -5496,33 +2421,33 @@
       }
     }
     cpi->source =
-        av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+        av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
+                              EIGHTTAP_REGULAR, 0, false, false);
+
     if (cpi->unscaled_last_source != NULL) {
-      cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
-                                               &cpi->scaled_last_source);
+      cpi->last_source = av1_scale_if_required(
+          cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+          EIGHTTAP_REGULAR, 0, false, false);
     }
 
     if (!frame_is_intra_only(cm)) {
       if (loop_count > 0) {
         release_scaled_references(cpi);
       }
-      scale_references(cpi);
+      av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
     }
 #if CONFIG_TUNE_VMAF
-    if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
-        cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-        cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
-      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
-                        av1_get_vmaf_base_qindex(cpi, q));
-    } else {
-#endif
-      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, q);
-#if CONFIG_TUNE_VMAF
+    if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+        oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+      cpi->vmaf_info.original_qindex = q;
+      q = av1_get_vmaf_base_qindex(cpi, q);
     }
 #endif
-    av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+    av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+                      q_cfg->enable_chroma_deltaq);
+    av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
 
-    if (cpi->oxcf.deltaq_mode != NO_DELTA_Q)
+    if (q_cfg->deltaq_mode != NO_DELTA_Q)
       av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
                          cm->seq_params.bit_depth);
 
@@ -5533,7 +2458,7 @@
     //        cm->current_frame.frame_type, cm->superres_scale_denominator);
 
     if (loop_count == 0) {
-      setup_frame(cpi);
+      av1_setup_frame(cpi);
     } else if (get_primary_ref_frame_buf(cm) == NULL) {
       // Base q-index may have changed, so we need to assign proper default coef
       // probs before every iteration.
@@ -5541,14 +2466,10 @@
       av1_setup_frame_contexts(cm);
     }
 
-    if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+    if (q_cfg->aq_mode == VARIANCE_AQ) {
       av1_vaq_frame_setup(cpi);
-    } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+    } else if (q_cfg->aq_mode == COMPLEXITY_AQ) {
       av1_setup_in_frame_q_adj(cpi);
-    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !allow_recode) {
-      suppress_active_map(cpi);
-      av1_cyclic_refresh_setup(cpi);
-      apply_active_map(cpi);
     }
 
     if (cm->seg.enabled) {
@@ -5583,7 +2504,7 @@
 
     // transform / motion compensation build reconstruction frame
     av1_encode_frame(cpi);
-#if !CONFIG_REALTIME_ONLY
+
     // Reset the mv_stats in case we are interrupted by an intraframe or an
     // overlay frame.
     if (cpi->mv_stats.valid) {
@@ -5594,7 +2515,6 @@
         av1_frame_allows_smart_mv(cpi)) {
       av1_collect_mv_stats(cpi, q);
     }
-#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, av1_encode_frame_time);
@@ -5607,11 +2527,12 @@
     // to recode.
     const int do_dummy_pack =
         (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
-         cpi->oxcf.rc_mode != AOM_Q) ||
-        cpi->oxcf.min_cr > 0;
+         oxcf->rc_cfg.mode != AOM_Q) ||
+        oxcf->rc_cfg.min_cr > 0;
     if (do_dummy_pack) {
-      finalize_encoded_frame(cpi);
+      av1_finalize_encoded_frame(cpi);
       int largest_tile_id = 0;  // Output from bitstream: unused here
+      rc->coefficient_size = 0;
       if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) !=
           AOM_CODEC_OK) {
         return AOM_CODEC_ERROR;
@@ -5620,29 +2541,26 @@
       rc->projected_frame_size = (int)(*size) << 3;
     }
 
+#if CONFIG_TUNE_VMAF
+    if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+        oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+      q = cpi->vmaf_info.original_qindex;
+    }
+#endif
     if (allow_recode) {
       // Update q and decide whether to do a recode loop
       recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index,
                            bottom_index, &undershoot_seen, &overshoot_seen,
-                           &low_cr_seen, loop_at_this_size);
-    }
+                           &low_cr_seen, loop_count);
 
-    // Special case for overlay frame.
-    if (loop && rc->is_src_frame_alt_ref &&
-        rc->projected_frame_size < rc->max_frame_bandwidth) {
-      loop = 0;
-    }
-
-    if (allow_recode && !cpi->sf.gm_sf.gm_disable_recode &&
-        recode_loop_test_global_motion(cm->global_motion,
-                                       cpi->td.rd_counts.global_motion_used,
-                                       gm_info->params_cost)) {
-      loop = 1;
+      if (!loop && !cpi->sf.gm_sf.gm_disable_recode)
+        loop = av1_recode_loop_test_global_motion(
+            cm->global_motion, cpi->td.rd_counts.global_motion_used,
+            gm_info->params_cost);
     }
 
     if (loop) {
       ++loop_count;
-      ++loop_at_this_size;
 
 #if CONFIG_INTERNAL_STATS
       ++cpi->tot_recode_hits;
@@ -5653,13 +2571,28 @@
 #endif
   } while (loop);
 
-  // Update some stats from cyclic refresh.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !frame_is_intra_only(cm))
-    av1_cyclic_refresh_postencode(cpi);
-
   return AOM_CODEC_OK;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
+/*!\brief Recode loop or a single loop for encoding one frame, followed by
+ * in-loop deblocking filters, CDEF filters, and restoration filters.
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    size            Bitstream size
+ * \param[in]    dest            Bitstream output
+ * \param[in]    sse             Total distortion of the frame
+ * \param[in]    rate            Total rate of the frame
+ * \param[in]    largest_tile_id Tile id of the last tile
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
 static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
                                               uint8_t *dest, int64_t *sse,
                                               int64_t *rate,
@@ -5667,7 +2600,15 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, encode_with_recode_loop_time);
 #endif
-  int err = encode_with_recode_loop(cpi, size, dest);
+  int err;
+#if CONFIG_REALTIME_ONLY
+  err = encode_without_recode(cpi);
+#else
+  if (cpi->sf.hl_sf.recode_loop == DISALLOW_RECODE)
+    err = encode_without_recode(cpi);
+  else
+    err = encode_with_recode_loop(cpi, size, dest);
+#endif
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, encode_with_recode_loop_time);
 #endif
@@ -5683,12 +2624,6 @@
     return err;
   }
 
-#ifdef OUTPUT_YUV_SKINMAP
-  if (cpi->common.current_frame.frame_number > 1) {
-    av1_compute_skin_map(cpi, yuv_skinmap_file);
-  }
-#endif  // OUTPUT_YUV_SKINMAP
-
   AV1_COMMON *const cm = &cpi->common;
   SequenceHeader *const seq_params = &cm->seq_params;
 
@@ -5718,9 +2653,6 @@
   cm->cur_frame->buf.render_width = cm->render_width;
   cm->cur_frame->buf.render_height = cm->render_height;
 
-  // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
-  // off.
-
   // Pick the loop filter level for the frame.
   if (!cm->features.allow_intrabc) {
     loopfilter_frame(cpi, cm);
@@ -5744,11 +2676,12 @@
   aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
 #endif
 
-  finalize_encoded_frame(cpi);
+  av1_finalize_encoded_frame(cpi);
   // Build the bitstream
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_pack_bitstream_final_time);
 #endif
+  cpi->rc.coefficient_size = 0;
   if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK)
     return AOM_CODEC_ERROR;
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -5772,485 +2705,178 @@
   return AOM_CODEC_OK;
 }
 
-#if CONFIG_SUPERRES_IN_RECODE
-
-static void save_cur_buf(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-  const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
-  memset(&cc->copy_buffer, 0, sizeof(cc->copy_buffer));
-  if (aom_alloc_frame_buffer(&cc->copy_buffer, ybf->y_crop_width,
-                             ybf->y_crop_height, ybf->subsampling_x,
-                             ybf->subsampling_y,
-                             ybf->flags & YV12_FLAG_HIGHBITDEPTH, ybf->border,
-                             cm->features.byte_alignment) != AOM_CODEC_OK) {
-    aom_internal_error(
-        &cm->error, AOM_CODEC_MEM_ERROR,
-        "Failed to allocate copy buffer for saving coding context");
-  }
-  aom_yv12_copy_frame(ybf, &cc->copy_buffer, av1_num_planes(cm));
-}
-
-// Coding context that only needs to be saved when recode loop includes
-// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
-// restoraton).
-static void save_extra_coding_context(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-
-  cc->lf = cm->lf;
-  cc->cdef_info = cm->cdef_info;
-  cc->rc = cpi->rc;
-}
-
-static void save_all_coding_context(AV1_COMP *cpi) {
-  save_cur_buf(cpi);
-  save_extra_coding_context(cpi);
-  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
-}
-
-static void restore_cur_buf(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-  aom_yv12_copy_frame(&cc->copy_buffer, &cm->cur_frame->buf,
-                      av1_num_planes(cm));
-}
-
-// Coding context that only needs to be restored when recode loop includes
-// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
-// restoraton).
-static void restore_extra_coding_context(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-  cm->lf = cc->lf;
-  cm->cdef_info = cc->cdef_info;
-  cpi->rc = cc->rc;
-}
-
-static void restore_all_coding_context(AV1_COMP *cpi) {
-  restore_cur_buf(cpi);
-  restore_extra_coding_context(cpi);
-  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
-}
-
-static void release_copy_buffer(CODING_CONTEXT *cc) {
-  aom_free_frame_buffer(&cc->copy_buffer);
-}
-
 static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size,
                                             uint8_t *dest,
                                             int *largest_tile_id) {
   const AV1_COMMON *const cm = &cpi->common;
   assert(cm->seq_params.enable_superres);
-  assert(superres_in_recode_allowed(cpi));
+  assert(av1_superres_in_recode_allowed(cpi));
   aom_codec_err_t err = AOM_CODEC_OK;
-  save_all_coding_context(cpi);
+  av1_save_all_coding_context(cpi);
 
-  // Encode with superres.
-#if SUPERRES_RECODE_ALL_RATIOS
-  AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  int64_t superres_sses[SCALE_NUMERATOR];
-  int64_t superres_rates[SCALE_NUMERATOR];
-  int superres_largest_tile_ids[SCALE_NUMERATOR];
-  // Use superres for Key-frames and Alt-ref frames only.
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  if (gf_group->update_type[gf_group->index] != OVERLAY_UPDATE &&
-      gf_group->update_type[gf_group->index] != INTNL_OVERLAY_UPDATE) {
-    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
-         ++denom) {
-      oxcf->superres_scale_denominator = denom;
-      oxcf->superres_kf_scale_denominator = denom;
-      const int this_index = denom - (SCALE_NUMERATOR + 1);
-      err = encode_with_recode_loop_and_filter(
-          cpi, size, dest, &superres_sses[this_index],
-          &superres_rates[this_index], &superres_largest_tile_ids[this_index]);
-      if (err != AOM_CODEC_OK) return err;
-      restore_all_coding_context(cpi);
-    }
-    // Reset.
-    oxcf->superres_scale_denominator = SCALE_NUMERATOR;
-    oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
-  } else {
-    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
-         ++denom) {
-      const int this_index = denom - (SCALE_NUMERATOR + 1);
-      superres_sses[this_index] = INT64_MAX;
-      superres_rates[this_index] = INT64_MAX;
-    }
-  }
-#else
-  int64_t sse1 = INT64_MAX;
-  int64_t rate1 = INT64_MAX;
-  int largest_tile_id1;
-  err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1,
-                                           &largest_tile_id1);
-  if (err != AOM_CODEC_OK) return err;
-  restore_all_coding_context(cpi);
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-
-  // Encode without superres.
-  int64_t sse2 = INT64_MAX;
-  int64_t rate2 = INT64_MAX;
-  int largest_tile_id2;
-  cpi->superres_mode = SUPERRES_NONE;  // To force full-res.
-  err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
-                                           &largest_tile_id2);
-  cpi->superres_mode = cpi->oxcf.superres_mode;  // Reset.
-  assert(cpi->oxcf.superres_mode == SUPERRES_AUTO);
-  if (err != AOM_CODEC_OK) return err;
-
-  // Note: Both use common rdmult based on base qindex of fullres.
-  const int64_t rdmult =
-      av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex);
-
-#if SUPERRES_RECODE_ALL_RATIOS
-  // Find the best rdcost among all superres denoms.
-  double proj_rdcost1 = DBL_MAX;
   int64_t sse1 = INT64_MAX;
   int64_t rate1 = INT64_MAX;
   int largest_tile_id1 = 0;
-  (void)sse1;
-  (void)rate1;
-  (void)largest_tile_id1;
-  int best_denom = -1;
-  for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) {
-    const int this_index = denom - (SCALE_NUMERATOR + 1);
-    const int64_t this_sse = superres_sses[this_index];
-    const int64_t this_rate = superres_rates[this_index];
-    const int this_largest_tile_id = superres_largest_tile_ids[this_index];
-    const double this_rdcost = RDCOST_DBL(rdmult, this_rate, this_sse);
-    if (this_rdcost < proj_rdcost1) {
-      sse1 = this_sse;
-      rate1 = this_rate;
-      largest_tile_id1 = this_largest_tile_id;
-      proj_rdcost1 = this_rdcost;
-      best_denom = denom;
+  int64_t sse2 = INT64_MAX;
+  int64_t rate2 = INT64_MAX;
+  int largest_tile_id2;
+  double proj_rdcost1 = DBL_MAX;
+
+  // Encode with superres.
+  if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) {
+    SuperResCfg *const superres_cfg = &cpi->oxcf.superres_cfg;
+    int64_t superres_sses[SCALE_NUMERATOR];
+    int64_t superres_rates[SCALE_NUMERATOR];
+    int superres_largest_tile_ids[SCALE_NUMERATOR];
+    // Use superres for Key-frames and Alt-ref frames only.
+    const GF_GROUP *const gf_group = &cpi->gf_group;
+    if (gf_group->update_type[gf_group->index] != OVERLAY_UPDATE &&
+        gf_group->update_type[gf_group->index] != INTNL_OVERLAY_UPDATE) {
+      for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+           ++denom) {
+        superres_cfg->superres_scale_denominator = denom;
+        superres_cfg->superres_kf_scale_denominator = denom;
+        const int this_index = denom - (SCALE_NUMERATOR + 1);
+
+        cpi->superres_mode = AOM_SUPERRES_AUTO;  // Super-res on for this loop.
+        err = encode_with_recode_loop_and_filter(
+            cpi, size, dest, &superres_sses[this_index],
+            &superres_rates[this_index],
+            &superres_largest_tile_ids[this_index]);
+        cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+        if (err != AOM_CODEC_OK) return err;
+        restore_all_coding_context(cpi);
+      }
+      // Reset.
+      superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+      superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+    } else {
+      for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+           ++denom) {
+        const int this_index = denom - (SCALE_NUMERATOR + 1);
+        superres_sses[this_index] = INT64_MAX;
+        superres_rates[this_index] = INT64_MAX;
+      }
+    }
+    // Encode without superres.
+    assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+                                             &largest_tile_id2);
+    if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use common rdmult based on base qindex of fullres.
+    const int64_t rdmult =
+        av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex);
+
+    // Find the best rdcost among all superres denoms.
+    int best_denom = -1;
+    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+         ++denom) {
+      const int this_index = denom - (SCALE_NUMERATOR + 1);
+      const int64_t this_sse = superres_sses[this_index];
+      const int64_t this_rate = superres_rates[this_index];
+      const int this_largest_tile_id = superres_largest_tile_ids[this_index];
+      const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+          rdmult, this_rate, this_sse, cm->seq_params.bit_depth);
+      if (this_rdcost < proj_rdcost1) {
+        sse1 = this_sse;
+        rate1 = this_rate;
+        largest_tile_id1 = this_largest_tile_id;
+        proj_rdcost1 = this_rdcost;
+        best_denom = denom;
+      }
+    }
+    const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+        rdmult, rate2, sse2, cm->seq_params.bit_depth);
+    // Re-encode with superres if it's better.
+    if (proj_rdcost1 < proj_rdcost2) {
+      restore_all_coding_context(cpi);
+      // TODO(urvang): We should avoid rerunning the recode loop by saving
+      // previous output+state, or running encode only for the selected 'q' in
+      // previous step.
+      // Again, temporarily force the best denom.
+      superres_cfg->superres_scale_denominator = best_denom;
+      superres_cfg->superres_kf_scale_denominator = best_denom;
+      int64_t sse3 = INT64_MAX;
+      int64_t rate3 = INT64_MAX;
+      cpi->superres_mode =
+          AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+      err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+                                               largest_tile_id);
+      cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+      assert(sse1 == sse3);
+      assert(rate1 == rate3);
+      assert(largest_tile_id1 == *largest_tile_id);
+      // Reset.
+      superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+      superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+    } else {
+      *largest_tile_id = largest_tile_id2;
+    }
+  } else {
+    assert(cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_DUAL);
+    cpi->superres_mode =
+        AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1,
+                                             &largest_tile_id1);
+    cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+    if (err != AOM_CODEC_OK) return err;
+    restore_all_coding_context(cpi);
+    // Encode without superres.
+    assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+                                             &largest_tile_id2);
+    if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use common rdmult based on base qindex of fullres.
+    const int64_t rdmult =
+        av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex);
+    proj_rdcost1 = RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1,
+                                                  cm->seq_params.bit_depth);
+    const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+        rdmult, rate2, sse2, cm->seq_params.bit_depth);
+    // Re-encode with superres if it's better.
+    if (proj_rdcost1 < proj_rdcost2) {
+      restore_all_coding_context(cpi);
+      // TODO(urvang): We should avoid rerunning the recode loop by saving
+      // previous output+state, or running encode only for the selected 'q' in
+      // previous step.
+      int64_t sse3 = INT64_MAX;
+      int64_t rate3 = INT64_MAX;
+      cpi->superres_mode =
+          AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+      err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+                                               largest_tile_id);
+      cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+      assert(sse1 == sse3);
+      assert(rate1 == rate3);
+      assert(largest_tile_id1 == *largest_tile_id);
+    } else {
+      *largest_tile_id = largest_tile_id2;
     }
   }
-#else
-  const double proj_rdcost1 = RDCOST_DBL(rdmult, rate1, sse1);
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-  const double proj_rdcost2 = RDCOST_DBL(rdmult, rate2, sse2);
-
-  // Re-encode with superres if it's better.
-  if (proj_rdcost1 < proj_rdcost2) {
-    restore_all_coding_context(cpi);
-    // TODO(urvang): We should avoid rerunning the recode loop by saving
-    // previous output+state, or running encode only for the selected 'q' in
-    // previous step.
-#if SUPERRES_RECODE_ALL_RATIOS
-    // Again, temporarily force the best denom.
-    oxcf->superres_scale_denominator = best_denom;
-    oxcf->superres_kf_scale_denominator = best_denom;
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-    int64_t sse3 = INT64_MAX;
-    int64_t rate3 = INT64_MAX;
-    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
-                                             largest_tile_id);
-    assert(sse1 == sse3);
-    assert(rate1 == rate3);
-    assert(largest_tile_id1 == *largest_tile_id);
-#if SUPERRES_RECODE_ALL_RATIOS
-    // Reset.
-    oxcf->superres_scale_denominator = SCALE_NUMERATOR;
-    oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-  } else {
-    *largest_tile_id = largest_tile_id2;
-  }
-
-  release_copy_buffer(&cpi->coding_context);
 
   return err;
 }
-#endif  // CONFIG_SUPERRES_IN_RECODE
-
-#define DUMP_RECON_FRAMES 0
-
-#if DUMP_RECON_FRAMES == 1
-// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
-static void dump_filtered_recon_frames(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  const CurrentFrame *const current_frame = &cm->current_frame;
-  const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
-
-  if (recon_buf == NULL) {
-    printf("Frame %d is not ready.\n", current_frame->frame_number);
-    return;
-  }
-
-  static const int flag_list[REF_FRAMES] = { 0,
-                                             AOM_LAST_FLAG,
-                                             AOM_LAST2_FLAG,
-                                             AOM_LAST3_FLAG,
-                                             AOM_GOLD_FLAG,
-                                             AOM_BWD_FLAG,
-                                             AOM_ALT2_FLAG,
-                                             AOM_ALT_FLAG };
-  printf(
-      "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
-      "show_existing_frame=%d) "
-      "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
-      current_frame->frame_number, current_frame->order_hint, cm->show_frame,
-      cm->show_existing_frame);
-  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
-    const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
-    printf(" %d(%c)", ref_offset,
-           (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
-  }
-  printf(" ]\n");
-
-  if (!cm->show_frame) {
-    printf("Frame %d is a no show frame, so no image dump.\n",
-           current_frame->frame_number);
-    return;
-  }
-
-  int h;
-  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
-  FILE *f_recon = NULL;
-
-  if (current_frame->frame_number == 0) {
-    if ((f_recon = fopen(file_name, "wb")) == NULL) {
-      printf("Unable to open file %s to write.\n", file_name);
-      return;
-    }
-  } else {
-    if ((f_recon = fopen(file_name, "ab")) == NULL) {
-      printf("Unable to open file %s to append.\n", file_name);
-      return;
-    }
-  }
-  printf(
-      "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
-      "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
-      "refresh_alt_ref_frame=%d, "
-      "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
-      current_frame->frame_number, cpi->gf_group.index,
-      cpi->gf_group.update_type[cpi->gf_group.index], current_frame->order_hint,
-      cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active,
-      cpi->refresh_alt_ref_frame, recon_buf->y_stride, recon_buf->uv_stride,
-      cm->width, cm->height);
-#if 0
-  int ref_frame;
-  printf("get_ref_frame_map_idx: [");
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
-    printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
-  printf(" ]\n");
-#endif  // 0
-
-  // --- Y ---
-  for (h = 0; h < cm->height; ++h) {
-    fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
-           f_recon);
-  }
-  // --- U ---
-  for (h = 0; h < (cm->height >> 1); ++h) {
-    fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
-           f_recon);
-  }
-  // --- V ---
-  for (h = 0; h < (cm->height >> 1); ++h) {
-    fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
-           f_recon);
-  }
-
-  fclose(f_recon);
-}
-#endif  // DUMP_RECON_FRAMES
-
-static int is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
-                         const YV12_BUFFER_CONFIG *last_picture,
-                         ForceIntegerMVInfo *const force_intpel_info) {
-  aom_clear_system_state();
-  // check use hash ME
-  int k;
-
-  const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE;
-  const double threshold_current = 0.8;
-  const double threshold_average = 0.95;
-  const int max_history_size = 32;
-  int T = 0;  // total block
-  int C = 0;  // match with collocated block
-  int S = 0;  // smooth region but not match with collocated block
-
-  const int pic_width = cur_picture->y_width;
-  const int pic_height = cur_picture->y_height;
-  for (int i = 0; i + block_size <= pic_height; i += block_size) {
-    for (int j = 0; j + block_size <= pic_width; j += block_size) {
-      const int x_pos = j;
-      const int y_pos = i;
-      int match = 1;
-      T++;
-
-      // check whether collocated block match with current
-      uint8_t *p_cur = cur_picture->y_buffer;
-      uint8_t *p_ref = last_picture->y_buffer;
-      int stride_cur = cur_picture->y_stride;
-      int stride_ref = last_picture->y_stride;
-      p_cur += (y_pos * stride_cur + x_pos);
-      p_ref += (y_pos * stride_ref + x_pos);
-
-      if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
-        uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
-        uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
-        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-            if (p16_cur[tmpX] != p16_ref[tmpX]) {
-              match = 0;
-            }
-          }
-          p16_cur += stride_cur;
-          p16_ref += stride_ref;
-        }
-      } else {
-        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-            if (p_cur[tmpX] != p_ref[tmpX]) {
-              match = 0;
-            }
-          }
-          p_cur += stride_cur;
-          p_ref += stride_ref;
-        }
-      }
-
-      if (match) {
-        C++;
-        continue;
-      }
-
-      if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
-                                         y_pos) ||
-          av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
-        S++;
-        continue;
-      }
-    }
-  }
-
-  assert(T > 0);
-  double cs_rate = ((double)(C + S)) / ((double)(T));
-
-  force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate;
-
-  force_intpel_info->rate_index =
-      (force_intpel_info->rate_index + 1) % max_history_size;
-  force_intpel_info->rate_size++;
-  force_intpel_info->rate_size =
-      AOMMIN(force_intpel_info->rate_size, max_history_size);
-
-  if (cs_rate < threshold_current) {
-    return 0;
-  }
-
-  if (C == T) {
-    return 1;
-  }
-
-  double cs_average = 0.0;
-
-  for (k = 0; k < force_intpel_info->rate_size; k++) {
-    cs_average += force_intpel_info->cs_rate_array[k];
-  }
-  cs_average /= force_intpel_info->rate_size;
-
-  if (cs_average < threshold_average) {
-    return 0;
-  }
-
-  if ((T - C - S) < 0) {
-    return 1;
-  }
-
-  if (cs_average > 1.01) {
-    return 1;
-  }
-
-  return 0;
-}
-
-// Refresh reference frame buffers according to refresh_frame_flags.
-static void refresh_reference_frames(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  // All buffers are refreshed for shown keyframes and S-frames.
-
-  for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) {
-    if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) {
-      assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame);
-    }
-  }
-}
-
-static void set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
-  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
-  ThreadData *td = &cpi->td;
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  uint8_t *y_buffer = cpi->source->y_buffer;
-  const int y_stride = cpi->source->y_stride;
-  const int block_size = BLOCK_16X16;
-
-  const int num_mi_w = mi_size_wide[block_size];
-  const int num_mi_h = mi_size_high[block_size];
-  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
-  double log_sum = 0.0;
-  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
-
-  // Loop through each 16x16 block.
-  for (int row = 0; row < num_rows; ++row) {
-    for (int col = 0; col < num_cols; ++col) {
-      double var = 0.0, num_of_var = 0.0;
-      const int index = row * num_cols + col;
-
-      // Loop through each 8x8 block.
-      for (int mi_row = row * num_mi_h;
-           mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
-           mi_row += 2) {
-        for (int mi_col = col * num_mi_w;
-             mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
-             mi_col += 2) {
-          struct buf_2d buf;
-          const int row_offset_y = mi_row << 2;
-          const int col_offset_y = mi_col << 2;
-
-          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
-          buf.stride = y_stride;
-
-          if (use_hbd) {
-            var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8,
-                                                      xd->bd);
-          } else {
-            var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
-          }
-
-          num_of_var += 1.0;
-        }
-      }
-      var = var / num_of_var;
-
-      // Curve fitting with an exponential model on all 16x16 blocks from the
-      // midres dataset.
-      var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
-      cpi->ssim_rdmult_scaling_factors[index] = var;
-      log_sum += log(var);
-    }
-  }
-  log_sum = exp(log_sum / (double)(num_rows * num_cols));
-
-  for (int row = 0; row < num_rows; ++row) {
-    for (int col = 0; col < num_cols; ++col) {
-      const int index = row * num_cols + col;
-      cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
-    }
-  }
-}
 
 extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc,
                                      const char *filename);
 
+/*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack
+ * the bitstream
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    size            Bitstream size
+ * \param[in]    dest            Bitstream output
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
 static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
                                      uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
@@ -6259,16 +2885,21 @@
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
   FeatureFlags *const features = &cm->features;
+  const TileConfig *const tile_cfg = &oxcf->tile_cfg;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, encode_frame_to_data_rate_time);
 #endif
 
+  if (frame_is_intra_only(cm)) {
+    av1_set_screen_content_options(cpi, features);
+  }
+
   // frame type has been decided outside of this function call
   cm->cur_frame->frame_type = current_frame->frame_type;
 
-  cm->tiles.large_scale = cpi->oxcf.large_scale_tile;
-  cm->tiles.single_tile_decoding = cpi->oxcf.single_tile_decoding;
+  cm->tiles.large_scale = tile_cfg->enable_large_scale_tile;
+  cm->tiles.single_tile_decoding = tile_cfg->enable_single_tile_decoding;
 
   features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
   // features->allow_ref_frame_mvs needs to be written into the frame header
@@ -6276,15 +2907,23 @@
   // is separated from frame_might_allow_ref_frame_mvs().
   features->allow_ref_frame_mvs &= !cm->tiles.large_scale;
 
-  features->allow_warped_motion =
-      cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
+  features->allow_warped_motion = oxcf->motion_mode_cfg.allow_warped_motion &&
+                                  frame_might_allow_warped_motion(cm);
 
   cpi->last_frame_type = current_frame->frame_type;
 
+  if (frame_is_sframe(cm)) {
+    GF_GROUP *gf_group = &cpi->gf_group;
+    // S frame will wipe out any previously encoded altref so we cannot place
+    // an overlay frame
+    gf_group->update_type[gf_group->size] = GF_UPDATE;
+  }
+
   if (encode_show_existing_frame(cm)) {
-    finalize_encoded_frame(cpi);
+    av1_finalize_encoded_frame(cpi);
     // Build the bitstream
     int largest_tile_id = 0;  // Output from bitstream: unused here
+    cpi->rc.coefficient_size = 0;
     if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
       return AOM_CODEC_ERROR;
 
@@ -6300,7 +2939,7 @@
 
 #if DUMP_RECON_FRAMES == 1
     // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
-    dump_filtered_recon_frames(cpi);
+    av1_dump_filtered_recon_frames(cpi);
 #endif  // DUMP_RECON_FRAMES
 
     // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
@@ -6311,13 +2950,16 @@
 
     // Since we allocate a spot for the OVERLAY frame in the gf group, we need
     // to do post-encoding update accordingly.
-    if (cpi->rc.is_src_frame_alt_ref) {
-      av1_set_target_rate(cpi, cm->width, cm->height);
-      av1_rc_postencode_update(cpi, *size);
+    av1_set_target_rate(cpi, cm->width, cm->height);
+    av1_rc_postencode_update(cpi, *size);
+
+    if (is_psnr_calc_enabled(cpi)) {
+      cpi->source =
+          realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width,
+                                   cm->cur_frame->buf.y_crop_height);
     }
 
     ++current_frame->frame_number;
-
     return AOM_CODEC_OK;
   }
 
@@ -6328,7 +2970,7 @@
     if (cpi->common.seq_params.force_integer_mv == 2) {
       // Adaptive mode: see what previous frame encoded did
       if (cpi->unscaled_last_source != NULL) {
-        features->cur_frame_force_integer_mv = is_integer_mv(
+        features->cur_frame_force_integer_mv = av1_is_integer_mv(
             cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info);
       } else {
         cpi->common.features.cur_frame_force_integer_mv = 0;
@@ -6354,12 +2996,9 @@
       seg->update_map = 1;
       seg->update_data = 1;
     }
-
-    // The alternate reference frame cannot be active for a key frame.
-    cpi->rc.source_alt_ref_active = 0;
   }
-  if (cpi->oxcf.mtu == 0) {
-    cpi->num_tg = cpi->oxcf.num_tile_groups;
+  if (tile_cfg->mtu == 0) {
+    cpi->num_tg = tile_cfg->num_tile_groups;
   } else {
     // Use a default value for the purposes of weighting costs in probability
     // updates
@@ -6368,20 +3007,23 @@
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
-  if (has_no_stats_stage(cpi) && oxcf->rc_mode == AOM_CBR &&
+  if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR &&
       current_frame->frame_type != KEY_FRAME) {
     if (av1_rc_drop_frame(cpi)) {
+      av1_setup_frame_size(cpi);
       av1_rc_postencode_update_drop_frame(cpi);
       release_scaled_references(cpi);
       return AOM_CODEC_OK;
     }
   }
 
-  if (oxcf->tuning == AOM_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi);
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM)
+    av1_set_mb_ssim_rdmult_scaling(cpi);
 
 #if CONFIG_TUNE_VMAF
-  if (oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-      oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+      oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+      oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
     av1_set_mb_vmaf_rdmult_scaling(cpi);
   }
 #endif
@@ -6414,7 +3056,7 @@
       // same across different streams of the same content current_frame_id
       // should be the same and not random. 0x37 is a chosen number as start
       // point
-      if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37;
+      if (oxcf->kf_cfg.sframe_dist != 0) cm->current_frame_id = 0x37;
     } else {
       cm->current_frame_id =
           (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
@@ -6422,7 +3064,7 @@
     }
   }
 
-  switch (cpi->oxcf.cdf_update_mode) {
+  switch (oxcf->algo_cfg.cdf_update_mode) {
     case 0:  // No CDF update for any frames(4~6% compression loss).
       features->disable_cdf_update = 1;
       break;
@@ -6442,21 +3084,20 @@
   seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
 
   int largest_tile_id = 0;
-#if CONFIG_SUPERRES_IN_RECODE
-  if (superres_in_recode_allowed(cpi)) {
+  if (av1_superres_in_recode_allowed(cpi)) {
     if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
     }
   } else {
-#endif  // CONFIG_SUPERRES_IN_RECODE
+    const aom_superres_mode orig_superres_mode = cpi->superres_mode;  // save
+    cpi->superres_mode = cpi->oxcf.superres_cfg.superres_mode;
     if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL,
                                            &largest_tile_id) != AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
     }
-#if CONFIG_SUPERRES_IN_RECODE
+    cpi->superres_mode = orig_superres_mode;  // restore
   }
-#endif  // CONFIG_SUPERRES_IN_RECODE
 
   cpi->seq_params_locked = 1;
 
@@ -6469,9 +3110,12 @@
     }
   }
 
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    cpi->svc.num_encoded_top_layer++;
+
 #if DUMP_RECON_FRAMES == 1
   // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
-  dump_filtered_recon_frames(cpi);
+  av1_dump_filtered_recon_frames(cpi);
 #endif  // DUMP_RECON_FRAMES
 
   if (cm->seg.enabled) {
@@ -6479,7 +3123,8 @@
       update_reference_segmentation_map(cpi);
     } else if (cm->last_frame_seg_map) {
       memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
-             cm->mi_params.mi_cols * cm->mi_params.mi_rows * sizeof(uint8_t));
+             cm->cur_frame->mi_cols * cm->cur_frame->mi_rows *
+                 sizeof(*cm->cur_frame->seg_map));
     }
   }
 
@@ -6505,7 +3150,7 @@
     cm->cur_frame->frame_context = *cm->fc;
   }
 
-  if (cpi->oxcf.ext_tile_debug) {
+  if (tile_cfg->enable_ext_tile_debug) {
     // (yunqing) This test ensures the correctness of large scale tile coding.
     if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) {
       char fn[20] = "./fc";
@@ -6517,23 +3162,6 @@
     }
   }
 
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, encode_frame_to_data_rate_time);
-
-  // Print out timing information.
-  int i;
-  fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n",
-          cm->current_frame.frame_number,
-          get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame);
-  for (i = 0; i < kTimingComponents; i++) {
-    cpi->component_time[i] += cpi->frame_component_time[i];
-    fprintf(stderr, " %s:  %" PRId64 " us (total: %" PRId64 " us)\n",
-            get_component_name(i), cpi->frame_component_time[i],
-            cpi->component_time[i]);
-    cpi->frame_component_time[i] = 0;
-  }
-#endif
-
   cpi->last_frame_type = current_frame->frame_type;
 
   av1_rc_postencode_update(cpi, *size);
@@ -6547,12 +3175,11 @@
   // A droppable frame might not be shown but it always
   // takes a space in the gf group. Therefore, even when
   // it is not shown, we still need update the count down.
+  if (cm->show_frame) ++current_frame->frame_number;
 
-  if (cm->show_frame) {
-    // Don't increment frame counters if this was an altref buffer
-    // update not a real frame
-    ++current_frame->frame_number;
-  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_frame_to_data_rate_time);
+#endif
 
   return AOM_CODEC_OK;
 }
@@ -6581,15 +3208,15 @@
   memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
          REF_FRAMES * sizeof(*cm->remapped_ref_idx));
 
-  cpi->refresh_golden_frame = frame_params->refresh_golden_frame;
-  cpi->refresh_bwd_ref_frame = frame_params->refresh_bwd_ref_frame;
-  cpi->refresh_alt_ref_frame = frame_params->refresh_alt_ref_frame;
+  memcpy(&cpi->refresh_frame, &frame_params->refresh_frame,
+         sizeof(cpi->refresh_frame));
 
-  if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
+  if (current_frame->frame_type == KEY_FRAME && !cpi->no_show_fwd_kf)
     current_frame->frame_number = 0;
 
   current_frame->order_hint =
       current_frame->frame_number + frame_params->order_offset;
+
   current_frame->display_order_hint = current_frame->order_hint;
   current_frame->order_hint %=
       (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
@@ -6656,11 +3283,11 @@
 
 #if CONFIG_TUNE_VMAF
   if (!is_stat_generation_stage(cpi) &&
-      cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
     av1_vmaf_frame_preprocessing(cpi, sd);
   }
   if (!is_stat_generation_stage(cpi) &&
-      cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
     av1_vmaf_blk_preprocessing(cpi, sd);
   }
 #endif
@@ -6683,6 +3310,14 @@
   aom_usec_timer_mark(&timer);
   cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
 #endif
+
+  // Note: Regarding profile setting, the following checks are added to help
+  // choose a proper profile for the input video. The criterion is that all
+  // bitstreams must be designated as the lowest profile that match its content.
+  // E.G. A bitstream that contains 4:4:4 video must be designated as High
+  // Profile in the seq header, and likewise a bitstream that contains 4:2:2
+  // bitstream must be designated as Professional Profile in the sequence
+  // header.
   if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
       (subsampling_x != 1 || subsampling_y != 1)) {
     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
@@ -6699,7 +3334,7 @@
       (seq_params->bit_depth <= AOM_BITS_10) &&
       !(subsampling_x == 1 && subsampling_y == 0)) {
     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
-                       "Profile 2 bit-depth < 10 requires 4:2:2 color format");
+                       "Profile 2 bit-depth <= 10 requires 4:2:2 color format");
     res = -1;
   }
 
@@ -6723,7 +3358,7 @@
 static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
   AV1_COMMON *const cm = &cpi->common;
   double samples = 0.0;
-  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
   const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
 
 #if CONFIG_INTER_STATS_ONLY
@@ -6735,10 +3370,12 @@
     const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
     double y, u, v, frame_all;
 
-    cpi->count++;
+    cpi->count[0]++;
+    cpi->count[1]++;
     if (cpi->b_calculate_psnr) {
       PSNR_STATS psnr;
-      double frame_ssim2 = 0.0, weight = 0.0;
+      double weight[2] = { 0.0, 0.0 };
+      double frame_ssim2[2] = { 0.0, 0.0 };
       aom_clear_system_state();
 #if CONFIG_AV1_HIGHBITDEPTH
       aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
@@ -6746,20 +3383,36 @@
       aom_calc_psnr(orig, recon, &psnr);
 #endif
       adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
-                        &cpi->psnr);
-      cpi->total_sq_error += psnr.sse[0];
-      cpi->total_samples += psnr.samples[0];
+                        &(cpi->psnr[0]));
+      cpi->total_sq_error[0] += psnr.sse[0];
+      cpi->total_samples[0] += psnr.samples[0];
       samples = psnr.samples[0];
+
       // TODO(yaowu): unify these two versions into one.
       if (cm->seq_params.use_highbitdepth)
-        frame_ssim2 =
-            aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
+        aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth,
+                             frame_ssim2);
       else
-        frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
+        aom_calc_ssim(orig, recon, &weight[0], &frame_ssim2[0]);
 
-      cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2);
-      cpi->summed_quality += frame_ssim2 * weight;
-      cpi->summed_weights += weight;
+      cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2[0]);
+      cpi->summed_quality += frame_ssim2[0] * weight[0];
+      cpi->summed_weights += weight[0];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+      // Compute PSNR based on stream bit depth
+      if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+          (in_bit_depth < bit_depth)) {
+        adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3],
+                          psnr.psnr_hbd[0], &cpi->psnr[1]);
+        cpi->total_sq_error[1] += psnr.sse_hbd[0];
+        cpi->total_samples[1] += psnr.samples_hbd[0];
+
+        cpi->worst_ssim_hbd = AOMMIN(cpi->worst_ssim_hbd, frame_ssim2[1]);
+        cpi->summed_quality_hbd += frame_ssim2[1] * weight[1];
+        cpi->summed_weights_hbd += weight[1];
+      }
+#endif
 
 #if 0
       {
@@ -6809,6 +3462,7 @@
   }
 }
 #endif  // CONFIG_INTERNAL_STATS
+
 int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
                             int64_t *time_end, int flush,
@@ -6817,10 +3471,10 @@
   AV1_COMMON *const cm = &cpi->common;
 
 #if CONFIG_BITSTREAM_DEBUG
-  assert(cpi->oxcf.max_threads == 0 &&
+  assert(cpi->oxcf.max_threads <= 1 &&
          "bitstream debug tool does not support multithreading");
   bitstream_queue_record_write();
-  aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 +
+  aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 +
                                       cm->show_frame);
 #endif
   if (cpi->use_svc && cm->number_spatial_layers > 1) {
@@ -6836,49 +3490,70 @@
   av1_set_high_precision_mv(cpi, 1, 0);
 
   // Normal defaults
-  cm->features.refresh_frame_context = oxcf->frame_parallel_decoding_mode
-                                           ? REFRESH_FRAME_CONTEXT_DISABLED
-                                           : REFRESH_FRAME_CONTEXT_BACKWARD;
-  if (oxcf->large_scale_tile)
+  cm->features.refresh_frame_context =
+      oxcf->tool_cfg.frame_parallel_decoding_mode
+          ? REFRESH_FRAME_CONTEXT_DISABLED
+          : REFRESH_FRAME_CONTEXT_BACKWARD;
+  if (oxcf->tile_cfg.enable_large_scale_tile)
     cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
   // Initialize fields related to forward keyframes
-  cpi->no_show_kf = 0;
+  cpi->no_show_fwd_kf = 0;
 
   if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // Only accumulate 2nd pass time.
+  if (cpi->oxcf.pass == 2) start_timing(cpi, av1_encode_strategy_time);
+#endif
+
   const int result =
       av1_encode_strategy(cpi, size, dest, frame_flags, time_stamp, time_end,
                           timestamp_ratio, flush);
-  if (result != AOM_CODEC_OK && result != -1) {
-    return AOM_CODEC_ERROR;
-  } else if (result == -1) {
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) end_timing(cpi, av1_encode_strategy_time);
+
+  // Print out timing information.
+  // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
+  // show_existing_frame and lag-in-frames.
+  if (cpi->oxcf.pass == 2 && cpi->frame_component_time[0] > 100) {
+    int i;
+    fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n",
+            cm->current_frame.frame_number,
+            get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame);
+    for (i = 0; i < kTimingComponents; i++) {
+      cpi->component_time[i] += cpi->frame_component_time[i];
+      fprintf(stderr, " %s:  %" PRId64 " us (total: %" PRId64 " us)\n",
+              get_component_name(i), cpi->frame_component_time[i],
+              cpi->component_time[i]);
+      cpi->frame_component_time[i] = 0;
+    }
+  }
+#endif
+
+  if (result == -1) {
     // Returning -1 indicates no frame encoded; more input is required
     return -1;
   }
+  if (result != AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
 #if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
 #endif  // CONFIG_INTERNAL_STATS
-  if (cpi->b_calculate_psnr) {
+  // Note *size = 0 indicates a dropeed frame for which psnr is not calculated
+  if (cpi->b_calculate_psnr && *size > 0) {
     if (cm->show_existing_frame ||
         (!is_stat_generation_stage(cpi) && cm->show_frame)) {
       generate_psnr_packet(cpi);
     }
   }
 
-#if CONFIG_TUNE_VMAF
-  if (!is_stat_generation_stage(cpi) &&
-      (oxcf->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
-       oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-       oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN)) {
-    av1_update_vmaf_curve(cpi, cpi->source, &cpi->common.cur_frame->buf);
-  }
-#endif
-
   if (cpi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
     // Initialize level info. at the beginning of each sequence.
-    if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
+    if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
       av1_init_level_info(cpi);
     }
     av1_update_level_info(cpi, *size, *time_stamp, *time_end);
@@ -6891,8 +3566,8 @@
 #endif  // CONFIG_INTERNAL_STATS
 #if CONFIG_SPEED_STATS
   if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
-    cpi->tx_search_count += cpi->td.mb.tx_search_count;
-    cpi->td.mb.tx_search_count = 0;
+    cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count;
+    cpi->td.mb.txfm_search_info.tx_search_count = 0;
   }
 #endif  // CONFIG_SPEED_STATS
 
@@ -6929,16 +3604,6 @@
   return 0;
 }
 
-static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
-                                       const YV12_BUFFER_CONFIG *b) {
-  return a->y_height == b->y_height && a->y_width == b->y_width &&
-         a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
-         a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
-         a->border == b->border &&
-         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
-             (b->flags & YV12_FLAG_HIGHBITDEPTH);
-}
-
 aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
                                        YV12_BUFFER_CONFIG *new_frame,
                                        YV12_BUFFER_CONFIG *sd) {
@@ -6963,9 +3628,13 @@
   Scale2Ratio(vert_mode, &vr, &vs);
 
   // always go to the next whole number
-  resize_pending_params->width = (hs - 1 + oxcf->width * hr) / hs;
-  resize_pending_params->height = (vs - 1 + oxcf->height * vr) / vs;
+  resize_pending_params->width = (hs - 1 + oxcf->frm_dim_cfg.width * hr) / hs;
+  resize_pending_params->height = (vs - 1 + oxcf->frm_dim_cfg.height * vr) / vs;
 
+  if (horiz_mode != NORMAL || vert_mode != NORMAL) {
+    oxcf->resize_cfg.resize_mode = RESIZE_FIXED;
+    oxcf->algo_cfg.enable_tpl_model = 0;
+  }
   return 0;
 }
 
@@ -7031,13 +3700,13 @@
 }
 
 static void svc_set_updates_external_ref_frame_config(
-    ExternalFlags *const ext_flags, SVC *const svc) {
-  ext_flags->refresh_frame_flags_pending = 1;
-  ext_flags->refresh_last_frame = svc->refresh[svc->ref_idx[0]];
-  ext_flags->refresh_golden_frame = svc->refresh[svc->ref_idx[3]];
-  ext_flags->refresh_bwd_ref_frame = svc->refresh[svc->ref_idx[4]];
-  ext_flags->refresh_alt2_ref_frame = svc->refresh[svc->ref_idx[5]];
-  ext_flags->refresh_alt_ref_frame = svc->refresh[svc->ref_idx[6]];
+    ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags, SVC *const svc) {
+  ext_refresh_frame_flags->update_pending = 1;
+  ext_refresh_frame_flags->last_frame = svc->refresh[svc->ref_idx[0]];
+  ext_refresh_frame_flags->golden_frame = svc->refresh[svc->ref_idx[3]];
+  ext_refresh_frame_flags->bwd_ref_frame = svc->refresh[svc->ref_idx[4]];
+  ext_refresh_frame_flags->alt2_ref_frame = svc->refresh[svc->ref_idx[5]];
+  ext_refresh_frame_flags->alt_ref_frame = svc->refresh[svc->ref_idx[6]];
   svc->non_reference_frame = 1;
   for (int i = 0; i < REF_FRAMES; i++) {
     if (svc->refresh[i] == 1) {
@@ -7065,6 +3734,8 @@
   // GOLDEN, BWDREF, ALTREF2.
 
   ExternalFlags *const ext_flags = &cpi->ext_flags;
+  ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &ext_flags->refresh_frame;
   ext_flags->ref_frame_flags = AOM_REFFRAME_ALL;
   if (flags &
       (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
@@ -7110,31 +3781,32 @@
       upd ^= AOM_ALT2_FLAG;
     }
 
-    ext_flags->refresh_last_frame = (upd & AOM_LAST_FLAG) != 0;
-    ext_flags->refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0;
-    ext_flags->refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
-    ext_flags->refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
-    ext_flags->refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
-    ext_flags->refresh_frame_flags_pending = 1;
+    ext_refresh_frame_flags->last_frame = (upd & AOM_LAST_FLAG) != 0;
+    ext_refresh_frame_flags->golden_frame = (upd & AOM_GOLD_FLAG) != 0;
+    ext_refresh_frame_flags->alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
+    ext_refresh_frame_flags->bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
+    ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
+    ext_refresh_frame_flags->update_pending = 1;
   } else {
     if (cpi->svc.external_ref_frame_config)
-      svc_set_updates_external_ref_frame_config(ext_flags, &cpi->svc);
+      svc_set_updates_external_ref_frame_config(ext_refresh_frame_flags,
+                                                &cpi->svc);
     else
-      ext_flags->refresh_frame_flags_pending = 0;
+      ext_refresh_frame_flags->update_pending = 0;
   }
 
-  ext_flags->use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs &
+  ext_flags->use_ref_frame_mvs = cpi->oxcf.tool_cfg.enable_ref_frame_mvs &
                                  ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0);
-  ext_flags->use_error_resilient = cpi->oxcf.error_resilient_mode |
+  ext_flags->use_error_resilient = cpi->oxcf.tool_cfg.error_resilient_mode |
                                    ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0);
   ext_flags->use_s_frame =
-      cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
+      cpi->oxcf.kf_cfg.enable_sframe | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
   ext_flags->use_primary_ref_none =
       (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
 
   if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
-    av1_update_entropy(&ext_flags->refresh_frame_context,
-                       &ext_flags->refresh_frame_context_pending, 0);
+    update_entropy(&ext_flags->refresh_frame_context,
+                   &ext_flags->refresh_frame_context_pending, 0);
   }
 }
 
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 82d00cb..ed79ba0 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares top-level encoder structures and functions.
+ */
 #ifndef AOM_AV1_ENCODER_ENCODER_H_
 #define AOM_AV1_ENCODER_ENCODER_H_
 
@@ -33,6 +36,7 @@
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/firstpass.h"
+#include "av1/encoder/global_motion.h"
 #include "av1/encoder/level.h"
 #include "av1/encoder/lookahead.h"
 #include "av1/encoder/mcomp.h"
@@ -40,7 +44,10 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/svc_layercontext.h"
+#include "av1/encoder/temporal_filter.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/av1_noise_estimate.h"
 
 #if CONFIG_INTERNAL_STATS
 #include "aom_dsp/ssim.h"
@@ -49,6 +56,10 @@
 #if CONFIG_DENOISE
 #include "aom_dsp/noise_model.h"
 #endif
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
 #include "aom/internal/aom_codec_internal.h"
 #include "aom_util/aom_thread.h"
 
@@ -56,6 +67,9 @@
 extern "C" {
 #endif
 
+// TODO(yunqing, any): Added suppression tag to quiet Doxygen warnings. Need to
+// adjust it while we work on documentation.
+/*!\cond */
 // Number of frames required to test for scene cut detection
 #define SCENE_CUT_KEY_TEST_INTERVAL 16
 
@@ -66,20 +80,14 @@
   int den;           // fraction denominator
 } aom_rational64_t;  // alias for struct aom_rational
 
-typedef struct {
-#if CONFIG_SUPERRES_IN_RECODE
-  struct loopfilter lf;
-  CdefInfo cdef_info;
-  YV12_BUFFER_CONFIG copy_buffer;
-  RATE_CONTROL rc;
-#endif  // CONFIG_SUPERRES_IN_RECODE
-} CODING_CONTEXT;
-
 enum {
   NORMAL = 0,
   FOURFIVE = 1,
   THREEFIVE = 2,
-  ONETWO = 3
+  THREEFOUR = 3,
+  ONEFOUR = 4,
+  ONEEIGHT = 5,
+  ONETWO = 6
 } UENUM1BYTE(AOM_SCALING);
 
 enum {
@@ -117,93 +125,29 @@
 } UENUM1BYTE(DELTAQ_MODE);
 
 enum {
-  RESIZE_NONE = 0,    // No frame resizing allowed.
-  RESIZE_FIXED = 1,   // All frames are coded at the specified scale.
-  RESIZE_RANDOM = 2,  // All frames are coded at a random scale.
+  RESIZE_NONE = 0,     // No frame resizing allowed.
+  RESIZE_FIXED = 1,    // All frames are coded at the specified scale.
+  RESIZE_RANDOM = 2,   // All frames are coded at a random scale.
+  RESIZE_DYNAMIC = 3,  // Frames coded at lower scale based on rate control.
   RESIZE_MODES
 } UENUM1BYTE(RESIZE_MODE);
 
 enum {
-  SUPERRES_NONE,     // No frame superres allowed.
-  SUPERRES_FIXED,    // All frames are coded at the specified scale,
-                     // and super-resolved.
-  SUPERRES_RANDOM,   // All frames are coded at a random scale,
-                     // and super-resolved.
-  SUPERRES_QTHRESH,  // Superres scale for a frame is determined based on
-                     // q_index.
-  SUPERRES_AUTO,     // Automatically select superres for appropriate frames.
-  SUPERRES_MODES
-} UENUM1BYTE(SUPERRES_MODE);
-
-typedef enum {
-  kInvalid = 0,
-  kLowSad = 1,
-  kHighSad = 2,
-  kLowVarHighSumdiff = 3,
-} CONTENT_STATE_SB;
-
-enum {
   SS_CFG_SRC = 0,
   SS_CFG_LOOKAHEAD = 1,
   SS_CFG_FPF = 2,
   SS_CFG_TOTAL = 3
 } UENUM1BYTE(SS_CFG_OFFSET);
 
-// TODO(jingning): This needs to be cleaned up next.
-#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TOTAL_BUFFERS + REF_FRAMES + 1)
+enum {
+  DISABLE_SCENECUT,        // For LAP, lag_in_frames < 19
+  ENABLE_SCENECUT_MODE_1,  // For LAP, lag_in_frames >=19 and < 33
+  ENABLE_SCENECUT_MODE_2   // For twopass and LAP - lag_in_frames >=33
+} UENUM1BYTE(SCENECUT_MODE);
 
-typedef struct TplDepStats {
-  int64_t intra_cost;
-  int64_t inter_cost;
-  int64_t srcrf_dist;
-  int64_t recrf_dist;
-  int64_t srcrf_rate;
-  int64_t recrf_rate;
-  int64_t mc_dep_rate;
-  int64_t mc_dep_dist;
-  int_mv mv[INTER_REFS_PER_FRAME];
-  int ref_frame_index;
-  int64_t pred_error[INTER_REFS_PER_FRAME];
-  int64_t mc_count;
-  int64_t mc_saved;
-} TplDepStats;
+#define MAX_VBR_CORPUS_COMPLEXITY 10000
 
-typedef struct TplDepFrame {
-  uint8_t is_valid;
-  TplDepStats *tpl_stats_ptr;
-  const YV12_BUFFER_CONFIG *gf_picture;
-  YV12_BUFFER_CONFIG *rec_picture;
-  int ref_map_index[REF_FRAMES];
-  int stride;
-  int width;
-  int height;
-  int mi_rows;
-  int mi_cols;
-  unsigned int frame_display_index;
-  int base_rdmult;
-} TplDepFrame;
-
-typedef struct TplParams {
-  // Block granularity of tpl score storage.
-  uint8_t tpl_stats_block_mis_log2;
-
-  // Buffer to store the frame level tpl information for each frame in a gf
-  // group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
-  // group
-  TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
-
-  // Buffer to store tpl stats at block granularity.
-  // tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
-  // group.
-  TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
-
-  // Buffer to store tpl reconstructed frame.
-  // tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
-  YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
-
-  // Pointer to tpl_stats_buffer.
-  TplDepFrame *tpl_frame;
-} TplParams;
+/*!\cond */
 
 typedef enum {
   COST_UPD_SB,
@@ -212,273 +156,806 @@
   COST_UPD_OFF,
 } COST_UPDATE_TYPE;
 
-#define TPL_DEP_COST_SCALE_LOG2 4
+/*!\endcond */
 
-typedef struct AV1EncoderConfig {
-  BITSTREAM_PROFILE profile;
-  aom_bit_depth_t bit_depth;     // Codec bit-depth.
-  int width;                     // width of data passed to the compressor
-  int height;                    // height of data passed to the compressor
-  int forced_max_frame_width;    // forced maximum width of frame (if != 0)
-  int forced_max_frame_height;   // forced maximum height of frame (if != 0)
-  unsigned int input_bit_depth;  // Input bit depth.
-  double init_framerate;         // set to passed in framerate
-  int64_t target_bandwidth;      // bandwidth to be used in bits per second
-
-  int noise_sensitivity;  // pre processing blur: recommendation 0
-  int sharpness;          // sharpening output: recommendation 0:
-  int speed;
-  // maximum allowed bitrate for any intra frame in % of bitrate target.
-  unsigned int rc_max_intra_bitrate_pct;
-  // maximum allowed bitrate for any inter frame in % of bitrate target.
-  unsigned int rc_max_inter_bitrate_pct;
-  // percent of rate boost for golden frame in CBR mode.
-  unsigned int gf_cbr_boost_pct;
-
-  MODE mode;
-  int pass;
-
-  // Key Framing Operations
-  int auto_key;  // autodetect cut scenes and set the keyframes
-  int key_freq;  // maximum distance to key frame.
-  int sframe_dist;
-  int sframe_mode;
-  int sframe_enabled;
-  int lag_in_frames;  // how many frames lag before we start encoding
-  int fwd_kf_enabled;
-
-  // ----------------------------------------------------------------
-  // DATARATE CONTROL OPTIONS
-
-  // vbr, cbr, constrained quality or constant quality
-  enum aom_rc_mode rc_mode;
-
-  // buffer targeting aggressiveness
-  int under_shoot_pct;
-  int over_shoot_pct;
-
-  // buffering parameters
-  int64_t starting_buffer_level_ms;
-  int64_t optimal_buffer_level_ms;
-  int64_t maximum_buffer_size_ms;
-
-  // Frame drop threshold.
-  int drop_frames_water_mark;
-
-  // controlling quality
-  int fixed_q;
-  int worst_allowed_q;
-  int best_allowed_q;
-  int cq_level;
-  int enable_chroma_deltaq;
-  AQ_MODE aq_mode;  // Adaptive Quantization mode
-  DELTAQ_MODE deltaq_mode;
-  int deltalf_mode;
-  int enable_cdef;
-  int enable_restoration;
-  int force_video_mode;
-  int enable_obmc;
-  int disable_trellis_quant;
-  int using_qm;
-  int qm_y;
-  int qm_u;
-  int qm_v;
-  int qm_minlevel;
-  int qm_maxlevel;
-  unsigned int num_tile_groups;
-  unsigned int mtu;
-
-  // Internal frame size scaling.
+/*!
+ * \brief Encoder config related to resize.
+ */
+typedef struct {
+  /*!
+   * Indicates the frame resize mode to be used by the encoder.
+   */
   RESIZE_MODE resize_mode;
+  /*!
+   * Indicates the denominator for resize of inter frames, assuming 8 as the
+   *  numerator. Its value ranges between 8-16.
+   */
   uint8_t resize_scale_denominator;
+  /*!
+   * Indicates the denominator for resize of key frames, assuming 8 as the
+   * numerator. Its value ranges between 8-16.
+   */
   uint8_t resize_kf_scale_denominator;
+} ResizeCfg;
 
-  // Frame Super-Resolution size scaling.
-  SUPERRES_MODE superres_mode;
-  uint8_t superres_scale_denominator;
-  uint8_t superres_kf_scale_denominator;
+/*!
+ * \brief Encoder config for coding block partitioning.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if rectanguar partitions should be enabled.
+   */
+  bool enable_rect_partitions;
+  /*!
+   * Flag to indicate if AB partitions should be enabled.
+   */
+  bool enable_ab_partitions;
+  /*!
+   * Flag to indicate if 1:4 / 4:1 partitions should be enabled.
+   */
+  bool enable_1to4_partitions;
+  /*!
+   * Indicates the minimum partition size that should be allowed. Both width and
+   * height of a partition cannot be smaller than the min_partition_size.
+   */
+  BLOCK_SIZE min_partition_size;
+  /*!
+   * Indicates the maximum partition size that should be allowed. Both width and
+   * height of a partition cannot be larger than the max_partition_size.
+   */
+  BLOCK_SIZE max_partition_size;
+} PartitionCfg;
+
+/*!
+ * \brief Encoder flags for intra prediction.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if intra edge filtering process should be enabled.
+   */
+  bool enable_intra_edge_filter;
+  /*!
+   * Flag to indicate if recursive filtering based intra prediction should be
+   * enabled.
+   */
+  bool enable_filter_intra;
+  /*!
+   * Flag to indicate if smooth intra prediction modes should be enabled.
+   */
+  bool enable_smooth_intra;
+  /*!
+   * Flag to indicate if PAETH intra prediction mode should be enabled.
+   */
+  bool enable_paeth_intra;
+  /*!
+   * Flag to indicate if CFL uv intra mode should be enabled.
+   */
+  bool enable_cfl_intra;
+  /*!
+   * Flag to indicate if delta angles for directional intra prediction should be
+   * enabled.
+   */
+  bool enable_angle_delta;
+} IntraModeCfg;
+
+/*!
+ * \brief Encoder flags for transform sizes and types.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if 64-pt transform should be enabled.
+   */
+  bool enable_tx64;
+  /*!
+   * Flag to indicate if flip and identity transform types should be enabled.
+   */
+  bool enable_flip_idtx;
+  /*!
+   * Flag to indicate if rectangular transform should be enabled.
+   */
+  bool enable_rect_tx;
+  /*!
+   * Flag to indicate whether or not to use a default reduced set for ext-tx
+   * rather than the potential full set of 16 transforms.
+   */
+  bool reduced_tx_type_set;
+  /*!
+   * Flag to indicate if transform type for intra blocks should be limited to
+   * DCT_DCT.
+   */
+  bool use_intra_dct_only;
+  /*!
+   * Flag to indicate if transform type for inter blocks should be limited to
+   * DCT_DCT.
+   */
+  bool use_inter_dct_only;
+  /*!
+   * Flag to indicate if intra blocks should use default transform type
+   * (mode-dependent) only.
+   */
+  bool use_intra_default_tx_only;
+} TxfmSizeTypeCfg;
+
+/*!
+ * \brief Encoder flags for compound prediction modes.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if distance-weighted compound type should be enabled.
+   */
+  bool enable_dist_wtd_comp;
+  /*!
+   * Flag to indicate if masked (wedge/diff-wtd) compound type should be
+   * enabled.
+   */
+  bool enable_masked_comp;
+  /*!
+   * Flag to indicate if smooth interintra mode should be enabled.
+   */
+  bool enable_smooth_interintra;
+  /*!
+   * Flag to indicate if difference-weighted compound type should be enabled.
+   */
+  bool enable_diff_wtd_comp;
+  /*!
+   * Flag to indicate if inter-inter wedge compound type should be enabled.
+   */
+  bool enable_interinter_wedge;
+  /*!
+   * Flag to indicate if inter-intra wedge compound type should be enabled.
+   */
+  bool enable_interintra_wedge;
+} CompoundTypeCfg;
+
+/*!
+ * \brief Encoder config related to frame super-resolution.
+ */
+typedef struct {
+  /*!
+   * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+   * mode is used for inter frames.
+   */
   int superres_qthresh;
+  /*!
+   * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+   * mode is used for key frames.
+   */
   int superres_kf_qthresh;
-
-  // Enable feature to reduce the frame quantization every x frames.
-  int frame_periodic_boost;
-
-  // two pass datarate control
-  int two_pass_vbrbias;  // two pass datarate control tweaks
-  int two_pass_vbrmin_section;
-  int two_pass_vbrmax_section;
-  // END DATARATE CONTROL OPTIONS
-  // ----------------------------------------------------------------
-
-  int enable_auto_arf;
-  int enable_auto_brf;  // (b)ackward (r)ef (f)rame
-
-  /* Bitfield defining the error resiliency features to enable.
-   * Can provide decodable frames after losses in previous
-   * frames and decodable partitions after losses in the same frame.
+  /*!
+   * Indicates the denominator of the fraction that specifies the ratio between
+   * the superblock width before and after upscaling for inter frames. The
+   * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
    */
-  unsigned int error_resilient_mode;
-
-  unsigned int s_frame_mode;
-
-  /* Bitfield defining the parallel decoding mode where the
-   * decoding in successive frames may be conducted in parallel
-   * just by decoding the frame headers.
+  uint8_t superres_scale_denominator;
+  /*!
+   * Indicates the denominator of the fraction that specifies the ratio between
+   * the superblock width before and after upscaling for key frames. The
+   * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
    */
-  unsigned int frame_parallel_decoding_mode;
+  uint8_t superres_kf_scale_denominator;
+  /*!
+   * Indicates the Super-resolution mode to be used by the encoder.
+   */
+  aom_superres_mode superres_mode;
+  /*!
+   * Flag to indicate if super-resolution should be enabled for the sequence.
+   */
+  bool enable_superres;
+} SuperResCfg;
 
-  unsigned int limit;
+/*!
+ * \brief Encoder config related to the coding of key frames.
+ */
+typedef struct {
+  /*!
+   * Indicates the minimum distance to a key frame.
+   */
+  int key_freq_min;
 
-  int arnr_max_frames;
-  int arnr_strength;
+  /*!
+   * Indicates the maximum distance to a key frame.
+   */
+  int key_freq_max;
 
-  int min_gf_interval;
-  int max_gf_interval;
-  int gf_min_pyr_height;
-  int gf_max_pyr_height;
-
-  int row_mt;
-  int tile_columns;
-  int tile_rows;
-  int tile_width_count;
-  int tile_height_count;
-  int tile_widths[MAX_TILE_COLS];
-  int tile_heights[MAX_TILE_ROWS];
-
-  int enable_tpl_model;
+  /*!
+   * Indicates if temporal filtering should be applied on keyframe.
+   */
   int enable_keyframe_filtering;
 
-  int max_threads;
+  /*!
+   * Indicates the number of frames after which a frame may be coded as an
+   * S-Frame.
+   */
+  int sframe_dist;
 
-  aom_fixed_buf_t two_pass_stats_in;
+  /*!
+   * Indicates how an S-Frame should be inserted.
+   * 1: the considered frame will be made into an S-Frame only if it is an
+   * altref frame. 2: the next altref frame will be made into an S-Frame.
+   */
+  int sframe_mode;
 
-  aom_tune_metric tuning;
-  const char *vmaf_model_path;
-  aom_tune_content content;
-  int use_highbitdepth;
-  aom_color_primaries_t color_primaries;
-  aom_transfer_characteristics_t transfer_characteristics;
-  aom_matrix_coefficients_t matrix_coefficients;
-  aom_chroma_sample_position_t chroma_sample_position;
-  int color_range;
+  /*!
+   * Indicates if encoder should autodetect cut scenes and set the keyframes.
+   */
+  bool auto_key;
+
+  /*!
+   * Indicates if forward keyframe reference should be enabled.
+   */
+  bool fwd_kf_enabled;
+
+  /*!
+   * Indicates if S-Frames should be enabled for the sequence.
+   */
+  bool enable_sframe;
+
+  /*!
+   * Indicates if intra block copy prediction mode should be enabled or not.
+   */
+  bool enable_intrabc;
+} KeyFrameCfg;
+
+/*!
+ * \brief Encoder rate control configuration parameters
+ */
+typedef struct {
+  /*!\cond */
+  // BUFFERING PARAMETERS
+  /*!\endcond */
+  /*!
+   * Indicates the amount of data that will be buffered by the decoding
+   * application prior to beginning playback, and is expressed in units of
+   * time(milliseconds).
+   */
+  int64_t starting_buffer_level_ms;
+  /*!
+   * Indicates the amount of data that the encoder should try to maintain in the
+   * decoder's buffer, and is expressed in units of time(milliseconds).
+   */
+  int64_t optimal_buffer_level_ms;
+  /*!
+   * Indicates the maximum amount of data that may be buffered by the decoding
+   * application, and is expressed in units of time(milliseconds).
+   */
+  int64_t maximum_buffer_size_ms;
+
+  /*!
+   * Indicates the bandwidth to be used in bits per second.
+   */
+  int64_t target_bandwidth;
+
+  /*!
+   * Indicates average complexity of the corpus in single pass vbr based on
+   * LAP. 0 indicates that corpus complexity vbr mode is disabled.
+   */
+  unsigned int vbr_corpus_complexity_lap;
+  /*!
+   * Indicates the maximum allowed bitrate for any intra frame as % of bitrate
+   * target.
+   */
+  unsigned int max_intra_bitrate_pct;
+  /*!
+   * Indicates the maximum allowed bitrate for any inter frame as % of bitrate
+   * target.
+   */
+  unsigned int max_inter_bitrate_pct;
+  /*!
+   * Indicates the percentage of rate boost for golden frame in CBR mode.
+   */
+  unsigned int gf_cbr_boost_pct;
+  /*!
+   * min_cr / 100 indicates the target minimum compression ratio for each
+   * frame.
+   */
+  unsigned int min_cr;
+  /*!
+   * Indicates the frame drop threshold.
+   */
+  int drop_frames_water_mark;
+  /*!
+   * under_shoot_pct indicates the tolerance of the VBR algorithm to
+   * undershoot and is used as a trigger threshold for more agressive
+   * adaptation of Q. It's value can range from 0-100.
+   */
+  int under_shoot_pct;
+  /*!
+   * over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot
+   * and is used as a trigger threshold for more agressive adaptation of Q.
+   * It's value can range from 0-1000.
+   */
+  int over_shoot_pct;
+  /*!
+   * Indicates the maximum qindex that can be used by the quantizer i.e. the
+   * worst quality qindex.
+   */
+  int worst_allowed_q;
+  /*!
+   * Indicates the minimum qindex that can be used by the quantizer i.e. the
+   * best quality qindex.
+   */
+  int best_allowed_q;
+  /*!
+   * Indicates the Constant/Constrained Quality level.
+   */
+  int cq_level;
+  /*!
+   * Indicates if the encoding mode is vbr, cbr, constrained quality or
+   * constant quality.
+   */
+  enum aom_rc_mode mode;
+  /*!
+   * Indicates the bias (expressed on a scale of 0 to 100) for determining
+   * target size for the current frame. The value 0 indicates the optimal CBR
+   * mode value should be used, and 100 indicates the optimal VBR mode value
+   * should be used.
+   */
+  int vbrbias;
+  /*!
+   * Indicates the minimum bitrate to be used for a single frame as a percentage
+   * of the target bitrate.
+   */
+  int vbrmin_section;
+  /*!
+   * Indicates the maximum bitrate to be used for a single frame as a percentage
+   * of the target bitrate.
+   */
+  int vbrmax_section;
+} RateControlCfg;
+
+/*!\cond */
+typedef struct {
+  // Indicates the number of frames lag before encoding is started.
+  int lag_in_frames;
+  // Indicates the minimum gf/arf interval to be used.
+  int min_gf_interval;
+  // Indicates the maximum gf/arf interval to be used.
+  int max_gf_interval;
+  // Indicates the minimum height for GF group pyramid structure to be used.
+  int gf_min_pyr_height;
+  // Indicates the maximum height for GF group pyramid structure to be used.
+  int gf_max_pyr_height;
+  // Indicates if automatic set and use of altref frames should be enabled.
+  bool enable_auto_arf;
+  // Indicates if automatic set and use of (b)ackward (r)ef (f)rames should be
+  // enabled.
+  bool enable_auto_brf;
+} GFConfig;
+
+typedef struct {
+  // Indicates the number of tile groups.
+  unsigned int num_tile_groups;
+  // Indicates the MTU size for a tile group. If mtu is non-zero,
+  // num_tile_groups is set to DEFAULT_MAX_NUM_TG.
+  unsigned int mtu;
+  // Indicates the number of tile columns in log2.
+  int tile_columns;
+  // Indicates the number of tile rows in log2.
+  int tile_rows;
+  // Indicates the number of widths in the tile_widths[] array.
+  int tile_width_count;
+  // Indicates the number of heights in the tile_heights[] array.
+  int tile_height_count;
+  // Indicates the tile widths, and may be empty.
+  int tile_widths[MAX_TILE_COLS];
+  // Indicates the tile heights, and may be empty.
+  int tile_heights[MAX_TILE_ROWS];
+  // Indicates if large scale tile coding should be used.
+  bool enable_large_scale_tile;
+  // Indicates if single tile decoding mode should be enabled.
+  bool enable_single_tile_decoding;
+  // Indicates if EXT_TILE_DEBUG should be enabled.
+  bool enable_ext_tile_debug;
+} TileConfig;
+
+typedef struct {
+  // Indicates the width of the input frame.
+  int width;
+  // Indicates the height of the input frame.
+  int height;
+  // If forced_max_frame_width is non-zero then it is used to force the maximum
+  // frame width written in write_sequence_header().
+  int forced_max_frame_width;
+  // If forced_max_frame_width is non-zero then it is used to force the maximum
+  // frame height written in write_sequence_header().
+  int forced_max_frame_height;
+  // Indicates the frame width after applying both super-resolution and resize
+  // to the coded frame.
   int render_width;
+  // Indicates the frame height after applying both super-resolution and resize
+  // to the coded frame.
   int render_height;
-  int timing_info_present;
+} FrameDimensionCfg;
+
+typedef struct {
+  // Indicates if warped motion should be enabled.
+  bool enable_warped_motion;
+  // Indicates if warped motion should be evaluated or not.
+  bool allow_warped_motion;
+  // Indicates if OBMC motion should be enabled.
+  bool enable_obmc;
+} MotionModeCfg;
+
+typedef struct {
+  // Timing info for each frame.
   aom_timing_info_t timing_info;
-  int decoder_model_info_present_flag;
-  int display_model_info_present_flag;
-  int buffer_removal_time_present;
-  aom_dec_model_info_t buffer_model;
-  int film_grain_test_vector;
-  const char *film_grain_table_filename;
+  // Indicates the number of time units of a decoding clock.
+  uint32_t num_units_in_decoding_tick;
+  // Indicates if decoder model information is present in the coded sequence
+  // header.
+  bool decoder_model_info_present_flag;
+  // Indicates if display model information is present in the coded sequence
+  // header.
+  bool display_model_info_present_flag;
+  // Indicates if timing info for each frame is present.
+  bool timing_info_present;
+} DecoderModelCfg;
 
-  uint8_t cdf_update_mode;
-  aom_superblock_size_t superblock_size;
-  unsigned int large_scale_tile;
-  unsigned int single_tile_decoding;
-  uint8_t monochrome;
-  unsigned int full_still_picture_hdr;
-  int enable_dual_filter;
-  unsigned int motion_vector_unit_test;
-  unsigned int sb_multipass_unit_test;
-  unsigned int ext_tile_debug;
-  int enable_rect_partitions;
-  int enable_ab_partitions;
-  int enable_1to4_partitions;
-  int min_partition_size;
-  int max_partition_size;
-  int enable_intra_edge_filter;
-  int enable_tx64;
-  int enable_flip_idtx;
-  int enable_order_hint;
-  int enable_dist_wtd_comp;
-  int enable_ref_frame_mvs;
+typedef struct {
+  // Indicates the update frequency for coeff costs.
+  COST_UPDATE_TYPE coeff;
+  // Indicates the update frequency for mode costs.
+  COST_UPDATE_TYPE mode;
+  // Indicates the update frequency for mv costs.
+  COST_UPDATE_TYPE mv;
+} CostUpdateFreq;
+
+typedef struct {
+  // Indicates the maximum number of reference frames allowed per frame.
   unsigned int max_reference_frames;
-  int enable_reduced_reference_set;
-  unsigned int allow_ref_frame_mvs;
-  int enable_masked_comp;
-  int enable_onesided_comp;
-  int enable_interintra_comp;
-  int enable_smooth_interintra;
-  int enable_diff_wtd_comp;
-  int enable_interinter_wedge;
-  int enable_interintra_wedge;
-  int enable_global_motion;
-  int enable_warped_motion;
-  int allow_warped_motion;
-  int enable_filter_intra;
-  int enable_smooth_intra;
-  int enable_paeth_intra;
-  int enable_cfl_intra;
-  int enable_superres;
-  int enable_overlay;
-  int enable_palette;
-  int enable_intrabc;
-  int enable_angle_delta;
-  unsigned int save_as_annexb;
+  // Indicates if the reduced set of references should be enabled.
+  bool enable_reduced_reference_set;
+  // Indicates if one-sided compound should be enabled.
+  bool enable_onesided_comp;
+} RefFrameCfg;
 
-#if CONFIG_DENOISE
-  float noise_level;
-  int noise_block_size;
-#endif
+typedef struct {
+  // Indicates the color space that should be used.
+  aom_color_primaries_t color_primaries;
+  // Indicates the characteristics of transfer function to be used.
+  aom_transfer_characteristics_t transfer_characteristics;
+  // Indicates the matrix coefficients to be used for the transfer function.
+  aom_matrix_coefficients_t matrix_coefficients;
+  // Indicates the chroma 4:2:0 sample position info.
+  aom_chroma_sample_position_t chroma_sample_position;
+  // Indicates if a limited color range or full color range should be used.
+  aom_color_range_t color_range;
+} ColorCfg;
 
+typedef struct {
+  // Indicates if extreme motion vector unit test should be enabled or not.
+  unsigned int motion_vector_unit_test;
+  // Indicates if superblock multipass unit test should be enabled or not.
+  unsigned int sb_multipass_unit_test;
+} UnitTestCfg;
+
+typedef struct {
+  // Indicates the file path to the VMAF model.
+  const char *vmaf_model_path;
+  // Indicates the path to the film grain parameters.
+  const char *film_grain_table_filename;
+  // Indicates the visual tuning metric.
+  aom_tune_metric tuning;
+  // Indicates if the current content is screen or default type.
+  aom_tune_content content;
+  // Indicates the film grain parameters.
+  int film_grain_test_vector;
+} TuneCfg;
+
+typedef struct {
+  // Indicates the framerate of the input video.
+  double init_framerate;
+  // Indicates the bit-depth of the input video.
+  unsigned int input_bit_depth;
+  // Indicates the maximum number of frames to be encoded.
+  unsigned int limit;
+  // Indicates the chrome subsampling x value.
   unsigned int chroma_subsampling_x;
+  // Indicates the chrome subsampling y value.
   unsigned int chroma_subsampling_y;
-  int reduced_tx_type_set;
-  int use_intra_dct_only;
-  int use_inter_dct_only;
-  int use_intra_default_tx_only;
-  int quant_b_adapt;
-  COST_UPDATE_TYPE coeff_cost_upd_freq;
-  COST_UPDATE_TYPE mode_cost_upd_freq;
-  COST_UPDATE_TYPE mv_cost_upd_freq;
-  int border_in_pixels;
-  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
-  // Bit mask to specify which tier each of the 32 possible operating points
-  // conforms to.
-  unsigned int tier_mask;
-  // If true, encoder will use fixed QP offsets, that are either:
-  // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
-  // - Picked automatically from cq_level.
-  int use_fixed_qp_offsets;
+} InputCfg;
+
+typedef struct {
   // List of QP offsets for: keyframe, ALTREF, and 3 levels of internal ARFs.
   // If any of these values are negative, fixed offsets are disabled.
   // Uses internal q range.
   double fixed_qp_offsets[FIXED_QP_OFFSET_COUNT];
-  // min_cr / 100 is the target minimum compression ratio for each frame.
-  unsigned int min_cr;
-  const cfg_options_t *encoder_cfg;
-} AV1EncoderConfig;
+  // If true, encoder will use fixed QP offsets, that are either:
+  // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
+  // - Picked automatically from cq_level.
+  int use_fixed_qp_offsets;
+  // Indicates the minimum flatness of the quantization matrix.
+  int qm_minlevel;
+  // Indicates the maximum flatness of the quantization matrix.
+  int qm_maxlevel;
+  // Indicates if adaptive quantize_b should be enabled.
+  int quant_b_adapt;
+  // Indicates the Adaptive Quantization mode to be used.
+  AQ_MODE aq_mode;
+  // Indicates the delta q mode to be used.
+  DELTAQ_MODE deltaq_mode;
+  // Indicates if delta quantization should be enabled in chroma planes.
+  bool enable_chroma_deltaq;
+  // Indicates if encoding with quantization matrices should be enabled.
+  bool using_qm;
+} QuantizationCfg;
 
-static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
-  return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
-}
+/*!\endcond */
+/*!
+ * \brief Algorithm configuration parameters.
+ */
+typedef struct {
+  /*!
+   * Indicates the loop filter sharpness.
+   */
+  int sharpness;
+
+  /*!
+   * Indicates the trellis optimization mode of quantized coefficients.
+   * 0: disabled
+   * 1: enabled
+   * 2: enabled for rd search
+   * 3: true for estimate yrd search
+   */
+  int disable_trellis_quant;
+
+  /*!
+   * The maximum number of frames used to create an arf.
+   */
+  int arnr_max_frames;
+
+  /*!
+   * The temporal filter strength for arf used when creating ARFs.
+   */
+  int arnr_strength;
+
+  /*!
+   * Indicates the CDF update mode
+   * 0: no update
+   * 1: update on every frame(default)
+   * 2: selectively update
+   */
+  uint8_t cdf_update_mode;
+
+  /*!
+   * Indicates if RDO based on frame temporal dependency should be enabled.
+   */
+  bool enable_tpl_model;
+
+  /*!
+   * Indicates if coding of overlay frames for filtered ALTREF frames is
+   * enabled.
+   */
+  bool enable_overlay;
+} AlgoCfg;
+/*!\cond */
 
 typedef struct {
-  // obmc_probs[i][j] is the probability of OBMC being the best motion mode for
-  // jth block size and ith frame update type, averaged over past frames. If
-  // obmc_probs[i][j] < thresh, then OBMC search is pruned.
+  // Indicates the codec bit-depth.
+  aom_bit_depth_t bit_depth;
+  // Indicates the superblock size that should be used by the encoder.
+  aom_superblock_size_t superblock_size;
+  // Indicates if loopfilter modulation should be enabled.
+  bool enable_deltalf_mode;
+  // Indicates if CDEF should be enabled.
+  bool enable_cdef;
+  // Indicates if loop restoration filter should be enabled.
+  bool enable_restoration;
+  // When enabled, video mode should be used even for single frame input.
+  bool force_video_mode;
+  // Indicates if the error resiliency features should be enabled.
+  bool error_resilient_mode;
+  // Indicates if frame parallel decoding feature should be enabled.
+  bool frame_parallel_decoding_mode;
+  // Indicates if the input should be encoded as monochrome.
+  bool enable_monochrome;
+  // When enabled, the encoder will use a full header even for still pictures.
+  // When disabled, a reduced header is used for still pictures.
+  bool full_still_picture_hdr;
+  // Indicates if dual interpolation filters should be enabled.
+  bool enable_dual_filter;
+  // Indicates if frame order hint should be enabled or not.
+  bool enable_order_hint;
+  // Indicates if ref_frame_mvs should be enabled at the sequence level.
+  bool ref_frame_mvs_present;
+  // Indicates if ref_frame_mvs should be enabled at the frame level.
+  bool enable_ref_frame_mvs;
+  // Indicates if interintra compound mode is enabled.
+  bool enable_interintra_comp;
+  // Indicates if global motion should be enabled.
+  bool enable_global_motion;
+  // Indicates if palette should be enabled.
+  bool enable_palette;
+} ToolCfg;
+
+/*!\endcond */
+/*!
+ * \brief Main encoder configuration data structure.
+ */
+typedef struct AV1EncoderConfig {
+  /*!\cond */
+  // Configuration related to the input video.
+  InputCfg input_cfg;
+
+  // Configuration related to frame-dimensions.
+  FrameDimensionCfg frm_dim_cfg;
+
+  /*!\endcond */
+  /*!
+   * Encoder algorithm configuration.
+   */
+  AlgoCfg algo_cfg;
+
+  /*!
+   * Configuration related to key-frames.
+   */
+  KeyFrameCfg kf_cfg;
+
+  /*!
+   * Rate control configuration
+   */
+  RateControlCfg rc_cfg;
+  /*!\cond */
+
+  // Configuration related to Quantization.
+  QuantizationCfg q_cfg;
+
+  // Internal frame size scaling.
+  ResizeCfg resize_cfg;
+
+  // Frame Super-Resolution size scaling.
+  SuperResCfg superres_cfg;
+
+  /*!\endcond */
+  /*!
+   * stats_in buffer contains all of the stats packets produced in the first
+   * pass, concatenated.
+   */
+  aom_fixed_buf_t twopass_stats_in;
+  /*!\cond */
+
+  // Configuration related to encoder toolsets.
+  ToolCfg tool_cfg;
+
+  // Configuration related to Group of frames.
+  GFConfig gf_cfg;
+
+  // Tile related configuration parameters.
+  TileConfig tile_cfg;
+
+  // Configuration related to Tune.
+  TuneCfg tune_cfg;
+
+  // Configuration related to color.
+  ColorCfg color_cfg;
+
+  // Configuration related to decoder model.
+  DecoderModelCfg dec_model_cfg;
+
+  // Configuration related to reference frames.
+  RefFrameCfg ref_frm_cfg;
+
+  // Configuration related to unit tests.
+  UnitTestCfg unit_test_cfg;
+
+  // Flags related to motion mode.
+  MotionModeCfg motion_mode_cfg;
+
+  // Flags related to intra mode search.
+  IntraModeCfg intra_mode_cfg;
+
+  // Flags related to transform size/type.
+  TxfmSizeTypeCfg txfm_cfg;
+
+  // Flags related to compound type.
+  CompoundTypeCfg comp_type_cfg;
+
+  // Partition related information.
+  PartitionCfg part_cfg;
+
+  // Configuration related to frequency of cost update.
+  CostUpdateFreq cost_upd_freq;
+
+#if CONFIG_DENOISE
+  // Indicates the noise level.
+  float noise_level;
+  // Indicates the the denoisers block size.
+  int noise_block_size;
+#endif
+
+  // Bit mask to specify which tier each of the 32 possible operating points
+  // conforms to.
+  unsigned int tier_mask;
+
+  // Indicates the number of pixels off the edge of a reference frame we're
+  // allowed to go when forming an inter prediction.
+  int border_in_pixels;
+
+  // Indicates the maximum number of threads that may be used by the encoder.
+  int max_threads;
+
+  // Indicates the spped preset to be used.
+  int speed;
+
+  // Indicates the target sequence level index for each operating point(OP).
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+
+  // Indicates the bitstream profile to be used.
+  BITSTREAM_PROFILE profile;
+
+  /*!\endcond */
+  /*!
+   * Indicates the current encoder pass :
+   * 0 = 1 Pass encode,
+   * 1 = First pass of two pass,
+   * 2 = Second pass of two pass.
+   *
+   */
+  enum aom_enc_pass pass;
+  /*!\cond */
+
+  // Indicates if the encoding is GOOD or REALTIME.
+  MODE mode;
+
+  // Indicates if row-based multi-threading should be enabled or not.
+  bool row_mt;
+
+  // Indicates if 16bit frame buffers are to be used i.e., the content is >
+  // 8-bit.
+  bool use_highbitdepth;
+
+  // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as
+  // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex - B
+  // format.
+  bool save_as_annexb;
+
+  /*!\endcond */
+} AV1EncoderConfig;
+
+/*!\cond */
+static INLINE int is_lossless_requested(const RateControlCfg *const rc_cfg) {
+  return rc_cfg->best_allowed_q == 0 && rc_cfg->worst_allowed_q == 0;
+}
+/*!\endcond */
+
+/*!
+ * \brief Encoder-side probabilities for pruning of various AV1 tools
+ */
+typedef struct {
+  /*!
+   * obmc_probs[i][j] is the probability of OBMC being the best motion mode for
+   * jth block size and ith frame update type, averaged over past frames. If
+   * obmc_probs[i][j] < thresh, then OBMC search is pruned.
+   */
   int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
 
-  // warped_probs[i] is the probability of warped motion being the best motion
-  // mode for ith frame update type, averaged over past frames. If
-  // warped_probs[i] < thresh, then warped motion search is pruned.
+  /*!
+   * warped_probs[i] is the probability of warped motion being the best motion
+   * mode for ith frame update type, averaged over past frames. If
+   * warped_probs[i] < thresh, then warped motion search is pruned.
+   */
   int warped_probs[FRAME_UPDATE_TYPES];
 
-  // tx_type_probs[i][j][k] is the probability of kth tx_type being the best
-  // for jth transform size and ith frame update type, averaged over past
-  // frames. If tx_type_probs[i][j][k] < thresh, then transform search for that
-  // type is pruned.
+  /*!
+   * tx_type_probs[i][j][k] is the probability of kth tx_type being the best
+   * for jth transform size and ith frame update type, averaged over past
+   * frames. If tx_type_probs[i][j][k] < thresh, then transform search for that
+   * type is pruned.
+   */
   int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES];
 
-  // switchable_interp_probs[i][j][k] is the probability of kth interpolation
-  // filter being the best for jth filter context and ith frame update type,
-  // averaged over past frames. If switchable_interp_probs[i][j][k] < thresh,
-  // then interpolation filter search is pruned for that case.
+  /*!
+   * switchable_interp_probs[i][j][k] is the probability of kth interpolation
+   * filter being the best for jth filter context and ith frame update type,
+   * averaged over past frames. If switchable_interp_probs[i][j][k] < thresh,
+   * then interpolation filter search is pruned for that case.
+   */
   int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS]
                              [SWITCHABLE_FILTERS];
 } FrameProbInfo;
 
+/*!\cond */
+
 typedef struct FRAME_COUNTS {
 // Note: This structure should only contain 'unsigned int' fields, or
 // aggregates built solely from 'unsigned int' fields/elements
@@ -544,7 +1021,7 @@
   unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
   unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
   unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
-  unsigned int skip[SKIP_CONTEXTS][2];
+  unsigned int skip_txfm[SKIP_CONTEXTS][2];
   unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
   unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
   unsigned int delta_q[DELTA_Q_PROBS][2];
@@ -591,35 +1068,56 @@
 // TODO(angiebird): This is an estimated size. We still need to figure what is
 // the maximum number of modes.
 #define MAX_INTER_MODES 1024
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_mode_info, which makes this terribly confusing.
+/*!\endcond */
+/*!
+ * \brief Struct used to hold inter mode data for fast tx search.
+ *
+ * This struct is used to perform a full transform search only on winning
+ * candidates searched with an estimate for transform coding RD.
+ */
 typedef struct inter_modes_info {
+  /*!
+   * The number of inter modes for which data was stored in each of the
+   * following arrays.
+   */
   int num;
+  /*!
+   * Mode info struct for each of the candidate modes.
+   */
   MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+  /*!
+   * The rate for each of the candidate modes.
+   */
   int mode_rate_arr[MAX_INTER_MODES];
+  /*!
+   * The sse of the predictor for each of the candidate modes.
+   */
   int64_t sse_arr[MAX_INTER_MODES];
+  /*!
+   * The estimated rd of the predictor for each of the candidate modes.
+   */
   int64_t est_rd_arr[MAX_INTER_MODES];
+  /*!
+   * The rate and mode index for each of the candidate modes.
+   */
   RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats for each of the candidate modes.
+   */
   RD_STATS rd_cost_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats of luma only for each of the candidate modes.
+   */
   RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats of chroma only for each of the candidate modes.
+   */
   RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
 } InterModesInfo;
 
-// Encoder row synchronization
-typedef struct AV1RowMTSyncData {
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t *mutex_;
-  pthread_cond_t *cond_;
-#endif
-  // Allocate memory to store the sb/mb block index in each row.
-  int *cur_col;
-  int sync_range;
-  int rows;
-} AV1RowMTSync;
-
-typedef struct AV1RowMTInfo {
-  int current_mi_row;
-  int num_threads_working;
-} AV1RowMTInfo;
-
+/*!\cond */
 typedef struct {
   // TODO(kyslov): consider changing to 64bit
 
@@ -669,46 +1167,81 @@
   VP64x64 *split;
 } VP128x128;
 
+/*!\endcond */
+
+/*!
+ * \brief Thresholds for variance based partitioning.
+ */
 typedef struct {
-  // Thresholds for variance based partitioning. If block variance > threshold,
-  // then that block is forced to split.
-  // thresholds[0] - threshold for 128x128;
-  // thresholds[1] - threshold for 64x64;
-  // thresholds[2] - threshold for 32x32;
-  // thresholds[3] - threshold for 16x16;
-  // thresholds[4] - threshold for 8x8;
+  /*!
+   * If block variance > threshold, then that block is forced to split.
+   * thresholds[0] - threshold for 128x128;
+   * thresholds[1] - threshold for 64x64;
+   * thresholds[2] - threshold for 32x32;
+   * thresholds[3] - threshold for 16x16;
+   * thresholds[4] - threshold for 8x8;
+   */
   int64_t thresholds[5];
 
-  // MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual
-  // minmax > threshold_minmax, the 16x16 is forced to split.
+  /*!
+   * MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual
+   * minmax > threshold_minmax, the 16x16 is forced to split.
+   */
   int64_t threshold_minmax;
 } VarBasedPartitionInfo;
 
+/*!
+ * \brief Encoder parameters for synchronization of row based multi-threading
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+  /**
+   * \name Synchronization objects for top-right dependency.
+   */
+  /**@{*/
+  pthread_mutex_t *mutex_; /*!< Mutex lock object */
+  pthread_cond_t *cond_;   /*!< Condition variable */
+  /**@}*/
+#endif  // CONFIG_MULTITHREAD
+  /*!
+   * Buffer to store the superblock whose encoding is complete.
+   * cur_col[i] stores the number of superblocks which finished encoding in the
+   * ith superblock row.
+   */
+  int *num_finished_cols;
+  /*!
+   * Number of extra superblocks of the top row to be complete for encoding
+   * of the current superblock to start. A value of 1 indicates top-right
+   * dependency.
+   */
+  int sync_range;
+  /*!
+   * Number of superblock rows.
+   */
+  int rows;
+  /*!
+   * The superblock row (in units of MI blocks) to be processed next.
+   */
+  int next_mi_row;
+  /*!
+   * Number of threads processing the current tile.
+   */
+  int num_threads_working;
+} AV1EncRowMultiThreadSync;
+
+/*!\cond */
+
 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
 typedef struct TileDataEnc {
   TileInfo tile_info;
-  CFL_CTX cfl;
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
   FRAME_CONTEXT *row_ctx;
   uint8_t allow_update_cdf;
   InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-  AV1RowMTSync row_mt_sync;
-  AV1RowMTInfo row_mt_info;
+  AV1EncRowMultiThreadSync row_mt_sync;
+  MV firstpass_top_mv;
 } TileDataEnc;
 
-typedef struct {
-  TOKENEXTRA *start;
-  TOKENEXTRA *stop;
-  unsigned int count;
-} TOKENLIST;
-
-typedef struct MultiThreadHandle {
-  int allocated_tile_rows;
-  int allocated_tile_cols;
-  int allocated_sb_rows;
-  int thread_id_to_tile_id[MAX_NUM_THREADS];  // Mapping of threads to tiles
-} MultiThreadHandle;
-
 typedef struct RD_COUNTS {
   int64_t comp_pred_diff[REFERENCE_MODES];
   // Stores number of 4x4 blocks using global motion per reference frame.
@@ -724,47 +1257,181 @@
   MACROBLOCK mb;
   RD_COUNTS rd_counts;
   FRAME_COUNTS *counts;
-  PC_TREE *pc_tree;
-  PC_TREE *pc_root;
-  tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
-  tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
-  tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
+  PC_TREE_SHARED_BUFFERS shared_coeff_buf;
+  SIMPLE_MOTION_DATA_TREE *sms_tree;
+  SIMPLE_MOTION_DATA_TREE *sms_root;
   InterModesInfo *inter_modes_info;
   uint32_t *hash_value_buffer[2][2];
-  int32_t *wsrc_buf;
-  int32_t *mask_buf;
-  uint8_t *above_pred_buf;
-  uint8_t *left_pred_buf;
+  OBMCBuffer obmc_buffer;
   PALETTE_BUFFER *palette_buffer;
   CompoundTypeRdBuffers comp_rd_buffer;
   CONV_BUF_TYPE *tmp_conv_dst;
-  uint8_t *tmp_obmc_bufs[2];
+  uint8_t *tmp_pred_bufs[2];
   int intrabc_used;
   int deltaq_used;
   FRAME_CONTEXT *tctx;
-  MB_MODE_INFO_EXT *mbmi_ext;
   VP64x64 *vt64x64;
   int32_t num_64x64_blocks;
+  PICK_MODE_CONTEXT *firstpass_ctx;
+  TemporalFilterData tf_data;
 } ThreadData;
 
 struct EncWorkerData;
 
+/*!\endcond */
+
+/*!
+ * \brief Encoder data related to row-based multi-threading
+ */
+typedef struct {
+  /*!
+   * Number of tile rows for which row synchronization memory is allocated.
+   */
+  int allocated_tile_rows;
+  /*!
+   * Number of tile cols for which row synchronization memory is allocated.
+   */
+  int allocated_tile_cols;
+  /*!
+   * Number of rows for which row synchronization memory is allocated
+   * per tile. During first-pass/look-ahead stage this equals the
+   * maximum number of macroblock rows in a tile. During encode stage,
+   * this equals the maximum number of superblock rows in a tile.
+   */
+  int allocated_rows;
+  /*!
+   * Number of columns for which entropy context memory is allocated
+   * per tile. During encode stage, this equals the maximum number of
+   * superblock columns in a tile minus 1. The entropy context memory
+   * is not allocated during first-pass/look-ahead stage.
+   */
+  int allocated_cols;
+
+  /*!
+   * thread_id_to_tile_id[i] indicates the tile id assigned to the ith thread.
+   */
+  int thread_id_to_tile_id[MAX_NUM_THREADS];
+
+#if CONFIG_MULTITHREAD
+  /*!
+   * Mutex lock used while dispatching jobs.
+   */
+  pthread_mutex_t *mutex_;
+#endif
+
+  /**
+   * \name Row synchronization related function pointers.
+   */
+  /**@{*/
+  /*!
+   * Reader.
+   */
+  void (*sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+  /*!
+   * Writer.
+   */
+  void (*sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+  /**@}*/
+} AV1EncRowMultiThreadInfo;
+
+/*!
+ * \brief Encoder parameters related to multi-threading.
+ */
+typedef struct {
+  /*!
+   * Number of workers created for multi-threading.
+   */
+  int num_workers;
+
+  /*!
+   * Number of workers created for tpl and tile/row multi-threading of encoder.
+   */
+  int num_enc_workers;
+
+  /*!
+   * Number of workers created for first-pass multi-threading.
+   */
+  int num_fp_workers;
+
+  /*!
+   * Synchronization object used to launch job in the worker thread.
+   */
+  AVxWorker *workers;
+
+  /*!
+   * Data specific to each worker in encoder multi-threading.
+   * tile_thr_data[i] stores the worker data of the ith thread.
+   */
+  struct EncWorkerData *tile_thr_data;
+
+  /*!
+   * When set, indicates that row based multi-threading of the encoder is
+   * enabled.
+   */
+  bool row_mt_enabled;
+
+  /*!
+   * Encoder row multi-threading data.
+   */
+  AV1EncRowMultiThreadInfo enc_row_mt;
+
+  /*!
+   * Tpl row multi-threading data.
+   */
+  AV1TplRowMultiThreadInfo tpl_row_mt;
+
+  /*!
+   * Loop Filter multi-threading object.
+   */
+  AV1LfSync lf_row_sync;
+
+  /*!
+   * Loop Restoration multi-threading object.
+   */
+  AV1LrSync lr_row_sync;
+
+  /*!
+   * Global Motion multi-threading object.
+   */
+  AV1GlobalMotionSync gm_sync;
+
+  /*!
+   * Temporal Filter multi-threading object.
+   */
+  AV1TemporalFilterSync tf_sync;
+} MultiThreadInfo;
+
+/*!\cond */
+
 typedef struct ActiveMap {
   int enabled;
   int update;
   unsigned char *map;
 } ActiveMap;
 
+/*!\endcond */
+
+/*!
+ * \brief Encoder info used for decision on forcing integer motion vectors.
+ */
 typedef struct {
-  // cs_rate_array[i] is the fraction of blocks in a frame which either match
-  // with the collocated block or are smooth, where i is the rate_index.
+  /*!
+   * cs_rate_array[i] is the fraction of blocks in a frame which either match
+   * with the collocated block or are smooth, where i is the rate_index.
+   */
   double cs_rate_array[32];
-  // rate_index is used to index cs_rate_array.
+  /*!
+   * rate_index is used to index cs_rate_array.
+   */
   int rate_index;
-  // rate_size is the total number of entries populated in cs_rate_array.
+  /*!
+   * rate_size is the total number of entries populated in cs_rate_array.
+   */
   int rate_size;
 } ForceIntegerMVInfo;
 
+/*!\cond */
+
 #if CONFIG_INTERNAL_STATS
 // types of stats
 enum {
@@ -786,17 +1453,31 @@
   YV12_BUFFER_CONFIG buf;
 } EncRefCntBuffer;
 
+/*!\endcond */
+
+/*!
+ * \brief Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level
+ *
+ * This is used for bitstream preparation.
+ */
 typedef struct {
-  // Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level for
-  // use in bitstream preparation. frame_base[mi_row * stride + mi_col] stores
-  // the mode information of block (mi_row,mi_col).
+  /*!
+   * frame_base[mi_row * stride + mi_col] stores the mode information of
+   * block (mi_row,mi_col).
+   */
   MB_MODE_INFO_EXT_FRAME *frame_base;
-  // Size of frame_base buffer.
+  /*!
+   * Size of frame_base buffer.
+   */
   int alloc_size;
-  // Stride of frame_base buffer.
+  /*!
+   * Stride of frame_base buffer.
+   */
   int stride;
 } MBMIExtFrameBufferInfo;
 
+/*!\cond */
+
 #if CONFIG_COLLECT_PARTITION_STATS == 2
 typedef struct PartitionStats {
   int partition_decisions[6][EXT_PARTITION_TYPES];
@@ -811,6 +1492,11 @@
 #include "aom_ports/aom_timer.h"
 // Adjust the following to add new components.
 enum {
+  av1_encode_strategy_time,
+  av1_get_second_pass_params_time,
+  denoise_and_encode_time,
+  apply_filtering_time,
+  av1_tpl_setup_stats_time,
   encode_frame_to_data_rate_time,
   encode_with_recode_loop_time,
   loop_filter_time,
@@ -820,13 +1506,25 @@
   av1_encode_frame_time,
   av1_compute_global_motion_time,
   av1_setup_motion_field_time,
-  encode_sb_time,
+  encode_sb_row_time,
+
   rd_pick_partition_time,
+  av1_prune_partitions_time,
+  none_partition_search_time,
+  split_partition_search_time,
+  rectangular_partition_search_time,
+  ab_partitions_search_time,
+  rd_pick_4partition_time,
+  encode_sb_time,
+
   rd_pick_sb_modes_time,
   av1_rd_pick_intra_mode_sb_time,
   av1_rd_pick_inter_mode_sb_time,
+  handle_inter_mode_time,
+  evaluate_motion_mode_for_winner_candidates_time,
   handle_intra_mode_time,
   do_tx_search_time,
+  av1_search_palette_mode_time,
   handle_newmv_time,
   compound_type_rd_time,
   interpolation_filter_search_time,
@@ -836,6 +1534,12 @@
 
 static INLINE char const *get_component_name(int index) {
   switch (index) {
+    case av1_encode_strategy_time: return "av1_encode_strategy_time";
+    case av1_get_second_pass_params_time:
+      return "av1_get_second_pass_params_time";
+    case denoise_and_encode_time: return "denoise_and_encode_time";
+    case apply_filtering_time: return "apply_filtering_time";
+    case av1_tpl_setup_stats_time: return "av1_tpl_setup_stats_time";
     case encode_frame_to_data_rate_time:
       return "encode_frame_to_data_rate_time";
     case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
@@ -847,15 +1551,29 @@
     case av1_compute_global_motion_time:
       return "av1_compute_global_motion_time";
     case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
-    case encode_sb_time: return "encode_sb_time";
+    case encode_sb_row_time: return "encode_sb_row_time";
+
     case rd_pick_partition_time: return "rd_pick_partition_time";
+    case av1_prune_partitions_time: return "av1_prune_partitions_time";
+    case none_partition_search_time: return "none_partition_search_time";
+    case split_partition_search_time: return "split_partition_search_time";
+    case rectangular_partition_search_time:
+      return "rectangular_partition_search_time";
+    case ab_partitions_search_time: return "ab_partitions_search_time";
+    case rd_pick_4partition_time: return "rd_pick_4partition_time";
+    case encode_sb_time: return "encode_sb_time";
+
     case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
     case av1_rd_pick_intra_mode_sb_time:
       return "av1_rd_pick_intra_mode_sb_time";
     case av1_rd_pick_inter_mode_sb_time:
       return "av1_rd_pick_inter_mode_sb_time";
+    case handle_inter_mode_time: return "handle_inter_mode_time";
+    case evaluate_motion_mode_for_winner_candidates_time:
+      return "evaluate_motion_mode_for_winner_candidates_time";
     case handle_intra_mode_time: return "handle_intra_mode_time";
     case do_tx_search_time: return "do_tx_search_time";
+    case av1_search_palette_mode_time: return "av1_search_palette_mode_time";
     case handle_newmv_time: return "handle_newmv_time";
     case compound_type_rd_time: return "compound_type_rd_time";
     case interpolation_filter_search_time:
@@ -870,138 +1588,318 @@
 // The maximum number of internal ARFs except ALTREF_FRAME
 #define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
 
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to global motion search
+ */
 typedef struct {
-  // Array to store the cost for signalling each global motion model.
-  // gmtype_cost[i] stores the cost of signalling the ith Global Motion model.
+  /*!
+   * Array to store the cost for signalling each global motion model.
+   * gmtype_cost[i] stores the cost of signalling the ith Global Motion model.
+   */
   int type_cost[TRANS_TYPES];
 
-  // Array to store the cost for signalling a particular global motion model for
-  // each reference frame. gmparams_cost[i] stores the cost of signalling global
-  // motion for the ith reference frame.
+  /*!
+   * Array to store the cost for signalling a particular global motion model for
+   * each reference frame. gmparams_cost[i] stores the cost of signalling global
+   * motion for the ith reference frame.
+   */
   int params_cost[REF_FRAMES];
 
-  // Flag to indicate if global motion search needs to be rerun.
+  /*!
+   * Flag to indicate if global motion search needs to be rerun.
+   */
   bool search_done;
+
+  /*!
+   * Array of pointers to the frame buffers holding the reference frames.
+   * ref_buf[i] stores the pointer to the reference frame of the ith
+   * reference frame type.
+   */
+  YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
+
+  /*!
+   * Pointer to the source frame buffer.
+   */
+  unsigned char *src_buffer;
+
+  /*!
+   * Holds the number of valid reference frames in past and future directions
+   * w.r.t. the current frame. num_ref_frames[i] stores the total number of
+   * valid reference frames in 'i' direction.
+   */
+  int num_ref_frames[MAX_DIRECTIONS];
+
+  /*!
+   * Array of structure which stores the valid reference frames in past and
+   * future directions and their corresponding distance from the source frame.
+   * reference_frames[i][j] holds the jth valid reference frame type in the
+   * direction 'i' and its temporal distance from the source frame .
+   */
+  FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1];
+
+  /**
+   * \name Dimensions for which segment map is allocated.
+   */
+  /**@{*/
+  int segment_map_w; /*!< segment map width */
+  int segment_map_h; /*!< segment map height */
+  /**@}*/
+
+  /*!
+   * Holds the total number of corner points detected in the source frame.
+   */
+  int num_src_corners;
+
+  /*!
+   * Holds the x and y co-ordinates of the corner points detected in the source
+   * frame. src_corners[i] holds the x co-ordinate and src_corners[i+1] holds
+   * the y co-ordinate of the ith corner point detected.
+   */
+  int src_corners[2 * MAX_CORNERS];
 } GlobalMotionInfo;
 
+/*!
+ * \brief Initial frame dimensions
+ *
+ * Tracks the frame dimensions using which:
+ *  - Frame buffers (like altref and util frame buffers) were allocated
+ *  - Motion estimation related initializations were done
+ * This structure is helpful to reallocate / reinitialize the above when there
+ * is a change in frame dimensions.
+ */
 typedef struct {
-  // Stores the default value of skip flag depending on chroma format
-  // Set as 1 for monochrome and 3 for other color formats
+  int width;  /*!< initial width */
+  int height; /*!< initial height */
+} InitialDimensions;
+
+/*!
+ * \brief Flags related to interpolation filter search
+ */
+typedef struct {
+  /*!
+   * Stores the default value of skip flag depending on chroma format
+   * Set as 1 for monochrome and 3 for other color formats
+   */
   int default_interp_skip_flags;
-  // Filter mask to allow certain interp_filter type.
+  /*!
+   * Filter mask to allow certain interp_filter type.
+   */
   uint16_t interp_filter_search_mask;
 } InterpSearchFlags;
 
+/*!
+ * \brief Parameters for motion vector search process
+ */
 typedef struct {
-  // Largest MV component used in a frame.
-  // The value from the previous frame is used to set the full pixel search
-  // range for the current frame.
+  /*!
+   * Largest MV component used in a frame.
+   * The value from the previous frame is used to set the full pixel search
+   * range for the current frame.
+   */
   int max_mv_magnitude;
-  // Parameter indicating initial search window to be used in full-pixel search.
-  // Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+  /*!
+   * Parameter indicating initial search window to be used in full-pixel search.
+   * Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+   */
   int mv_step_param;
-  // Pointer to sub-pixel search function.
-  // In encoder: av1_find_best_sub_pixel_tree
-  //             av1_find_best_sub_pixel_tree_pruned
-  //             av1_find_best_sub_pixel_tree_pruned_more
-  //             av1_find_best_sub_pixel_tree_pruned_evenmore
-  // In MV unit test: av1_return_max_sub_pixel_mv
-  //                  av1_return_min_sub_pixel_mv
+  /*!
+   * Pointer to sub-pixel search function.
+   * In encoder: av1_find_best_sub_pixel_tree
+   *             av1_find_best_sub_pixel_tree_pruned
+   *             av1_find_best_sub_pixel_tree_pruned_more
+   * In MV unit test: av1_return_max_sub_pixel_mv
+   *                  av1_return_min_sub_pixel_mv
+   */
   fractional_mv_step_fp *find_fractional_mv_step;
-  // Search site configuration for full-pel MV search.
-  // ss_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple motion
-  // search.
-  // ss_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal filter
-  // ss_cfg[SS_CFG_FPF]: Used during first pass and lookahead
-  search_site_config ss_cfg[SS_CFG_TOTAL];
+  /*!
+   * Search site configuration for full-pel MV search.
+   * search_site_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple
+   * motion search. search_site_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal
+   * filter search_site_cfg[SS_CFG_FPF]: Used during first pass and lookahead
+   */
+  search_site_config search_site_cfg[SS_CFG_TOTAL][NUM_DISTINCT_SEARCH_METHODS];
 } MotionVectorSearchParams;
 
+/*!
+ * \brief Refresh frame flags for different type of frames.
+ *
+ * If the refresh flag is true for a particular reference frame, after the
+ * current frame is encoded, the reference frame gets refreshed (updated) to
+ * be the current frame. Note: Usually at most one flag will be set to true at
+ * a time. But, for key-frames, all flags are set to true at once.
+ */
 typedef struct {
-  // When resize is triggered externally, the desired dimensions are stored in
-  // this struct until used in the next frame to be coded. These values are
-  // effective only for one frame and are reset after they are used.
-  int width;
-  int height;
+  bool golden_frame;  /*!< Refresh flag for golden frame */
+  bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+  bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+} RefreshFrameFlagsInfo;
+
+/*!
+ * \brief Desired dimensions for an externally triggered resize.
+ *
+ * When resize is triggered externally, the desired dimensions are stored in
+ * this struct until used in the next frame to be coded. These values are
+ * effective only for one frame and are reset after they are used.
+ */
+typedef struct {
+  int width;  /*!< Desired resized width */
+  int height; /*!< Desired resized height */
 } ResizePendingParams;
 
+/*!
+ * \brief Refrence frame distance related variables.
+ */
 typedef struct {
-  // Threshold of transform domain distortion
-  // Index 0: Default mode evaluation, Winner mode processing is not applicable
-  // (Eg : IntraBc).
-  // Index 1: Mode evaluation.
-  // Index 2: Winner mode evaluation.
-  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
-  // speed feature is ON
-  unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+  /*!
+   * True relative distance of reference frames w.r.t. the current frame.
+   */
+  int ref_relative_dist[INTER_REFS_PER_FRAME];
+  /*!
+   * The nearest reference w.r.t. current frame in the past.
+   */
+  int8_t nearest_past_ref;
+  /*!
+   * The nearest reference w.r.t. current frame in the future.
+   */
+  int8_t nearest_future_ref;
+} RefFrameDistanceInfo;
 
-  // Factor to control R-D optimization of coeffs based on block
-  // mse.
-  // Index 0: Default mode evaluation, Winner mode processing is not applicable
-  // (Eg : IntraBc). Index 1: Mode evaluation.
-  // Index 2: Winner mode evaluation
-  // Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
-  // feature is ON
+/*!
+ * \brief Parameters used for winner mode processing.
+ *
+ * This is a basic two pass approach: in the first pass, we reduce the number of
+ * transform searches based on some thresholds during the rdopt process to find
+ * the  "winner mode". In the second pass, we perform a more through tx search
+ * on the winner mode.
+ * There are some arrays in the struct, and their indices are used in the
+ * following manner:
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable
+ * (Eg : IntraBc).
+ * Index 1: Mode evaluation.
+ * Index 2: Winner mode evaluation
+ * Index 1 and 2 are only used when the respective speed feature is on.
+ */
+typedef struct {
+  /*!
+   * Threshold to determine the best number of transform coefficients to keep
+   * using trellis optimization.
+   * Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+   */
   unsigned int coeff_opt_dist_threshold[MODE_EVAL_TYPES];
 
-  // Transform size to be used in transform search
-  // Index 0: Default mode evaluation, Winner mode processing is not applicable
-  // (Eg : IntraBc).
-  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
-  // Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed
-  // feature is ON
+  /*!
+   * Threshold to determine if trellis optimization is to be enabled
+   * based on SATD.
+   * Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+   */
+  unsigned int coeff_opt_satd_threshold[MODE_EVAL_TYPES];
+
+  /*!
+   * Determines the tx size search method during rdopt.
+   * Corresponds to enable_winner_mode_for_tx_size_srch speed feature.
+   */
   TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
 
-  // Transform domain distortion levels
-  // Index 0: Default mode evaluation, Winner mode processing is not applicable
-  // (Eg : IntraBc).
-  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
-  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
-  // speed feature is ON
+  /*!
+   * Controls how often we should approximate prediction error with tx
+   * coefficients. If it's 0, then never. If 1, then it's during the tx_type
+   * search only. If 2, then always.
+   * Corresponds to tx_domain_dist_level speed feature.
+   */
   unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
 
-  // Predict transform skip levels to be used for default, mode and winner mode
-  // evaluation. Index 0: Default mode evaluation, Winner mode processing is not
-  // applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
-  unsigned int predict_skip_level[MODE_EVAL_TYPES];
+  /*!
+   * Threshold to approximate pixel domain distortion with transform domain
+   * distortion. This is only used if use_txform_domain_distortion is on.
+   * Corresponds to enable_winner_mode_for_use_tx_domain_dist speed feature.
+   */
+  unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+  /*!
+   * Controls how often we should try to skip the transform process based on
+   * result from dct.
+   * Corresponds to use_skip_flag_prediction speed feature.
+   */
+  unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+  /*!
+   * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+   * Index 0: Default mode evaluation, Winner mode processing is not applicable.
+   * Index 1: Mode evaluation, Index 2: Winner mode evaluation
+   */
+  unsigned int predict_dc_level[MODE_EVAL_TYPES];
 } WinnerModeParams;
 
+/*!
+ * \brief Frame refresh flags set by the external interface.
+ *
+ * Flags set by external interface to determine which reference buffers are
+ * refreshed by this frame. When set, the encoder will update the particular
+ * reference frame buffer with the contents of the current frame.
+ */
 typedef struct {
-  // Bit mask to disable certain reference frame types.
+  bool last_frame;     /*!< Refresh flag for last frame */
+  bool golden_frame;   /*!< Refresh flag for golden frame */
+  bool bwd_ref_frame;  /*!< Refresh flag for bwd-ref frame */
+  bool alt2_ref_frame; /*!< Refresh flag for alt2-ref frame */
+  bool alt_ref_frame;  /*!< Refresh flag for alt-ref frame */
+  /*!
+   * Flag indicating if the update of refresh frame flags is pending.
+   */
+  bool update_pending;
+} ExtRefreshFrameFlagsInfo;
+
+/*!
+ * \brief Flags signalled by the external interface at frame level.
+ */
+typedef struct {
+  /*!
+   * Bit mask to disable certain reference frame types.
+   */
   int ref_frame_flags;
 
-  // Flags to determine which reference buffers are refreshed by this frame.
-  // When set, the encoder will update the particular reference frame buffer
-  // with the contents of the current frame.
-  bool refresh_last_frame;
-  bool refresh_golden_frame;
-  bool refresh_bwd_ref_frame;
-  bool refresh_alt2_ref_frame;
-  bool refresh_alt_ref_frame;
+  /*!
+   * Frame refresh flags set by the external interface.
+   */
+  ExtRefreshFrameFlagsInfo refresh_frame;
 
-  // Flag to indicate that updation of refresh frame flags from external
-  // interface is pending.
-  bool refresh_frame_flags_pending;
-
-  // Flag to enable the updation of frame contexts at the end of a frame decode.
+  /*!
+   * Flag to enable the update of frame contexts at the end of a frame decode.
+   */
   bool refresh_frame_context;
 
-  // Flag to indicate that updation of refresh_frame_context from external
-  // interface is pending.
+  /*!
+   * Flag to indicate that update of refresh_frame_context from external
+   * interface is pending.
+   */
   bool refresh_frame_context_pending;
 
-  // Flag to enable temporal MV prediction.
+  /*!
+   * Flag to enable temporal MV prediction.
+   */
   bool use_ref_frame_mvs;
 
-  // Flag to code the frame as error-resilient.
+  /*!
+   * Indicates whether the current frame is to be coded as error resilient.
+   */
   bool use_error_resilient;
 
-  // Flag to code the frame as s-frame.
+  /*!
+   * Indicates whether the current frame is to be coded as s-frame.
+   */
   bool use_s_frame;
 
-  // Flag to set the frame's primary_ref_frame to PRIMARY_REF_NONE.
+  /*!
+   * Indicates whether the current frame's primary_ref_frame is set to
+   * PRIMARY_REF_NONE.
+   */
   bool use_primary_ref_none;
 } ExternalFlags;
 
+/*!\cond */
+
 typedef struct {
   int arf_stack[FRAME_BUFFERS];
   int arf_stack_size;
@@ -1040,6 +1938,14 @@
 } MV_STATS;
 
 typedef struct {
+  struct loopfilter lf;
+  CdefInfo cdef_info;
+  YV12_BUFFER_CONFIG copy_buffer;
+  RATE_CONTROL rc;
+  MV_STATS mv_stats;
+} CODING_CONTEXT;
+
+typedef struct {
   int frame_width;
   int frame_height;
   int mi_rows;
@@ -1052,155 +1958,365 @@
   int subsampling_y;
 } FRAME_INFO;
 
+/*!\endcond */
+
+/*!
+ * \brief Segmentation related information for the current frame.
+ */
 typedef struct {
-  // 3-bit number containing the segment affiliation for each 4x4 block in the
-  // frame. map[y * stride + x] contains the segment id of the 4x4 block at
-  // (x,y) position.
+  /*!
+   * 3-bit number containing the segment affiliation for each 4x4 block in the
+   * frame. map[y * stride + x] contains the segment id of the 4x4 block at
+   * (x,y) position.
+   */
   uint8_t *map;
-  // Flag to indicate if current frame has lossless segments or not.
-  // 1: frame has at least one lossless segment.
-  // 0: frame has no lossless segments.
+  /*!
+   * Flag to indicate if current frame has lossless segments or not.
+   * 1: frame has at least one lossless segment.
+   * 0: frame has no lossless segments.
+   */
   bool has_lossless_segment;
 } EncSegmentationInfo;
 
+/*!
+ * \brief Frame time stamps.
+ */
 typedef struct {
-  // Start time stamp of the previous frame
-  int64_t prev_start_seen;
-  // End time stamp of the previous frame
-  int64_t prev_end_seen;
-  // Start time stamp of the first frame
-  int64_t first_ever;
+  /*!
+   * Start time stamp of the previous frame
+   */
+  int64_t prev_ts_start;
+  /*!
+   * End time stamp of the previous frame
+   */
+  int64_t prev_ts_end;
+  /*!
+   * Start time stamp of the first frame
+   */
+  int64_t first_ts_start;
 } TimeStamps;
 
+/*!
+ * Pointers to the memory allocated for frame level transform coeff related
+ * info.
+ */
+typedef struct {
+  /*!
+   * Pointer to the transformed coefficients buffer.
+   */
+  tran_low_t *tcoeff;
+  /*!
+   * Pointer to the eobs buffer.
+   */
+  uint16_t *eobs;
+  /*!
+   * Pointer to the entropy_ctx buffer.
+   */
+  uint8_t *entropy_ctx;
+} CoeffBufferPool;
+
+/*!
+ * \brief Top level encoder structure.
+ */
 typedef struct AV1_COMP {
-  // Quantization and dequantization parameters for internal quantizer setup
-  // in the encoder.
+  /*!
+   * Quantization and dequantization parameters for internal quantizer setup
+   * in the encoder.
+   */
   EncQuantDequantParams enc_quant_dequant_params;
+
+  /*!
+   * Structure holding thread specific variables.
+   */
   ThreadData td;
+
+  /*!
+   * Statistics collected at frame level.
+   */
   FRAME_COUNTS counts;
 
-  // Holds buffer storing mode information at 4x4/8x8 level.
+  /*!
+   * Holds buffer storing mode information at 4x4/8x8 level.
+   */
   MBMIExtFrameBufferInfo mbmi_ext_info;
 
+  /*!
+   * Buffer holding the transform block related information.
+   * coeff_buffer_base[i] stores the transform block related information of the
+   * ith superblock in raster scan order.
+   */
   CB_COEFF_BUFFER *coeff_buffer_base;
-  AV1_COMMON common;
-  AV1EncoderConfig oxcf;
-  struct lookahead_ctx *lookahead;
-  int no_show_kf;
 
+  /*!
+   * Structure holding pointers to frame level memory allocated for transform
+   * block related information.
+   */
+  CoeffBufferPool coeff_buffer_pool;
+
+  /*!
+   * Structure holding variables common to encoder and decoder.
+   */
+  AV1_COMMON common;
+
+  /*!
+   * Encoder configuration related parameters.
+   */
+  AV1EncoderConfig oxcf;
+
+  /*!
+   * Look-ahead context.
+   */
+  struct lookahead_ctx *lookahead;
+
+  /*!
+   * When set, this flag indicates that the current frame is a forward keyframe.
+   */
+  int no_show_fwd_kf;
+
+  /*!
+   * Stores the trellis optimization type at segment level.
+   * optimize_seg_arr[i] stores the trellis opt type for ith segment.
+   */
   TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS];
 
+  /*!
+   * Pointer to the frame buffer holding the source frame to be used during the
+   * current stage of encoding. It can be the raw input, temporally filtered
+   * input or scaled input.
+   */
   YV12_BUFFER_CONFIG *source;
-  YV12_BUFFER_CONFIG *last_source;  // NULL for first frame and alt_ref frames
+
+  /*!
+   * Pointer to the frame buffer holding the last raw source frame.
+   * NULL for first frame and alt_ref frames.
+   */
+  YV12_BUFFER_CONFIG *last_source;
+
+  /*!
+   * Pointer to the frame buffer holding the unscaled source frame.
+   * It can be either the raw input or temporally filtered input.
+   */
   YV12_BUFFER_CONFIG *unscaled_source;
+
+  /*!
+   * Frame buffer holding the resized source frame (cropping / superres).
+   */
   YV12_BUFFER_CONFIG scaled_source;
+
+  /*!
+   * Pointer to the frame buffer holding the unscaled last source frame.
+   */
   YV12_BUFFER_CONFIG *unscaled_last_source;
+
+  /*!
+   * Frame buffer holding the resized last source frame.
+   */
   YV12_BUFFER_CONFIG scaled_last_source;
+
+  /*!
+   * Pointer to the original source frame. This is used to determine if the
+   * content is screen.
+   */
   YV12_BUFFER_CONFIG *unfiltered_source;
 
+  /*!
+   * Parameters related to tpl.
+   */
   TplParams tpl_data;
 
-  // For a still frame, this flag is set to 1 to skip partition search.
+  /*!
+   * Temporal filter context.
+   */
+  TemporalFilterCtx tf_ctx;
+
+  /*!
+   * For a still frame, this flag is set to 1 to skip partition search.
+   */
   int partition_search_skippable_frame;
 
-  // Variables related to forcing integer mv decisions for the current frame.
+  /*!
+   * Variables related to forcing integer mv decisions for the current frame.
+   */
   ForceIntegerMVInfo force_intpel_info;
 
-  unsigned int row_mt;
+  /*!
+   * Pointer to the buffer holding the scaled reference frames.
+   * scaled_ref_buf[i] holds the scaled reference frame of type i.
+   */
   RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
 
-  RefCntBuffer *last_show_frame_buf;  // last show frame buffer
+  /*!
+   * Pointer to the buffer holding the last show frame.
+   */
+  RefCntBuffer *last_show_frame_buf;
 
-  // refresh_*_frame are boolean flags. If 'refresh_xyz_frame' is true, then
-  // after the current frame is encoded, the XYZ reference frame gets refreshed
-  // (updated) to be the current frame.
-  //
-  // Note: Usually at most one of these refresh flags is true at a time.
-  // But a key-frame is special, for which all the flags are true at once.
-  int refresh_golden_frame;
-  int refresh_bwd_ref_frame;
-  int refresh_alt_ref_frame;
+  /*!
+   * Refresh frame flags for golden, bwd-ref and alt-ref frames.
+   */
+  RefreshFrameFlagsInfo refresh_frame;
 
-  // For each type of reference frame, this contains the index of a reference
-  // frame buffer for a reference frame of the same type.  We use this to
-  // choose our primary reference frame (which is the most recent reference
-  // frame of the same type as the current frame).
+  /*!
+   * For each type of reference frame, this contains the index of a reference
+   * frame buffer for a reference frame of the same type.  We use this to
+   * choose our primary reference frame (which is the most recent reference
+   * frame of the same type as the current frame).
+   */
   int fb_of_context_type[REF_FRAMES];
 
-  // Flags signalled by the external interface at frame level.
+  /*!
+   * Flags signalled by the external interface at frame level.
+   */
   ExternalFlags ext_flags;
 
+  /*!
+   * Temporary frame buffer used to store the non-loop filtered reconstructed
+   * frame during the search of loop filter level.
+   */
   YV12_BUFFER_CONFIG last_frame_uf;
+
+  /*!
+   * Temporary frame buffer used to store the loop restored frame during loop
+   * restoration search.
+   */
   YV12_BUFFER_CONFIG trial_frame_rst;
 
-  // Ambient reconstruction err target for force key frames
+  /*!
+   * Ambient reconstruction err target for force key frames.
+   */
   int64_t ambient_err;
 
+  /*!
+   * Parameters related to rate distortion optimization.
+   */
   RD_OPT rd;
 
+  /*!
+   * Temporary coding context used to save and restore when encoding with and
+   * without super-resolution.
+   */
   CODING_CONTEXT coding_context;
 
-  // Parameters related to global motion search.
+  /*!
+   * Parameters related to global motion search.
+   */
   GlobalMotionInfo gm_info;
 
-  // Parameters related to winner mode processing.
+  /*!
+   * Parameters related to winner mode processing.
+   */
   WinnerModeParams winner_mode_params;
 
-  // Frame time stamps
+  /*!
+   * Frame time stamps.
+   */
   TimeStamps time_stamps;
 
+  /*!
+   * Rate control related parameters.
+   */
   RATE_CONTROL rc;
+
+  /*!
+   * Frame rate of the video.
+   */
   double framerate;
 
+  /*!
+   * Pointer to internal utility functions that manipulate aom_codec_* data
+   * structures.
+   */
   struct aom_codec_pkt_list *output_pkt_list;
 
+  /*!
+   * Bitmask indicating which reference buffers may be referenced by this frame.
+   */
   int ref_frame_flags;
 
-  // speed is passed as a per-frame parameter into the encoder
+  /*!
+   * speed is passed as a per-frame parameter into the encoder.
+   */
   int speed;
-  // sf contains fine-grained config set internally based on speed
+
+  /*!
+   * sf contains fine-grained config set internally based on speed.
+   */
   SPEED_FEATURES sf;
 
-  // Parameters for motion vector search process.
+  /*!
+   * Parameters for motion vector search process.
+   */
   MotionVectorSearchParams mv_search_params;
 
+  /*!
+   * When set, indicates that all reference frames are forward references,
+   * i.e., all the reference frames are output before the current frame.
+   */
   int all_one_sided_refs;
 
-  // Segmentation related information for current frame.
+  /*!
+   * Segmentation related information for current frame.
+   */
   EncSegmentationInfo enc_seg;
 
+  /*!
+   * Parameters related to cyclic refresh aq-mode.
+   */
   CYCLIC_REFRESH *cyclic_refresh;
+  /*!
+   * Parameters related to active map. Active maps indicate
+   * if there is any activity on a 4x4 block basis.
+   */
   ActiveMap active_map;
 
+  /*!
+   * Function pointers to variants of sse/sad/variance computation functions.
+   * fn_ptr[i] indicates the list of function pointers corresponding to block
+   * size i.
+   */
   aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
 
-#if CONFIG_INTERNAL_STATS
-  uint64_t time_receive_data;
-  uint64_t time_compress_data;
-#endif
-
-  // number of show frames encoded in current gf_group
-  int num_gf_group_show_frames;
-
+  /*!
+   * Information related to two pass encoding.
+   */
   TWO_PASS twopass;
 
+  /*!
+   * Information related to a gf group.
+   */
   GF_GROUP gf_group;
 
-  // To control the reference frame buffer and selection.
+  /*!
+   * Track prior gf group state.
+   */
+  GF_STATE gf_state;
+
+  /*!
+   * To control the reference frame buffer and selection.
+   */
   RefBufferStack ref_buffer_stack;
 
+  /*!
+   * Frame buffer holding the temporally filtered source frame. It can be KEY
+   * frame or ARF frame.
+   */
   YV12_BUFFER_CONFIG alt_ref_buffer;
 
-  // Tell if OVERLAY frame shows existing alt_ref frame.
+  /*!
+   * Tell if OVERLAY frame shows existing alt_ref frame.
+   */
   int show_existing_alt_ref;
 
 #if CONFIG_INTERNAL_STATS
+  /*!\cond */
+  uint64_t time_receive_data;
+  uint64_t time_compress_data;
+
   unsigned int mode_chosen_counts[MAX_MODES];
 
-  int count;
-  uint64_t total_sq_error;
-  uint64_t total_samples;
-  ImageStat psnr;
+  int count[2];
+  uint64_t total_sq_error[2];
+  uint64_t total_samples[2];
+  ImageStat psnr[2];
 
   double total_blockiness;
   double worst_blockiness;
@@ -1208,8 +2324,11 @@
   int bytes;
   double summed_quality;
   double summed_weights;
+  double summed_quality_hbd;
+  double summed_weights_hbd;
   unsigned int tot_recode_hits;
   double worst_ssim;
+  double worst_ssim_hbd;
 
   ImageStat fastssim;
   ImageStat psnrhvs;
@@ -1221,172 +2340,347 @@
   double worst_consistency;
   Ssimv *ssim_vars;
   Metrics metrics;
+  /*!\endcond */
 #endif
+
+  /*!
+   * Calculates PSNR on each frame when set to 1.
+   */
   int b_calculate_psnr;
+
 #if CONFIG_SPEED_STATS
+  /*!
+   * For debugging: number of transform searches we have performed.
+   */
   unsigned int tx_search_count;
 #endif  // CONFIG_SPEED_STATS
 
+  /*!
+   * When set, indicates that the frame is droppable, i.e., this frame
+   * does not update any reference buffers.
+   */
   int droppable;
 
+  /*!
+   * Stores the frame parameters during encoder initialization.
+   */
   FRAME_INFO frame_info;
 
-  int initial_width;
-  int initial_height;
-  int initial_mbs;  // Number of MBs in the full-size frame; to be used to
-                    // normalize the firstpass stats. This will differ from the
-                    // number of MBs in the current frame when the frame is
-                    // scaled.
-  // Resize related parameters
+  /*!
+   * Structure to store the dimensions of current frame.
+   */
+  InitialDimensions initial_dimensions;
+
+  /*!
+   * Number of MBs in the full-size frame; to be used to
+   * normalize the firstpass stats. This will differ from the
+   * number of MBs in the current frame when the frame is
+   * scaled.
+   */
+  int initial_mbs;
+
+  /*!
+   * Resize related parameters.
+   */
   ResizePendingParams resize_pending_params;
 
+  /*!
+   * Pointer to struct holding adaptive data/contexts/models for the tile during
+   * encoding.
+   */
   TileDataEnc *tile_data;
-  int allocated_tiles;  // Keep track of memory allocated for tiles.
+  /*!
+   * Number of tiles for which memory has been allocated for tile_data.
+   */
+  int allocated_tiles;
 
-  TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
-  TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+  /*!
+   * Structure to store the palette token related information.
+   */
+  TokenInfo token_info;
 
-  // Sequence parameters have been transmitted already and locked
-  // or not. Once locked av1_change_config cannot change the seq
-  // parameters.
+  /*!
+   * Sequence parameters have been transmitted already and locked
+   * or not. Once locked av1_change_config cannot change the seq
+   * parameters.
+   */
   int seq_params_locked;
 
-  // VARIANCE_AQ segment map refresh
+  /*!
+   * VARIANCE_AQ segment map refresh.
+   */
   int vaq_refresh;
 
-  // Thresholds for variance based partitioning.
+  /*!
+   * Thresholds for variance based partitioning.
+   */
   VarBasedPartitionInfo vbp_info;
 
-  // Probabilities for pruning of various AV1 tools.
+  /*!
+   * Probabilities for pruning of various AV1 tools.
+   */
   FrameProbInfo frame_probs;
 
-  // Multi-threading
-  int num_workers;
-  AVxWorker *workers;
-  struct EncWorkerData *tile_thr_data;
+  /*!
+   * Multi-threading parameters.
+   */
+  MultiThreadInfo mt_info;
+
+  /*!
+   * Specifies the frame to be output. It is valid only if show_existing_frame
+   * is 1. When show_existing_frame is 0, existing_fb_idx_to_show is set to
+   * INVALID_IDX.
+   */
   int existing_fb_idx_to_show;
+
+  /*!
+   * When set, indicates that internal ARFs are enabled.
+   */
   int internal_altref_allowed;
-  // A flag to indicate if intrabc is ever used in current frame.
+
+  /*!
+   * A flag to indicate if intrabc is ever used in current frame.
+   */
   int intrabc_used;
 
-  // Tables to calculate IntraBC MV cost.
+  /*!
+   * Tables to calculate IntraBC MV cost.
+   */
   IntraBCMVCosts dv_costs;
 
-  // Mark which ref frames can be skipped for encoding current frame druing RDO.
+  /*!
+   * Mark which ref frames can be skipped for encoding current frame during RDO.
+   */
   int prune_ref_frame_mask;
 
-  AV1LfSync lf_row_sync;
-  AV1LrSync lr_row_sync;
+  /*!
+   * Loop Restoration context.
+   */
   AV1LrStruct lr_ctxt;
 
+  /*!
+   * Pointer to list of tables with film grain parameters.
+   */
   aom_film_grain_table_t *film_grain_table;
+
 #if CONFIG_DENOISE
+  /*!
+   * Pointer to structure holding the denoised image buffers and the helper
+   * noise models.
+   */
   struct aom_denoise_and_model_t *denoise_and_model;
 #endif
 
-  // Flags related to interpolation filter search.
+  /*!
+   * Flags related to interpolation filter search.
+   */
   InterpSearchFlags interp_search_flags;
 
-  MultiThreadHandle multi_thread_ctxt;
-  void (*row_mt_sync_read_ptr)(AV1RowMTSync *const, int, int);
-  void (*row_mt_sync_write_ptr)(AV1RowMTSync *const, int, int, const int);
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t *row_mt_mutex_;
-#endif
-  // Set if screen content is set or relevant tools are enabled
+  /*!
+   * Turn on screen content tools flag.
+   * Note that some videos are not screen content videos, but
+   * screen content tools could also improve coding efficiency.
+   * For example, videos with large flat regions, gaming videos that look
+   * like natural videos.
+   */
+  int use_screen_content_tools;
+
+  /*!
+   * A flag to indicate "real" screen content videos.
+   * For example, screen shares, screen editing.
+   * This type is true indicates |use_screen_content_tools| must be true.
+   * In addition, rate control strategy is adjusted when this flag is true.
+   */
   int is_screen_content_type;
+
 #if CONFIG_COLLECT_PARTITION_STATS == 2
   PartitionStats partition_stats;
 #endif
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  // component_time[] are initialized to zero while encoder starts.
+  /*!
+   * component_time[] are initialized to zero while encoder starts.
+   */
   uint64_t component_time[kTimingComponents];
   struct aom_usec_timer component_timer[kTimingComponents];
-  // frame_component_time[] are initialized to zero at beginning of each frame.
+  /*!
+   * frame_component_time[] are initialized to zero at beginning of each frame.
+   */
   uint64_t frame_component_time[kTimingComponents];
 #endif
 
-  // Parameters for AV1 bitstream levels.
+  /*!
+   * Parameters for AV1 bitstream levels.
+   */
   AV1LevelParams level_params;
 
-  // whether any no-zero delta_q was actually used
+  /*!
+   * Whether any no-zero delta_q was actually used.
+   */
   int deltaq_used;
 
-  // Indicates the true relative distance of ref frame w.r.t. current frame
-  int ref_relative_dist[INTER_REFS_PER_FRAME];
+  /*!
+   * Refrence frame distance related variables.
+   */
+  RefFrameDistanceInfo ref_frame_dist_info;
 
-  // Indicate nearest references w.r.t. current frame in past and future
-  int8_t nearest_past_ref;
-  int8_t nearest_future_ref;
-
-  // TODO(sdeng): consider merge the following arrays.
+  /*!
+   * Scaling factors used in the RD multiplier modulation.
+   * TODO(sdeng): consider merge the following arrays.
+   * tpl_rdmult_scaling_factors is a temporary buffer used to store the
+   * intermediate scaling factors which are used in the calculation of
+   * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the
+   * intermediate scaling factor of the ith 16 x 16 block in raster scan order.
+   */
   double *tpl_rdmult_scaling_factors;
+  /*!
+   * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+   * the ith 16 x 16 block in raster scan order.
+   */
   double *tpl_sb_rdmult_scaling_factors;
+  /*!
+   * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+   * the ith 16 x 16 block in raster scan order. This scaling factor is used for
+   * RD multiplier modulation when SSIM tuning is enabled.
+   */
   double *ssim_rdmult_scaling_factors;
 
 #if CONFIG_TUNE_VMAF
-  double *vmaf_rdmult_scaling_factors;
-  double last_frame_ysse;
-  double last_frame_vmaf;
-  double last_frame_unsharp_amount;
+  /*!
+   * Parameters for VMAF tuning.
+   */
+  TuneVMAFInfo vmaf_info;
 #endif
 
+  /*!
+   * Indicates whether to use SVC.
+   */
   int use_svc;
+  /*!
+   * Parameters for scalable video coding.
+   */
   SVC svc;
 
+  /*!
+   * Flag indicating whether look ahead processing (LAP) is enabled.
+   */
   int lap_enabled;
+  /*!
+   * Indicates whether current processing stage is encode stage or LAP stage.
+   */
   COMPRESSOR_STAGE compressor_stage;
 
-  // Some motion vector stats from the last encoded frame to help us decide what
-  // precision to use to encode the current frame.
+  /*!
+   * Some motion vector stats from the last encoded frame to help us decide what
+   * precision to use to encode the current frame.
+   */
   MV_STATS mv_stats;
 
-  // Frame type of the last frame. May be used in some heuristics for speeding
-  // up the encoding.
+  /*!
+   * Frame type of the last frame. May be used in some heuristics for speeding
+   * up the encoding.
+   */
   FRAME_TYPE last_frame_type;
+
+  /*!
+   * Number of tile-groups.
+   */
   int num_tg;
 
-  // Super-resolution mode currently being used by the encoder.
-  // This may / may not be same as user-supplied mode in oxcf->superres_mode
-  // (when we are recoding to try multiple options for example).
-  SUPERRES_MODE superres_mode;
+  /*!
+   * Super-resolution mode currently being used by the encoder.
+   * This may / may not be same as user-supplied mode in oxcf->superres_mode
+   * (when we are recoding to try multiple options for example).
+   */
+  aom_superres_mode superres_mode;
+
+  /*!
+   * First pass related data.
+   */
+  FirstPassData firstpass_data;
+
+  /*!
+   * Temporal Noise Estimate
+   */
+  NOISE_ESTIMATE noise_estimate;
+
+  /*!
+   * Count on how many consecutive times a block uses small/zeromv for encoding
+   * in a scale of 8x8 block.
+   */
+  uint8_t *consec_zero_mv;
+
+  /*!
+   * Number of frames left to be encoded, is 0 if limit is not set.
+   */
+  int frames_left;
 } AV1_COMP;
 
-typedef struct {
+/*!
+ * \brief Input frames and last input frame
+ */
+typedef struct EncodeFrameInput {
+  /*!\cond */
   YV12_BUFFER_CONFIG *source;
   YV12_BUFFER_CONFIG *last_source;
   int64_t ts_duration;
+  /*!\endcond */
 } EncodeFrameInput;
 
-// EncodeFrameParams contains per-frame encoding parameters decided upon by
-// av1_encode_strategy() and passed down to av1_encode()
-struct EncodeFrameParams {
+/*!
+ * \brief contains per-frame encoding parameters decided upon by
+ * av1_encode_strategy() and passed down to av1_encode().
+ */
+typedef struct EncodeFrameParams {
+  /*!
+   * Is error resilient mode enabled
+   */
   int error_resilient_mode;
+  /*!
+   * Frame type (eg KF vs inter frame etc)
+   */
   FRAME_TYPE frame_type;
+
+  /*!\cond */
   int primary_ref_frame;
   int order_offset;
+
+  /*!\endcond */
+  /*!
+   * Should the current frame be displayed after being decoded
+   */
   int show_frame;
+
+  /*!\cond */
   int refresh_frame_flags;
 
   int show_existing_frame;
   int existing_fb_idx_to_show;
 
-  // Bitmask of which reference buffers may be referenced by this frame
+  /*!\endcond */
+  /*!
+   *  Bitmask of which reference buffers may be referenced by this frame.
+   */
   int ref_frame_flags;
 
-  // Reference buffer assignment for this frame.
+  /*!
+   *  Reference buffer assignment for this frame.
+   */
   int remapped_ref_idx[REF_FRAMES];
 
-  // Flags which determine which reference buffers are refreshed by this frame
-  int refresh_golden_frame;
-  int refresh_bwd_ref_frame;
-  int refresh_alt_ref_frame;
+  /*!
+   *  Flags which determine which reference buffers are refreshed by this
+   *  frame.
+   */
+  RefreshFrameFlagsInfo refresh_frame;
 
-  // Speed level to use for this frame: Bigger number means faster.
+  /*!
+   *  Speed level to use for this frame: Bigger number means faster.
+   */
   int speed;
-};
-typedef struct EncodeFrameParams EncodeFrameParams;
+} EncodeFrameParams;
+
+/*!\cond */
 
 // EncodeFrameResults contains information about the result of encoding a
 // single frame
@@ -1411,22 +2705,72 @@
 void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
                              int subsampling_x, int subsampling_y);
 
-// receive a frames worth of data. caller can assume that a copy of this
-// frame is made and not just a copy of the pointer..
+void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
+                               const AV1EncoderConfig *oxcf, int use_svc);
+
+/*!\endcond */
+
+/*!\brief Obtain the raw frame data
+ *
+ * \ingroup high_level_algo
+ * This function receives the raw frame data from input.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    frame_flags    Flags to decide how to encoding the frame
+ * \param[in]    sd             Contain raw frame data
+ * \param[in]    time_stamp     Time stamp of the frame
+ * \param[in]    end_time_stamp End time stamp
+ *
+ * \return Returns a value to indicate if the frame data is received
+ * successfully.
+ * \note The caller can assume that a copy of this frame is made and not just a
+ * copy of the pointer.
+ */
 int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time_stamp);
 
+/*!\brief Encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function encodes the raw frame data, and outputs the frame bit stream
+ * to the designated buffer. The caller should use the output parameters
+ * *time_stamp and *time_end only when this function returns AOM_CODEC_OK.
+ *
+ * \param[in]    cpi         Top-level encoder structure
+ * \param[in]    frame_flags Flags to decide how to encoding the frame
+ * \param[in]    size        Bitstream size
+ * \param[in]    dest        Bitstream output
+ * \param[out]   time_stamp  Time stamp of the frame
+ * \param[out]   time_end    Time end
+ * \param[in]    flush       Decide to encode one frame or the rest of frames
+ * \param[in]    timebase    Time base used
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ *     No frame encoded; more input is required.
+ * \retval #AOM_CODEC_ERROR
+ */
 int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
                             int64_t *time_end, int flush,
                             const aom_rational64_t *timebase);
 
+/*!\brief Run 1-pass/2-pass encoding
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ */
 int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
                const EncodeFrameInput *const frame_input,
                const EncodeFrameParams *const frame_params,
                EncodeFrameResults *const frame_results);
 
+/*!\cond */
 int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
 
 int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
@@ -1445,9 +2789,6 @@
 
 void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
 
-int av1_update_entropy(bool *ext_refresh_frame_context,
-                       bool *ext_refresh_frame_context_pending, bool update);
-
 int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
 
 int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
@@ -1460,20 +2801,18 @@
 
 int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
 
-void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
-                                        CompoundTypeRdBuffers *const bufs);
-void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs);
-
 // Set screen content options.
 // This function estimates whether to use screen content tools, by counting
 // the portion of blocks that have few luma colors.
 // Modifies:
-//   cpi->commom.allow_screen_content_tools
-//   cpi->common.allow_intrabc
+//   cpi->commom.features.allow_screen_content_tools
+//   cpi->common.features.allow_intrabc
+//   cpi->use_screen_content_tools
+//   cpi->is_screen_content_type
 // However, the estimation is not accurate and may misclassify videos.
 // A slower but more accurate approach that determines whether to use screen
-// content tools is employed later. See determine_sc_tools_with_encoding().
-void av1_set_screen_content_options(const struct AV1_COMP *cpi,
+// content tools is employed later. See av1_determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(struct AV1_COMP *cpi,
                                     FeatureFlags *features);
 
 // TODO(jingning): Move these functions as primitive members for the new cpi
@@ -1561,23 +2900,6 @@
   buf->height = cm->height;
 }
 
-// Token buffer is only used for palette tokens.
-static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
-                                           int sb_size_log2,
-                                           const int num_planes) {
-  // Calculate the maximum number of max superblocks in the image.
-  const int shift = sb_size_log2 - 4;
-  const int sb_size = 1 << sb_size_log2;
-  const int sb_size_square = sb_size * sb_size;
-  const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift;
-  const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift;
-
-  // One palette token for each pixel. There can be palettes on two planes.
-  const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
-
-  return sb_rows * sb_cols * sb_palette_toks;
-}
-
 // Get the allocated token size for a tile. It does the same calculation as in
 // the frame token allocation.
 static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
@@ -1589,7 +2911,7 @@
 }
 
 static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
-                                 int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+                                 int mi_row, TokenExtra **tok, int sb_size_log2,
                                  int num_planes) {
   AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
@@ -1600,15 +2922,15 @@
       (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
   const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
 
-  *tok = cpi->tile_tok[tile_row][tile_col] +
+  *tok = cpi->token_info.tile_tok[tile_row][tile_col] +
          get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
 }
 
 void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
 
 #define ALT_MIN_LAG 3
-static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
-  return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf;
+static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) {
+  return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf;
 }
 
 // Check if statistics generation stage
@@ -1629,11 +2951,20 @@
            cpi->lap_enabled));
 }
 
-// Check if the current stage has statistics
+/*!\endcond */
+/*!\brief Check if the current stage has statistics
+ *
+ *\ingroup two_pass_algo
+ *
+ * \param[in]    cpi     Top - level encoder instance structure
+ *
+ * \return 0 if no stats for current stage else 1
+ */
 static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
   assert(IMPLIES(!cpi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
   return (cpi->oxcf.pass == 0 && !cpi->lap_enabled);
 }
+/*!\cond */
 
 // Function return size of frame stats buffer
 static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
@@ -1641,7 +2972,7 @@
   return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer);
 }
 
-// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+// TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf
 
 static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
@@ -1758,14 +3089,6 @@
   GOLDEN_FRAME,
 };
 
-static INLINE int get_max_allowed_ref_frames(const AV1_COMP *cpi) {
-  const unsigned int max_allowed_refs_for_given_speed =
-      (cpi->sf.inter_sf.selective_ref_frame >= 3) ? INTER_REFS_PER_FRAME - 1
-                                                  : INTER_REFS_PER_FRAME;
-  return AOMMIN(max_allowed_refs_for_given_speed,
-                cpi->oxcf.max_reference_frames);
-}
-
 static const MV_REFERENCE_FRAME
     ref_frame_priority_order[INTER_REFS_PER_FRAME] = {
       LAST_FRAME,    ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME,
@@ -1800,40 +3123,6 @@
   return flags;
 }
 
-// Enforce the number of references for each arbitrary frame based on user
-// options and speed.
-static AOM_INLINE void enforce_max_ref_frames(AV1_COMP *cpi,
-                                              int *ref_frame_flags) {
-  MV_REFERENCE_FRAME ref_frame;
-  int total_valid_refs = 0;
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
-      total_valid_refs++;
-    }
-  }
-
-  const int max_allowed_refs = get_max_allowed_ref_frames(cpi);
-
-  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
-    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
-
-    if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) {
-      continue;
-    }
-
-    switch (ref_frame_to_disable) {
-      case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break;
-      case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break;
-      case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break;
-      case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break;
-      default: assert(0);
-    }
-    --total_valid_refs;
-  }
-  assert(total_valid_refs <= max_allowed_refs);
-}
-
 // Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
 // failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
 // function, the memory must be freed by the caller. Both the buf member of the
@@ -1847,31 +3136,20 @@
 
 #define MAX_GFUBOOST_FACTOR 10.0
 #define MIN_GFUBOOST_FACTOR 4.0
-double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor,
-                                           int frame_count);
-double av1_get_kf_boost_projection_factor(int frame_count);
 
-#define ENABLE_KF_TPL 1
-#define MAX_PYR_LEVEL_FROMTOP_DELTAQ 0
-
-static INLINE int is_frame_kf_and_tpl_eligible(AV1_COMP *const cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  return (cm->current_frame.frame_type == KEY_FRAME) && cm->show_frame &&
-         (cpi->rc.frames_to_key > 1);
+static INLINE int is_frame_tpl_eligible(const GF_GROUP *const gf_group,
+                                        uint8_t index) {
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[index];
+  return update_type == ARF_UPDATE || update_type == GF_UPDATE ||
+         update_type == KF_UPDATE;
 }
 
-static INLINE int is_frame_arf_and_tpl_eligible(const GF_GROUP *gf_group) {
-  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
-  return update_type == ARF_UPDATE || update_type == GF_UPDATE;
-}
-
-static INLINE int is_frame_tpl_eligible(AV1_COMP *const cpi) {
-#if ENABLE_KF_TPL
-  return is_frame_kf_and_tpl_eligible(cpi) ||
-         is_frame_arf_and_tpl_eligible(&cpi->gf_group);
-#else
-  return is_frame_arf_and_tpl_eligible(&cpi->gf_group);
-#endif  // ENABLE_KF_TPL
+static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group,
+                                                    int selective_ref_frame,
+                                                    int prune_ref_frames,
+                                                    int gf_index) {
+  return (selective_ref_frame > 0) && (prune_ref_frames > 0) &&
+         !is_frame_tpl_eligible(gf_group, gf_index);
 }
 
 // Get update type of the current frame.
@@ -1884,6 +3162,13 @@
   return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2;
 }
 
+static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  return cpi->b_calculate_psnr && !is_stat_generation_stage(cpi) &&
+         cm->show_frame;
+}
+
 #if CONFIG_COLLECT_PARTITION_STATS == 2
 static INLINE void av1_print_partition_stats(PartitionStats *part_stats) {
   FILE *f = fopen("partition_stats.csv", "w");
@@ -1958,6 +3243,8 @@
 }
 #endif
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
new file mode 100644
index 0000000..32eeb6b
--- /dev/null
+++ b/av1/encoder/encoder_alloc.h
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+#define AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static AOM_INLINE void dealloc_context_buffers_ext(
+    MBMIExtFrameBufferInfo *mbmi_ext_info) {
+  if (mbmi_ext_info->frame_base) {
+    aom_free(mbmi_ext_info->frame_base);
+    mbmi_ext_info->frame_base = NULL;
+    mbmi_ext_info->alloc_size = 0;
+  }
+}
+
+static AOM_INLINE void alloc_context_buffers_ext(
+    AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int mi_alloc_rows =
+      (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int mi_alloc_cols =
+      (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols;
+
+  if (new_ext_mi_size > mbmi_ext_info->alloc_size) {
+    dealloc_context_buffers_ext(mbmi_ext_info);
+    CHECK_MEM_ERROR(
+        cm, mbmi_ext_info->frame_base,
+        aom_calloc(new_ext_mi_size, sizeof(*mbmi_ext_info->frame_base)));
+    mbmi_ext_info->alloc_size = new_ext_mi_size;
+  }
+  // The stride needs to be updated regardless of whether new allocation
+  // happened or not.
+  mbmi_ext_info->stride = mi_alloc_cols;
+}
+
+static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  TokenInfo *token_info = &cpi->token_info;
+
+  if (av1_alloc_context_buffers(cm, cm->width, cm->height)) {
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  }
+
+  if (!is_stat_generation_stage(cpi)) {
+    av1_alloc_txb_buf(cpi);
+
+    alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
+  }
+
+  free_token_info(token_info);
+
+  if (!is_stat_generation_stage(cpi)) {
+    alloc_token_info(cm, token_info);
+  }
+
+  av1_setup_shared_coeff_buffer(&cpi->common, &cpi->td.shared_coeff_buf);
+  av1_setup_sms_tree(cpi, &cpi->td);
+  cpi->td.firstpass_ctx =
+      av1_alloc_pmc(cm, BLOCK_16X16, &cpi->td.shared_coeff_buf);
+}
+
+static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  // Create the encoder segmentation map and set all entries to 0
+  aom_free(cpi->enc_seg.map);
+  CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
+                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+
+  // Create a map used for cyclic background refresh.
+  if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  CHECK_MEM_ERROR(
+      cm, cpi->cyclic_refresh,
+      av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
+
+  // Create a map used to mark inactive areas.
+  aom_free(cpi->active_map.map);
+  CHECK_MEM_ERROR(cm, cpi->active_map.map,
+                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+}
+
+static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2,
+                                                uint8_t *tpl_bsize_1d) {
+  // tpl stats bsize: 2 means 16x16
+  *block_mis_log2 = 2;
+  // Block size used in tpl motion estimation
+  *tpl_bsize_1d = 16;
+  // MIN_TPL_BSIZE_1D = 16;
+  assert(*tpl_bsize_1d >= 16);
+}
+
+static AOM_INLINE void setup_tpl_buffers(AV1_COMMON *const cm,
+                                         TplParams *const tpl_data,
+                                         int lag_in_frames) {
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+                           &tpl_data->tpl_bsize_1d);
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  tpl_data->border_in_pixels =
+      ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5);
+
+  for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
+    const int mi_cols =
+        ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
+    const int mi_rows =
+        ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+
+    tpl_data->tpl_stats_buffer[frame].is_valid = 0;
+    tpl_data->tpl_stats_buffer[frame].width = mi_cols >> block_mis_log2;
+    tpl_data->tpl_stats_buffer[frame].height = mi_rows >> block_mis_log2;
+    tpl_data->tpl_stats_buffer[frame].stride =
+        tpl_data->tpl_stats_buffer[frame].width;
+    tpl_data->tpl_stats_buffer[frame].mi_rows = mi_params->mi_rows;
+    tpl_data->tpl_stats_buffer[frame].mi_cols = mi_params->mi_cols;
+  }
+  tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
+
+  // If lag_in_frames <= 1, TPL module is not invoked. Hence tpl recon and
+  // stats buffers are not allocated.
+  if (lag_in_frames <= 1) return;
+
+  // TODO(aomedia:2873): Explore the allocation of tpl buffers based on
+  // lag_in_frames.
+  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+    CHECK_MEM_ERROR(
+        cm, tpl_data->tpl_stats_pool[frame],
+        aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
+                       tpl_data->tpl_stats_buffer[frame].height,
+                   sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
+    if (aom_alloc_frame_buffer(
+            &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
+            cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+            cm->seq_params.use_highbitdepth, tpl_data->border_in_pixels,
+            cm->features.byte_alignment))
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate frame buffer");
+  }
+}
+
+static AOM_INLINE void alloc_obmc_buffers(OBMCBuffer *obmc_buffer,
+                                          AV1_COMMON *cm) {
+  CHECK_MEM_ERROR(
+      cm, obmc_buffer->wsrc,
+      (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc)));
+  CHECK_MEM_ERROR(
+      cm, obmc_buffer->mask,
+      (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->mask)));
+  CHECK_MEM_ERROR(
+      cm, obmc_buffer->above_pred,
+      (uint8_t *)aom_memalign(
+          16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred)));
+  CHECK_MEM_ERROR(
+      cm, obmc_buffer->left_pred,
+      (uint8_t *)aom_memalign(
+          16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred)));
+}
+
+static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) {
+  aom_free(obmc_buffer->mask);
+  aom_free(obmc_buffer->above_pred);
+  aom_free(obmc_buffer->left_pred);
+  aom_free(obmc_buffer->wsrc);
+
+  obmc_buffer->mask = NULL;
+  obmc_buffer->above_pred = NULL;
+  obmc_buffer->left_pred = NULL;
+  obmc_buffer->wsrc = NULL;
+}
+
+static AOM_INLINE void alloc_compound_type_rd_buffers(
+    AV1_COMMON *const cm, CompoundTypeRdBuffers *const bufs) {
+  CHECK_MEM_ERROR(
+      cm, bufs->pred0,
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+  CHECK_MEM_ERROR(
+      cm, bufs->pred1,
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+  CHECK_MEM_ERROR(
+      cm, bufs->residual1,
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+  CHECK_MEM_ERROR(
+      cm, bufs->diff10,
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+  CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+                  (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+                                        sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+static AOM_INLINE void release_compound_type_rd_buffers(
+    CompoundTypeRdBuffers *const bufs) {
+  aom_free(bufs->pred0);
+  aom_free(bufs->pred1);
+  aom_free(bufs->residual1);
+  aom_free(bufs->diff10);
+  aom_free(bufs->tmp_best_mask_buf);
+  av1_zero(*bufs);  // Set all pointers to NULL for safety.
+}
+
+static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  TokenInfo *token_info = &cpi->token_info;
+
+  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
+
+  aom_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
+  // Delete sementation map
+  aom_free(cpi->enc_seg.map);
+  cpi->enc_seg.map = NULL;
+
+  av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  aom_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  aom_free(cpi->ssim_rdmult_scaling_factors);
+  cpi->ssim_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_rdmult_scaling_factors);
+  cpi->tpl_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_sb_rdmult_scaling_factors);
+  cpi->tpl_sb_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+  aom_free(cpi->vmaf_info.rdmult_scaling_factors);
+  cpi->vmaf_info.rdmult_scaling_factors = NULL;
+
+#if CONFIG_USE_VMAF_RC
+  aom_close_vmaf_model_rc(cpi->vmaf_info.vmaf_model);
+#endif
+#endif
+
+  release_obmc_buffers(&cpi->td.mb.obmc_buffer);
+
+  aom_free(cpi->td.mb.inter_modes_info);
+  cpi->td.mb.inter_modes_info = NULL;
+
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++) {
+      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
+    }
+
+  aom_free(cm->tpl_mvs);
+  cm->tpl_mvs = NULL;
+
+  if (cpi->td.vt64x64) {
+    aom_free(cpi->td.vt64x64);
+    cpi->td.vt64x64 = NULL;
+  }
+
+  av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+  cpi->td.firstpass_ctx = NULL;
+
+  av1_free_ref_frame_buffers(cm->buffer_pool);
+  av1_free_txb_buf(cpi);
+  av1_free_context_buffers(cm);
+
+  aom_free_frame_buffer(&cpi->last_frame_uf);
+#if !CONFIG_REALTIME_ONLY
+  av1_free_restoration_buffers(cm);
+#endif
+  aom_free_frame_buffer(&cpi->trial_frame_rst);
+  aom_free_frame_buffer(&cpi->scaled_source);
+  aom_free_frame_buffer(&cpi->scaled_last_source);
+  aom_free_frame_buffer(&cpi->alt_ref_buffer);
+  av1_lookahead_destroy(cpi->lookahead);
+
+  free_token_info(token_info);
+
+  av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+  av1_free_sms_tree(&cpi->td);
+
+  aom_free(cpi->td.mb.palette_buffer);
+  release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
+  aom_free(cpi->td.mb.tmp_conv_dst);
+  for (int j = 0; j < 2; ++j) {
+    aom_free(cpi->td.mb.tmp_pred_bufs[j]);
+  }
+
+#if CONFIG_DENOISE
+  if (cpi->denoise_and_model) {
+    aom_denoise_and_model_free(cpi->denoise_and_model);
+    cpi->denoise_and_model = NULL;
+  }
+#endif
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
+  }
+
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    aom_free(cpi->level_params.level_info[i]);
+  }
+
+  if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi);
+
+  if (cpi->consec_zero_mv) {
+    aom_free(cpi->consec_zero_mv);
+    cpi->consec_zero_mv = NULL;
+  }
+}
+
+static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_64x64_blocks = (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+  if (cpi->td.vt64x64) {
+    if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
+      aom_free(cpi->td.vt64x64);
+      cpi->td.vt64x64 = NULL;
+    }
+  }
+  if (!cpi->td.vt64x64) {
+    CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
+                    aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
+    cpi->td.num_64x64_blocks = num_64x64_blocks;
+  }
+}
+
+static AOM_INLINE void alloc_altref_frame_buffer(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+
+  // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+  if (aom_realloc_frame_buffer(
+          &cpi->alt_ref_buffer, oxcf->frm_dim_cfg.width,
+          oxcf->frm_dim_cfg.height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+          NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate altref buffer");
+}
+
+static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int byte_alignment = cm->features.byte_alignment;
+  if (aom_realloc_frame_buffer(
+          &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate last frame buffer");
+
+  if (aom_realloc_frame_buffer(
+          &cpi->trial_frame_rst, cm->superres_upscaled_width,
+          cm->superres_upscaled_height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate trial restored frame buffer");
+
+  if (aom_realloc_frame_buffer(
+          &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled source buffer");
+
+  if (aom_realloc_frame_buffer(
+          &cpi->scaled_last_source, cm->width, cm->height,
+          seq_params->subsampling_x, seq_params->subsampling_y,
+          seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+          byte_alignment, NULL, NULL, NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled last source buffer");
+}
+
+static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source(
+    AV1_COMP *cpi, int scaled_width, int scaled_height) {
+  AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  if (scaled_width == cpi->unscaled_source->y_crop_width &&
+      scaled_height == cpi->unscaled_source->y_crop_height) {
+    return cpi->unscaled_source;
+  }
+
+  if (aom_realloc_frame_buffer(
+          &cpi->scaled_source, scaled_width, scaled_height,
+          cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+          cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->features.byte_alignment, NULL, NULL, NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to reallocate scaled source buffer");
+  assert(cpi->scaled_source.y_crop_width == scaled_width);
+  assert(cpi->scaled_source.y_crop_height == scaled_height);
+  av1_resize_and_extend_frame_nonnormative(
+      cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params.bit_depth,
+      num_planes);
+  return &cpi->scaled_source;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODER_ALLOC_H_
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
new file mode 100644
index 0000000..00700a7
--- /dev/null
+++ b/av1/encoder/encoder_utils.c
@@ -0,0 +1,1348 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define MIN_BOOST_COMBINE_FACTOR 4.0
+#define MAX_BOOST_COMBINE_FACTOR 12.0
+
+const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = {
+  { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 },
+    { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 },
+    { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 },
+    { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 },
+    { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 },
+    { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 },
+    { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 },
+    { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 },
+    { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 },
+    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 },
+    { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 },
+    { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 },
+    { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 },
+    { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 },
+    { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 },
+    { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 },
+    { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 },
+    { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+  { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 },
+    { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 },
+    { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 },
+    { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 },
+    { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 },
+    { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 },
+    { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 },
+    { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 },
+    { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 },
+    { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 },
+    { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 },
+    { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 },
+    { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 },
+    { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 },
+    { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 },
+    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 },
+    { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 },
+    { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 },
+    { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+  { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 },
+    { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 },
+    { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 },
+    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 },
+    { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 },
+    { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 },
+    { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 },
+    { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 },
+    { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 },
+    { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 },
+    { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 },
+    { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
+};
+
+const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  106, 90, 90, 97, 67, 59, 70, 28,
+    30, 38, 16, 16,  16, 0,  0,  44, 50, 26, 25 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  98, 93, 97, 68, 82, 85, 33, 30,
+    33, 16, 16, 16, 16, 0,  0,  43, 37, 26, 16 },
+  { 0,  0,  0,  91, 80, 76, 78, 55, 49, 24, 16,
+    16, 16, 16, 16, 16, 0,  0,  29, 45, 16, 38 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  103, 89, 89, 89, 62, 63, 76, 34,
+    35, 32, 19, 16,  16, 0,  0,  49, 55, 29, 19 }
+};
+
+const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64,
+                                                       64, 64, 64 };
+
+// TODO(yunqing): the default probs can be trained later from better
+// performance.
+const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+                                         [SWITCHABLE_FILTER_CONTEXTS]
+                                         [SWITCHABLE_FILTERS] = {
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } }
+                                         };
+
+static void configure_static_seg_features(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  struct segmentation *const seg = &cm->seg;
+
+  int high_q = (int)(rc->avg_q > 48.0);
+  int qi_delta;
+
+  // Disable and clear down for KF
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    // Clear down the global segmentation map
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+
+    // Disable segmentation
+    av1_disable_segmentation(seg);
+
+    // Clear down the segment features.
+    av1_clearall_segfeatures(seg);
+  } else if (cpi->refresh_frame.alt_ref_frame) {
+    // If this is an alt ref frame
+    // Clear down the global segmentation map
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+
+    // Disable segmentation and individual segment features by default
+    av1_disable_segmentation(seg);
+    av1_clearall_segfeatures(seg);
+
+    // If segmentation was enabled set those features needed for the
+    // arf itself.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+
+      qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+                                    cm->seq_params.bit_depth);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+    }
+  } else if (seg->enabled) {
+    // All other frames if segmentation has been enabled
+
+    // First normal frame in a valid gf or alt ref group
+    if (rc->frames_since_golden == 0) {
+      // Set up segment features for normal frames in an arf group
+      // Disable segmentation and clear down features if alt ref
+      // is not active for this group
+
+      av1_disable_segmentation(seg);
+
+      memset(cpi->enc_seg.map, 0,
+             cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+
+      seg->update_map = 0;
+      seg->update_data = 0;
+
+      av1_clearall_segfeatures(seg);
+    } else if (rc->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame.
+      // Segment coding disabled for compred testing
+
+      // Enable ref frame features for segment 0 as well
+      av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+      av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+      // All mbs should use ALTREF_FRAME
+      av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+      av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+      av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+      av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+      // Skip all MBs if high Q (0,0 mv and skip coeffs)
+      if (high_q) {
+        av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+        av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+      }
+      // Enable data update
+      seg->update_data = 1;
+    } else {
+      // All other frames.
+
+      // No updates.. leave things as they are.
+      seg->update_map = 0;
+      seg->update_data = 0;
+    }
+  }
+}
+
+void av1_apply_active_map(AV1_COMP *cpi) {
+  struct segmentation *const seg = &cpi->common.seg;
+  unsigned char *const seg_map = cpi->enc_seg.map;
+  const unsigned char *const active_map = cpi->active_map.map;
+  int i;
+
+  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+  if (frame_is_intra_only(&cpi->common)) {
+    cpi->active_map.enabled = 0;
+    cpi->active_map.update = 1;
+  }
+
+  if (cpi->active_map.update) {
+    if (cpi->active_map.enabled) {
+      for (i = 0;
+           i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+           ++i)
+        if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+      av1_enable_segmentation(seg);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
+                      -MAX_LOOP_FILTER);
+    } else {
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+      if (seg->enabled) {
+        seg->update_data = 1;
+        seg->update_map = 1;
+      }
+    }
+    cpi->active_map.update = 0;
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void process_tpl_stats_frame(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+
+  assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size));
+
+  const int tpl_idx = gf_group->index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+  if (tpl_frame->is_valid) {
+    int tpl_stride = tpl_frame->stride;
+    int64_t intra_cost_base = 0;
+    int64_t mc_dep_cost_base = 0;
+    const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+    const int row_step = step;
+    const int col_step_sr =
+        coded_to_superres_mi(step, cm->superres_scale_denominator);
+    const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+    for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) {
+      for (int col = 0; col < mi_cols_sr; col += col_step_sr) {
+        TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+            row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+        int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS);
+        mc_dep_cost_base +=
+            (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+      }
+    }
+
+    if (mc_dep_cost_base == 0) {
+      tpl_frame->is_valid = 0;
+    } else {
+      aom_clear_system_state();
+      cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
+      if (is_frame_tpl_eligible(gf_group, gf_group->index)) {
+        if (cpi->lap_enabled) {
+          double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval);
+          const int gfu_boost = get_gfu_boost_from_r0_lap(
+              min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0,
+              cpi->rc.num_stats_required_for_gfu_boost);
+          // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
+          //        gfu_boost);
+          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
+              min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost,
+              gfu_boost, cpi->rc.num_stats_used_for_gfu_boost);
+        } else {
+          const int gfu_boost = (int)(200.0 / cpi->rd.r0);
+          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
+              MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+              cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
+        }
+      }
+      aom_clear_system_state();
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+                                 int *top_index) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Setup variables that depend on the dimensions of the frame.
+  av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
+
+#if !CONFIG_REALTIME_ONLY
+  GF_GROUP *gf_group = &cpi->gf_group;
+  if (cpi->oxcf.algo_cfg.enable_tpl_model &&
+      is_frame_tpl_eligible(gf_group, gf_group->index)) {
+    process_tpl_stats_frame(cpi);
+    av1_tpl_rdmult_setup(cpi);
+  }
+#endif
+
+  // Decide q and q bounds.
+  *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height,
+                                cpi->gf_group.index, bottom_index, top_index);
+
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
+  // Only allowed in the second pass of a two pass encode, as it requires
+  // lagged coding, and if the relevant speed feature flag is set.
+  if (is_stat_consumption_stage_twopass(cpi) &&
+      cpi->sf.hl_sf.static_segmentation)
+    configure_static_seg_features(cpi);
+}
+
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+  pars->num_cr_points = 0;
+  pars->cr_mult = 0;
+  pars->cr_luma_mult = 0;
+  memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+  memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+  pars->num_cb_points = 0;
+  pars->cb_mult = 0;
+  pars->cb_luma_mult = 0;
+  pars->chroma_scaling_from_luma = 0;
+  memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+  memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+}
+
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+                                      const AV1EncoderConfig *oxcf) {
+  AV1_COMMON *const cm = &cpi->common;
+  cpi->oxcf = *oxcf;
+  const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    aom_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
+  }
+
+  if (tune_cfg->film_grain_test_vector) {
+    cm->seq_params.film_grain_params_present = 1;
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      memcpy(&cm->film_grain_params,
+             film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1,
+             sizeof(cm->film_grain_params));
+      if (oxcf->tool_cfg.enable_monochrome)
+        reset_film_grain_chroma_params(&cm->film_grain_params);
+      cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+      if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
+        cm->film_grain_params.clip_to_restricted_range = 0;
+      }
+    }
+  } else if (tune_cfg->film_grain_table_filename) {
+    cm->seq_params.film_grain_params_present = 1;
+
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
+
+    aom_film_grain_table_read(cpi->film_grain_table,
+                              tune_cfg->film_grain_table_filename, &cm->error);
+  } else {
+#if CONFIG_DENOISE
+    cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0);
+#else
+    cm->seq_params.film_grain_params_present = 0;
+#endif
+    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+  }
+}
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+                          const int phase, const int use_optimized_scaler) {
+  AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      BufferPool *const pool = cm->buffer_pool;
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+        continue;
+      }
+
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        // Replace the reference buffer with a copy having a thicker border,
+        // if the reference buffer is higher resolution than the current
+        // frame, and the border is thin.
+        if ((ref->y_crop_width > cm->width ||
+             ref->y_crop_height > cm->height) &&
+            ref->border < AOM_BORDER_IN_PIXELS) {
+          RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
+          if (aom_yv12_realloc_with_new_border(
+                  &ref_fb->buf, AOM_BORDER_IN_PIXELS,
+                  cm->features.byte_alignment, num_planes) != 0) {
+            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffer");
+          }
+        }
+        int force_scaling = 0;
+        RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
+        if (new_fb == NULL) {
+          const int new_fb_idx = get_free_fb(cm);
+          if (new_fb_idx == INVALID_IDX) {
+            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                               "Unable to find free frame buffer");
+          }
+          force_scaling = 1;
+          new_fb = &pool->frame_bufs[new_fb_idx];
+        }
+
+        if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
+            new_fb->buf.y_crop_height != cm->height) {
+          if (aom_realloc_frame_buffer(
+                  &new_fb->buf, cm->width, cm->height,
+                  cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+                  cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                  cm->features.byte_alignment, NULL, NULL, NULL)) {
+            if (force_scaling) {
+              // Release the reference acquired in the get_free_fb() call above.
+              --new_fb->ref_count;
+            }
+            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffer");
+          }
+#if CONFIG_AV1_HIGHBITDEPTH
+          if (use_optimized_scaler && cm->seq_params.bit_depth == AOM_BITS_8)
+            av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+                                        num_planes);
+          else
+            av1_resize_and_extend_frame_nonnormative(
+                ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
+#else
+          if (use_optimized_scaler)
+            av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+                                        num_planes);
+          else
+            av1_resize_and_extend_frame_nonnormative(
+                ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
+#endif
+          cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+      } else {
+        RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+        buf->buf.y_crop_width = ref->y_crop_width;
+        buf->buf.y_crop_height = ref->y_crop_height;
+        cpi->scaled_ref_buf[ref_frame - 1] = buf;
+        ++buf->ref_count;
+      }
+    } else {
+      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+    }
+  }
+}
+
+BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
+    return BLOCK_64X64;
+  if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
+    return BLOCK_128X128;
+
+  assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+  if (cpi->svc.number_spatial_layers > 1 ||
+      oxcf->resize_cfg.resize_mode != RESIZE_NONE) {
+    // Use the configured size (top resolution) for spatial layers or
+    // on resize.
+    return AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) > 480
+               ? BLOCK_128X128
+               : BLOCK_64X64;
+  }
+
+  // TODO(any): Possibly could improve this with a heuristic.
+  // When superres / resize is on, 'cm->width / height' can change between
+  // calls, so we don't apply this heuristic there.
+  // Things break if superblock size changes between the first pass and second
+  // pass encoding, which is why this heuristic is not configured as a
+  // speed-feature.
+  if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE &&
+      oxcf->resize_cfg.resize_mode == RESIZE_NONE && oxcf->speed >= 1) {
+    return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64;
+  }
+
+  return BLOCK_128X128;
+}
+
+void av1_setup_frame(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  // Set up entropy context depending on frame type. The decoder mandates
+  // the use of the default context, index 0, for keyframes and inter
+  // frames where the error_resilient_mode or intra_only flag is set. For
+  // other inter-frames the encoder currently uses only two contexts;
+  // context 1 for ALTREF frames and context 0 for the others.
+
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
+    av1_setup_past_independence(cm);
+  }
+
+  if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
+      frame_is_sframe(cm)) {
+    if (!cpi->seq_params_locked) {
+      set_sb_size(&cm->seq_params, av1_select_sb_size(cpi));
+    }
+  } else {
+    const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
+    if (primary_ref_buf == NULL) {
+      av1_setup_past_independence(cm);
+      cm->seg.update_map = 1;
+      cm->seg.update_data = 1;
+    } else {
+      *cm->fc = primary_ref_buf->frame_context;
+    }
+  }
+
+  av1_zero(cm->cur_frame->interp_filter_selected);
+  cm->prev_frame = get_primary_ref_frame_buf(cm);
+  cpi->vaq_refresh = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static int get_interp_filter_selected(const AV1_COMMON *const cm,
+                                      MV_REFERENCE_FRAME ref,
+                                      InterpFilter ifilter) {
+  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+  if (buf == NULL) return 0;
+  return buf->interp_filter_selected[ifilter];
+}
+
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int ref_total[REF_FRAMES] = { 0 };
+  uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;
+
+  if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_frame.alt_ref_frame)
+    return mask;
+
+  for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+         ++ifilter) {
+      ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
+    }
+  }
+  int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
+                         ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
+                         ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
+
+  for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+       ++ifilter) {
+    int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
+    if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
+      int filter_score =
+          get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
+          get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
+          get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
+      if (filter_score < ref_total_total) {
+        DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter;
+        reset_interp_filter_allowed_mask(&mask, filt_type);
+      }
+    }
+  }
+  return mask;
+}
+
+#define STRICT_PSNR_DIFF_THRESH 0.9
+// Encode key frame with/without screen content tools to determine whether
+// screen content tools should be enabled for this key frame group or not.
+// The first encoding is without screen content tools.
+// The second encoding is with screen content tools.
+// We compare the psnr and frame size to make the decision.
+static void screen_content_tools_determination(
+    AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
+    const int allow_intrabc_orig_decision,
+    const int use_screen_content_tools_orig_decision,
+    const int is_screen_content_type_orig_decision, const int pass,
+    int *projected_size_pass, PSNR_STATS *psnr) {
+  AV1_COMMON *const cm = &cpi->common;
+  FeatureFlags *const features = &cm->features;
+  projected_size_pass[pass] = cpi->rc.projected_frame_size;
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
+                       bit_depth, in_bit_depth);
+#else
+  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
+#endif
+  if (pass != 1) return;
+
+  const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
+  const int is_sc_encoding_much_better = psnr_diff > STRICT_PSNR_DIFF_THRESH;
+  if (is_sc_encoding_much_better) {
+    // Use screen content tools, if we get coding gain.
+    features->allow_screen_content_tools = 1;
+    features->allow_intrabc = cpi->intrabc_used;
+    cpi->use_screen_content_tools = 1;
+    cpi->is_screen_content_type = 1;
+  } else {
+    // Use original screen content decision.
+    features->allow_screen_content_tools =
+        allow_screen_content_tools_orig_decision;
+    features->allow_intrabc = allow_intrabc_orig_decision;
+    cpi->use_screen_content_tools = use_screen_content_tools_orig_decision;
+    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
+  }
+}
+
+// Set some encoding parameters to make the encoding process fast.
+// A fixed block partition size, and a large q is used.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+                                                   const int pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (pass == 0) {
+    // In the first pass, encode without screen content tools.
+    // Use a high q, and a fixed block size for fast encoding.
+    cm->features.allow_screen_content_tools = 0;
+    cm->features.allow_intrabc = 0;
+    cpi->use_screen_content_tools = 0;
+    cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+    cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+    return;
+  }
+  assert(pass == 1);
+  // In the second pass, encode with screen content tools.
+  // Use a high q, and a fixed block size for fast encoding.
+  cm->features.allow_screen_content_tools = 1;
+  // TODO(chengchen): turn intrabc on could lead to data race issue.
+  // cm->allow_intrabc = 1;
+  cpi->use_screen_content_tools = 1;
+  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+  cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->use_screen_content_tools".
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+  // Variables to help determine if we should allow screen content tools.
+  int projected_size_pass[3] = { 0 };
+  PSNR_STATS psnr[3];
+  const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
+  const int allow_screen_content_tools_orig_decision =
+      cm->features.allow_screen_content_tools;
+  const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
+  const int use_screen_content_tools_orig_decision =
+      cpi->use_screen_content_tools;
+  const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
+  // Turn off the encoding trial for forward key frame and superres.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode || oxcf->kf_cfg.fwd_kf_enabled ||
+      cpi->superres_mode != AOM_SUPERRES_NONE || oxcf->mode == REALTIME ||
+      use_screen_content_tools_orig_decision || !is_key_frame) {
+    return;
+  }
+
+  // TODO(chengchen): multiple encoding for the lossless mode is time consuming.
+  // Find a better way to determine whether screen content tools should be used
+  // for lossless coding.
+  // Use a high q and a fixed partition to do quick encoding.
+  const int q_for_screen_content_quick_run =
+      is_lossless_requested(&oxcf->rc_cfg) ? q_orig : AOMMAX(q_orig, 244);
+  const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
+  const BLOCK_SIZE fixed_partition_block_size_orig =
+      cpi->sf.part_sf.fixed_partition_size;
+
+  // Setup necessary params for encoding, including frame source, etc.
+  aom_clear_system_state();
+
+  cpi->source =
+      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
+                            cm->features.interp_filter, 0, false, false);
+  if (cpi->unscaled_last_source != NULL) {
+    cpi->last_source = av1_scale_if_required(
+        cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+        cm->features.interp_filter, 0, false, false);
+  }
+
+  av1_setup_frame(cpi);
+
+  if (cm->seg.enabled) {
+    if (!cm->seg.update_data && cm->prev_frame) {
+      segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+      cm->seg.enabled = cm->prev_frame->seg.enabled;
+    } else {
+      av1_calculate_segdata(&cm->seg);
+    }
+  } else {
+    memset(&cm->seg, 0, sizeof(cm->seg));
+  }
+  segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+  cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+  // The two encoding passes aim to help determine whether to use screen
+  // content tools, with a high q and fixed partition.
+  for (int pass = 0; pass < 2; ++pass) {
+    set_encoding_params_for_screen_content(cpi, pass);
+    av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel,
+                      q_for_screen_content_quick_run,
+                      q_cfg->enable_chroma_deltaq);
+    av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+    if (q_cfg->deltaq_mode != NO_DELTA_Q)
+      av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                         cm->seq_params.bit_depth);
+
+    av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
+                                          0);
+    // transform / motion compensation build reconstruction frame
+    av1_encode_frame(cpi);
+    // Screen content decision
+    screen_content_tools_determination(
+        cpi, allow_screen_content_tools_orig_decision,
+        allow_intrabc_orig_decision, use_screen_content_tools_orig_decision,
+        is_screen_content_type_orig_decision, pass, projected_size_pass, psnr);
+  }
+
+  // Set partition speed feature back.
+  cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
+  cpi->sf.part_sf.fixed_partition_size = fixed_partition_block_size_orig;
+}
+
+#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
+int av1_recode_loop_test_global_motion(WarpedMotionParams *const global_motion,
+                                       const int *const global_motion_used,
+                                       int *const gm_params_cost) {
+  int i;
+  int recode = 0;
+  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    if (global_motion[i].wmtype != IDENTITY &&
+        global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
+            gm_params_cost[i]) {
+      global_motion[i] = default_warp_params;
+      assert(global_motion[i].wmtype == IDENTITY);
+      gm_params_cost[i] = 0;
+      recode = 1;
+      // TODO(sarahparker): The earlier condition for recoding here was:
+      // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something
+      // similar to that back to speed up global motion?
+    }
+  }
+  return recode;
+}
+#endif  // CONFIG_REALTIME_ONLY
+
+static void fix_interp_filter(InterpFilter *const interp_filter,
+                              const FRAME_COUNTS *const counts) {
+  if (*interp_filter == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[SWITCHABLE_FILTERS] = { 0 };
+    int num_filters_used = 0;
+    for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+      for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+        count[i] += counts->switchable_interp[j][i];
+      num_filters_used += (count[i] > 0);
+    }
+    if (num_filters_used == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          if (i == EIGHTTAP_REGULAR) *interp_filter = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  if (!cm->seq_params.reduced_still_picture_hdr &&
+      encode_show_existing_frame(cm)) {
+    RefCntBuffer *const frame_to_show =
+        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+    if (frame_to_show == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Buffer does not contain a reconstructed frame");
+    }
+    assert(frame_to_show->ref_count > 0);
+    assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+  }
+
+  if (!encode_show_existing_frame(cm) &&
+      cm->seq_params.film_grain_params_present &&
+      (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to the its corresponding
+    // RefCntBuffer slot.
+    cm->cur_frame->film_grain_params = cm->film_grain_params;
+
+    // We must update the parameters if this is not an INTER_FRAME
+    if (current_frame->frame_type != INTER_FRAME)
+      cm->cur_frame->film_grain_params.update_parameters = 1;
+
+    // Iterate the random seed for the next frame.
+    cm->film_grain_params.random_seed += 3381;
+    if (cm->film_grain_params.random_seed == 0)
+      cm->film_grain_params.random_seed = 7391;
+  }
+
+  // Initialise all tiles' contexts from the global frame context
+  for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
+    for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
+      const int tile_idx = tile_row * cm->tiles.cols + tile_col;
+      cpi->tile_data[tile_idx].tctx = *cm->fc;
+    }
+  }
+
+  fix_interp_filter(&cm->features.interp_filter, cpi->td.counts);
+}
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+                      const YV12_BUFFER_CONFIG *last_picture,
+                      ForceIntegerMVInfo *const force_intpel_info) {
+  aom_clear_system_state();
+  // check use hash ME
+  int k;
+
+  const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE;
+  const double threshold_current = 0.8;
+  const double threshold_average = 0.95;
+  const int max_history_size = 32;
+  int T = 0;  // total block
+  int C = 0;  // match with collocated block
+  int S = 0;  // smooth region but not match with collocated block
+
+  const int pic_width = cur_picture->y_width;
+  const int pic_height = cur_picture->y_height;
+  for (int i = 0; i + block_size <= pic_height; i += block_size) {
+    for (int j = 0; j + block_size <= pic_width; j += block_size) {
+      const int x_pos = j;
+      const int y_pos = i;
+      int match = 1;
+      T++;
+
+      // check whether collocated block match with current
+      uint8_t *p_cur = cur_picture->y_buffer;
+      uint8_t *p_ref = last_picture->y_buffer;
+      int stride_cur = cur_picture->y_stride;
+      int stride_ref = last_picture->y_stride;
+      p_cur += (y_pos * stride_cur + x_pos);
+      p_ref += (y_pos * stride_ref + x_pos);
+
+      if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+        uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+            if (p16_cur[tmpX] != p16_ref[tmpX]) {
+              match = 0;
+            }
+          }
+          p16_cur += stride_cur;
+          p16_ref += stride_ref;
+        }
+      } else {
+        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+            if (p_cur[tmpX] != p_ref[tmpX]) {
+              match = 0;
+            }
+          }
+          p_cur += stride_cur;
+          p_ref += stride_ref;
+        }
+      }
+
+      if (match) {
+        C++;
+        continue;
+      }
+
+      if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+                                         y_pos) ||
+          av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+        S++;
+        continue;
+      }
+    }
+  }
+
+  assert(T > 0);
+  double cs_rate = ((double)(C + S)) / ((double)(T));
+
+  force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate;
+
+  force_intpel_info->rate_index =
+      (force_intpel_info->rate_index + 1) % max_history_size;
+  force_intpel_info->rate_size++;
+  force_intpel_info->rate_size =
+      AOMMIN(force_intpel_info->rate_size, max_history_size);
+
+  if (cs_rate < threshold_current) {
+    return 0;
+  }
+
+  if (C == T) {
+    return 1;
+  }
+
+  double cs_average = 0.0;
+
+  for (k = 0; k < force_intpel_info->rate_size; k++) {
+    cs_average += force_intpel_info->cs_rate_array[k];
+  }
+  cs_average /= force_intpel_info->rate_size;
+
+  if (cs_average < threshold_average) {
+    return 0;
+  }
+
+  if ((T - C - S) < 0) {
+    return 1;
+  }
+
+  if (cs_average > 1.01) {
+    return 1;
+  }
+
+  return 0;
+}
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  uint8_t *y_buffer = cpi->source->y_buffer;
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = BLOCK_16X16;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+  double log_sum = 0.0;
+  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  // Loop through each 16x16 block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      double var = 0.0, num_of_var = 0.0;
+      const int index = row * num_cols + col;
+
+      // Loop through each 8x8 block.
+      for (int mi_row = row * num_mi_h;
+           mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+           mi_row += 2) {
+        for (int mi_col = col * num_mi_w;
+             mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+             mi_col += 2) {
+          struct buf_2d buf;
+          const int row_offset_y = mi_row << 2;
+          const int col_offset_y = mi_col << 2;
+
+          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+          buf.stride = y_stride;
+
+          if (use_hbd) {
+            var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8,
+                                                      xd->bd);
+          } else {
+            var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
+          }
+
+          num_of_var += 1.0;
+        }
+      }
+      var = var / num_of_var;
+
+      // Curve fitting with an exponential model on all 16x16 blocks from the
+      // midres dataset.
+      var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+      cpi->ssim_rdmult_scaling_factors[index] = var;
+      log_sum += log(var);
+    }
+  }
+  log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+    }
+  }
+}
+
+// Coding context that only needs to be saved when recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoraton).
+static void save_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+
+  cc->lf = cm->lf;
+  cc->cdef_info = cm->cdef_info;
+  cc->rc = cpi->rc;
+  cc->mv_stats = cpi->mv_stats;
+}
+
+void av1_save_all_coding_context(AV1_COMP *cpi) {
+  save_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+#if DUMP_RECON_FRAMES == 1
+
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
+
+  if (recon_buf == NULL) {
+    printf("Frame %d is not ready.\n", current_frame->frame_number);
+    return;
+  }
+
+  static const int flag_list[REF_FRAMES] = { 0,
+                                             AOM_LAST_FLAG,
+                                             AOM_LAST2_FLAG,
+                                             AOM_LAST3_FLAG,
+                                             AOM_GOLD_FLAG,
+                                             AOM_BWD_FLAG,
+                                             AOM_ALT2_FLAG,
+                                             AOM_ALT_FLAG };
+  printf(
+      "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
+      "show_existing_frame=%d) "
+      "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
+      current_frame->frame_number, current_frame->order_hint, cm->show_frame,
+      cm->show_existing_frame);
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
+    printf(" %d(%c)", ref_offset,
+           (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
+  }
+  printf(" ]\n");
+
+  if (!cm->show_frame) {
+    printf("Frame %d is a no show frame, so no image dump.\n",
+           current_frame->frame_number);
+    return;
+  }
+
+  int h;
+  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+  FILE *f_recon = NULL;
+
+  if (current_frame->frame_number == 0) {
+    if ((f_recon = fopen(file_name, "wb")) == NULL) {
+      printf("Unable to open file %s to write.\n", file_name);
+      return;
+    }
+  } else {
+    if ((f_recon = fopen(file_name, "ab")) == NULL) {
+      printf("Unable to open file %s to append.\n", file_name);
+      return;
+    }
+  }
+  printf(
+      "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
+      "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
+      "refresh_alt_ref_frame=%d, "
+      "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
+      current_frame->frame_number, cpi->gf_group.index,
+      cpi->gf_group.update_type[cpi->gf_group.index], current_frame->order_hint,
+      cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active,
+      cpi->refresh_frame.alt_ref_frame, recon_buf->y_stride,
+      recon_buf->uv_stride, cm->width, cm->height);
+#if 0
+  int ref_frame;
+  printf("get_ref_frame_map_idx: [");
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+    printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
+  printf(" ]\n");
+#endif  // 0
+
+  // --- Y ---
+  for (h = 0; h < cm->height; ++h) {
+    fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+           f_recon);
+  }
+  // --- U ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+           f_recon);
+  }
+  // --- V ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+           f_recon);
+  }
+
+  fclose(f_recon);
+}
+#endif  // DUMP_RECON_FRAMES
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
new file mode 100644
index 0000000..ef7538f
--- /dev/null
+++ b/av1/encoder/encoder_utils.h
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODER_UTILS_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+#define DUMP_RECON_FRAMES 0
+
+extern const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL]
+                                      [TX_TYPES];
+
+extern const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+extern const int default_warped_probs[FRAME_UPDATE_TYPES];
+
+extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+                                                [SWITCHABLE_FILTER_CONTEXTS]
+                                                [SWITCHABLE_FILTERS];
+
+// Mark all inactive blocks as active. Other segmentation features may be set
+// so memset cannot be used, instead only inactive blocks should be reset.
+static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) {
+  unsigned char *const seg_map = cpi->enc_seg.map;
+  int i;
+  if (cpi->active_map.enabled || cpi->active_map.update)
+    for (i = 0;
+         i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; ++i)
+      if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+        seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width,
+                                 int height) {
+  // Ensure that the decoded width and height are both multiples of
+  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
+  // subsampling is used).
+  // This simplifies the implementation of various experiments,
+  // eg. cdef, which operates on units of 8x8 luma pixels.
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+
+  mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
+  mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
+  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
+
+  mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2;
+  mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
+  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
+
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  mi_params->mi_alloc_stride =
+      (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+
+  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
+         mi_size_high[mi_params->mi_alloc_bsize]);
+
+#if CONFIG_LPF_MASK
+  av1_alloc_loop_filter_mask(mi_params);
+#endif
+}
+
+static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) {
+  aom_free(mi_params->mi_alloc);
+  mi_params->mi_alloc = NULL;
+  aom_free(mi_params->mi_grid_base);
+  mi_params->mi_grid_base = NULL;
+  mi_params->mi_alloc_size = 0;
+  aom_free(mi_params->tx_type_map);
+  mi_params->tx_type_map = NULL;
+}
+
+static AOM_INLINE void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+                                     int height) {
+  const int is_4k_or_larger = AOMMIN(width, height) >= 2160;
+  mi_params->mi_alloc_bsize = is_4k_or_larger ? BLOCK_8X8 : BLOCK_4X4;
+
+  set_mb_mi(mi_params, width, height);
+}
+
+static AOM_INLINE void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params,
+                                            int width, int height) {
+  mi_params->mi_alloc_bsize = BLOCK_16X16;
+
+  set_mb_mi(mi_params, width, height);
+}
+
+static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) {
+  const int mi_grid_size =
+      mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
+  memset(mi_params->mi_alloc, 0,
+         mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc));
+  memset(mi_params->mi_grid_base, 0,
+         mi_grid_size * sizeof(*mi_params->mi_grid_base));
+  memset(mi_params->tx_type_map, 0,
+         mi_grid_size * sizeof(*mi_params->tx_type_map));
+}
+
+static AOM_INLINE void init_buffer_indices(
+    ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) {
+  int fb_idx;
+  for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
+    remapped_ref_idx[fb_idx] = fb_idx;
+  force_intpel_info->rate_index = 0;
+  force_intpel_info->rate_size = 0;
+}
+
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
+  cpi->fn_ptr[BT].sdf = SDF;                                           \
+  cpi->fn_ptr[BT].sdaf = SDAF;                                         \
+  cpi->fn_ptr[BT].vf = VF;                                             \
+  cpi->fn_ptr[BT].svf = SVF;                                           \
+  cpi->fn_ptr[BT].svaf = SVAF;                                         \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                                     \
+  cpi->fn_ptr[BT].jsdaf = JSDAF;                                       \
+  cpi->fn_ptr[BT].jsvaf = JSVAF;
+
+#define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD)                                \
+  HIGHBD_BFP(                                                                \
+      BLOCK_##WIDTH##X##HEIGHT, aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, \
+      aom_highbd_sad##WIDTH##x##HEIGHT##_avg_bits##BD,                       \
+      aom_highbd_##BD##_variance##WIDTH##x##HEIGHT,                          \
+      aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT,                \
+      aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT,            \
+      aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD,                        \
+      aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD,              \
+      aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
+  static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
+                                     int source_stride,                        \
+                                     const uint8_t *ref_ptr, int ref_stride) { \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride);                \
+  }                                                                            \
+  static unsigned int fnname##_bits10(                                         \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride) {                                                        \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2;           \
+  }                                                                            \
+  static unsigned int fnname##_bits12(                                         \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride) {                                                        \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4;           \
+  }
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname)                                        \
+  static unsigned int fnname##_bits8(                                          \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, const uint8_t *second_pred) {                            \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred);   \
+  }                                                                            \
+  static unsigned int fnname##_bits10(                                         \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, const uint8_t *second_pred) {                            \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+           2;                                                                  \
+  }                                                                            \
+  static unsigned int fnname##_bits12(                                         \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, const uint8_t *second_pred) {                            \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+           4;                                                                  \
+  }
+
+#define MAKE_BFP_SAD4D_WRAPPER(fnname)                                        \
+  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
+                             const uint8_t *const ref_ptr[], int ref_stride,  \
+                             unsigned int *sad_array) {                       \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+  }                                                                           \
+  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 2;                               \
+  }                                                                           \
+  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 4;                               \
+  }
+
+#define MAKE_BFP_JSADAVG_WRAPPER(fnname)                                    \
+  static unsigned int fnname##_bits8(                                       \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param);                                               \
+  }                                                                         \
+  static unsigned int fnname##_bits10(                                      \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param) >>                                             \
+           2;                                                               \
+  }                                                                         \
+  static unsigned int fnname##_bits12(                                      \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param) >>                                             \
+           4;                                                               \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+#endif
+
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+  cpi->fn_ptr[BT].msdf = MCSDF;       \
+  cpi->fn_ptr[BT].msvf = MCSVF;
+
+#define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD)                    \
+  HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT,                           \
+              aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \
+              aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname)                           \
+  static unsigned int fnname##_bits8(                                    \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
+      int m_stride, int invert_mask) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
+                  second_pred_ptr, m, m_stride, invert_mask);            \
+  }                                                                      \
+  static unsigned int fnname##_bits10(                                   \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
+      int m_stride, int invert_mask) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
+                  second_pred_ptr, m, m_stride, invert_mask) >>          \
+           2;                                                            \
+  }                                                                      \
+  static unsigned int fnname##_bits12(                                   \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
+      int m_stride, int invert_mask) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
+                  second_pred_ptr, m, m_stride, invert_mask) >>          \
+           4;                                                            \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+#if !CONFIG_REALTIME_ONLY
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+#endif
+#endif
+
+#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdsf = SDSF;          \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD)                   \
+  HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT,                          \
+               aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \
+               aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD)
+
+#define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname)                                  \
+  static unsigned int fnname##_bits8(const uint8_t *src, int src_stride,    \
+                                     const uint8_t *ref, int ref_stride) {  \
+    return fnname(src, src_stride, ref, ref_stride);                        \
+  }                                                                         \
+  static unsigned int fnname##_bits10(const uint8_t *src, int src_stride,   \
+                                      const uint8_t *ref, int ref_stride) { \
+    return fnname(src, src_stride, ref, ref_stride) >> 2;                   \
+  }                                                                         \
+  static unsigned int fnname##_bits12(const uint8_t *src, int src_stride,   \
+                                      const uint8_t *ref, int ref_stride) { \
+    return fnname(src, src_stride, ref, ref_stride) >> 4;                   \
+  }
+
+#define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname)                                 \
+  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
+                             const uint8_t *const ref_ptr[], int ref_stride,  \
+                             unsigned int *sad_array) {                       \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+  }                                                                           \
+  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 2;                               \
+  }                                                                           \
+  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 4;                               \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x8)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32)
+#endif
+
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d)
+#endif
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+
+#define LOWBD_OBFP(BT, OSDF, OVF, OSVF) \
+  cpi->fn_ptr[BT].osdf = OSDF;          \
+  cpi->fn_ptr[BT].ovf = OVF;            \
+  cpi->fn_ptr[BT].osvf = OSVF;
+
+#define LOWBD_OBFP_WRAPPER(WIDTH, HEIGHT)                              \
+  LOWBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, aom_obmc_sad##WIDTH##x##HEIGHT, \
+             aom_obmc_variance##WIDTH##x##HEIGHT,                      \
+             aom_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+  cpi->fn_ptr[BT].osdf = OSDF;           \
+  cpi->fn_ptr[BT].ovf = OVF;             \
+  cpi->fn_ptr[BT].osvf = OSVF;
+
+#define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD)                   \
+  HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT,                          \
+              aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits##BD,  \
+              aom_highbd_##BD##_obmc_variance##WIDTH##x##HEIGHT, \
+              aom_highbd_##BD##_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_OBFP_SAD_WRAPPER(fnname)                                     \
+  static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
+                                      const int32_t *wsrc,                \
+                                      const int32_t *msk) {               \
+    return fnname(ref, ref_stride, wsrc, msk) >> 2;                       \
+  }                                                                       \
+  static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
+                                      const int32_t *wsrc,                \
+                                      const int32_t *msk) {               \
+    return fnname(ref, ref_stride, wsrc, msk) >> 4;                       \
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
+#endif
+
+static AOM_INLINE void highbd_set_var_fns(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (cm->seq_params.use_highbitdepth) {
+    switch (cm->seq_params.bit_depth) {
+      case AOM_BITS_8:
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_BFP_WRAPPER(64, 16, 8)
+        HIGHBD_BFP_WRAPPER(16, 64, 8)
+        HIGHBD_BFP_WRAPPER(32, 8, 8)
+        HIGHBD_BFP_WRAPPER(8, 32, 8)
+        HIGHBD_BFP_WRAPPER(16, 4, 8)
+        HIGHBD_BFP_WRAPPER(4, 16, 8)
+#endif
+        HIGHBD_BFP_WRAPPER(32, 16, 8)
+        HIGHBD_BFP_WRAPPER(16, 32, 8)
+        HIGHBD_BFP_WRAPPER(64, 32, 8)
+        HIGHBD_BFP_WRAPPER(32, 64, 8)
+        HIGHBD_BFP_WRAPPER(32, 32, 8)
+        HIGHBD_BFP_WRAPPER(64, 64, 8)
+        HIGHBD_BFP_WRAPPER(16, 16, 8)
+        HIGHBD_BFP_WRAPPER(16, 8, 8)
+        HIGHBD_BFP_WRAPPER(8, 16, 8)
+        HIGHBD_BFP_WRAPPER(8, 8, 8)
+        HIGHBD_BFP_WRAPPER(8, 4, 8)
+        HIGHBD_BFP_WRAPPER(4, 8, 8)
+        HIGHBD_BFP_WRAPPER(4, 4, 8)
+        HIGHBD_BFP_WRAPPER(128, 128, 8)
+        HIGHBD_BFP_WRAPPER(128, 64, 8)
+        HIGHBD_BFP_WRAPPER(64, 128, 8)
+
+        HIGHBD_MBFP_WRAPPER(128, 128, 8)
+        HIGHBD_MBFP_WRAPPER(128, 64, 8)
+        HIGHBD_MBFP_WRAPPER(64, 128, 8)
+        HIGHBD_MBFP_WRAPPER(64, 64, 8)
+        HIGHBD_MBFP_WRAPPER(64, 32, 8)
+        HIGHBD_MBFP_WRAPPER(32, 64, 8)
+        HIGHBD_MBFP_WRAPPER(32, 32, 8)
+        HIGHBD_MBFP_WRAPPER(32, 16, 8)
+        HIGHBD_MBFP_WRAPPER(16, 32, 8)
+        HIGHBD_MBFP_WRAPPER(16, 16, 8)
+        HIGHBD_MBFP_WRAPPER(8, 16, 8)
+        HIGHBD_MBFP_WRAPPER(16, 8, 8)
+        HIGHBD_MBFP_WRAPPER(8, 8, 8)
+        HIGHBD_MBFP_WRAPPER(4, 8, 8)
+        HIGHBD_MBFP_WRAPPER(8, 4, 8)
+        HIGHBD_MBFP_WRAPPER(4, 4, 8)
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_MBFP_WRAPPER(64, 16, 8)
+        HIGHBD_MBFP_WRAPPER(16, 64, 8)
+        HIGHBD_MBFP_WRAPPER(32, 8, 8)
+        HIGHBD_MBFP_WRAPPER(8, 32, 8)
+        HIGHBD_MBFP_WRAPPER(16, 4, 8)
+        HIGHBD_MBFP_WRAPPER(4, 16, 8)
+#endif
+
+// OBMC excluded from realtime only build.
+#if !CONFIG_REALTIME_ONLY
+        LOWBD_OBFP_WRAPPER(128, 128)
+        LOWBD_OBFP_WRAPPER(128, 64)
+        LOWBD_OBFP_WRAPPER(64, 128)
+        LOWBD_OBFP_WRAPPER(64, 64)
+        LOWBD_OBFP_WRAPPER(64, 32)
+        LOWBD_OBFP_WRAPPER(32, 64)
+        LOWBD_OBFP_WRAPPER(32, 32)
+        LOWBD_OBFP_WRAPPER(32, 16)
+        LOWBD_OBFP_WRAPPER(16, 32)
+        LOWBD_OBFP_WRAPPER(16, 16)
+        LOWBD_OBFP_WRAPPER(8, 16)
+        LOWBD_OBFP_WRAPPER(16, 8)
+        LOWBD_OBFP_WRAPPER(8, 8)
+        LOWBD_OBFP_WRAPPER(4, 8)
+        LOWBD_OBFP_WRAPPER(8, 4)
+        LOWBD_OBFP_WRAPPER(4, 4)
+        LOWBD_OBFP_WRAPPER(64, 16)
+        LOWBD_OBFP_WRAPPER(16, 64)
+        LOWBD_OBFP_WRAPPER(32, 8)
+        LOWBD_OBFP_WRAPPER(8, 32)
+        LOWBD_OBFP_WRAPPER(16, 4)
+        LOWBD_OBFP_WRAPPER(4, 16)
+#endif
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 8);
+        HIGHBD_SDSFP_WRAPPER(128, 64, 8);
+        HIGHBD_SDSFP_WRAPPER(64, 128, 8);
+        HIGHBD_SDSFP_WRAPPER(64, 64, 8);
+        HIGHBD_SDSFP_WRAPPER(64, 32, 8);
+        HIGHBD_SDSFP_WRAPPER(32, 64, 8);
+        HIGHBD_SDSFP_WRAPPER(32, 32, 8);
+        HIGHBD_SDSFP_WRAPPER(32, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(16, 32, 8);
+        HIGHBD_SDSFP_WRAPPER(16, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(16, 8, 8);
+        HIGHBD_SDSFP_WRAPPER(8, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(8, 8, 8);
+        HIGHBD_SDSFP_WRAPPER(4, 8, 8);
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_SDSFP_WRAPPER(64, 16, 8);
+        HIGHBD_SDSFP_WRAPPER(32, 8, 8);
+        HIGHBD_SDSFP_WRAPPER(16, 64, 8);
+        HIGHBD_SDSFP_WRAPPER(8, 32, 8);
+        HIGHBD_SDSFP_WRAPPER(4, 16, 8);
+#endif
+        break;
+
+      case AOM_BITS_10:
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_BFP_WRAPPER(64, 16, 10)
+        HIGHBD_BFP_WRAPPER(16, 64, 10)
+        HIGHBD_BFP_WRAPPER(32, 8, 10)
+        HIGHBD_BFP_WRAPPER(8, 32, 10)
+        HIGHBD_BFP_WRAPPER(16, 4, 10)
+        HIGHBD_BFP_WRAPPER(4, 16, 10)
+#endif
+        HIGHBD_BFP_WRAPPER(32, 16, 10)
+        HIGHBD_BFP_WRAPPER(16, 32, 10)
+        HIGHBD_BFP_WRAPPER(64, 32, 10)
+        HIGHBD_BFP_WRAPPER(32, 64, 10)
+        HIGHBD_BFP_WRAPPER(32, 32, 10)
+        HIGHBD_BFP_WRAPPER(64, 64, 10)
+        HIGHBD_BFP_WRAPPER(16, 16, 10)
+        HIGHBD_BFP_WRAPPER(16, 8, 10)
+        HIGHBD_BFP_WRAPPER(8, 16, 10)
+        HIGHBD_BFP_WRAPPER(8, 8, 10)
+        HIGHBD_BFP_WRAPPER(8, 4, 10)
+        HIGHBD_BFP_WRAPPER(4, 8, 10)
+        HIGHBD_BFP_WRAPPER(4, 4, 10)
+        HIGHBD_BFP_WRAPPER(128, 128, 10)
+        HIGHBD_BFP_WRAPPER(128, 64, 10)
+        HIGHBD_BFP_WRAPPER(64, 128, 10)
+
+        HIGHBD_MBFP_WRAPPER(128, 128, 10)
+        HIGHBD_MBFP_WRAPPER(128, 64, 10)
+        HIGHBD_MBFP_WRAPPER(64, 128, 10)
+        HIGHBD_MBFP_WRAPPER(64, 64, 10)
+        HIGHBD_MBFP_WRAPPER(64, 32, 10)
+        HIGHBD_MBFP_WRAPPER(32, 64, 10)
+        HIGHBD_MBFP_WRAPPER(32, 32, 10)
+        HIGHBD_MBFP_WRAPPER(32, 16, 10)
+        HIGHBD_MBFP_WRAPPER(16, 32, 10)
+        HIGHBD_MBFP_WRAPPER(16, 16, 10)
+        HIGHBD_MBFP_WRAPPER(8, 16, 10)
+        HIGHBD_MBFP_WRAPPER(16, 8, 10)
+        HIGHBD_MBFP_WRAPPER(8, 8, 10)
+        HIGHBD_MBFP_WRAPPER(4, 8, 10)
+        HIGHBD_MBFP_WRAPPER(8, 4, 10)
+        HIGHBD_MBFP_WRAPPER(4, 4, 10)
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_MBFP_WRAPPER(64, 16, 10)
+        HIGHBD_MBFP_WRAPPER(16, 64, 10)
+        HIGHBD_MBFP_WRAPPER(32, 8, 10)
+        HIGHBD_MBFP_WRAPPER(8, 32, 10)
+        HIGHBD_MBFP_WRAPPER(16, 4, 10)
+        HIGHBD_MBFP_WRAPPER(4, 16, 10)
+#endif
+
+// OBMC excluded from realtime only build.
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_OBFP_WRAPPER(128, 128, 10)
+        HIGHBD_OBFP_WRAPPER(128, 64, 10)
+        HIGHBD_OBFP_WRAPPER(64, 128, 10)
+        HIGHBD_OBFP_WRAPPER(64, 64, 10)
+        HIGHBD_OBFP_WRAPPER(64, 32, 10)
+        HIGHBD_OBFP_WRAPPER(32, 64, 10)
+        HIGHBD_OBFP_WRAPPER(32, 32, 10)
+        HIGHBD_OBFP_WRAPPER(32, 16, 10)
+        HIGHBD_OBFP_WRAPPER(16, 32, 10)
+        HIGHBD_OBFP_WRAPPER(16, 16, 10)
+        HIGHBD_OBFP_WRAPPER(8, 16, 10)
+        HIGHBD_OBFP_WRAPPER(16, 8, 10)
+        HIGHBD_OBFP_WRAPPER(8, 8, 10)
+        HIGHBD_OBFP_WRAPPER(4, 8, 10)
+        HIGHBD_OBFP_WRAPPER(8, 4, 10)
+        HIGHBD_OBFP_WRAPPER(4, 4, 10)
+        HIGHBD_OBFP_WRAPPER(64, 16, 10)
+        HIGHBD_OBFP_WRAPPER(16, 64, 10)
+        HIGHBD_OBFP_WRAPPER(32, 8, 10)
+        HIGHBD_OBFP_WRAPPER(8, 32, 10)
+        HIGHBD_OBFP_WRAPPER(16, 4, 10)
+        HIGHBD_OBFP_WRAPPER(4, 16, 10)
+#endif
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 10);
+        HIGHBD_SDSFP_WRAPPER(128, 64, 10);
+        HIGHBD_SDSFP_WRAPPER(64, 128, 10);
+        HIGHBD_SDSFP_WRAPPER(64, 64, 10);
+        HIGHBD_SDSFP_WRAPPER(64, 32, 10);
+        HIGHBD_SDSFP_WRAPPER(32, 64, 10);
+        HIGHBD_SDSFP_WRAPPER(32, 32, 10);
+        HIGHBD_SDSFP_WRAPPER(32, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(16, 32, 10);
+        HIGHBD_SDSFP_WRAPPER(16, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(16, 8, 10);
+        HIGHBD_SDSFP_WRAPPER(8, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(8, 8, 10);
+        HIGHBD_SDSFP_WRAPPER(4, 8, 10);
+
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_SDSFP_WRAPPER(64, 16, 10);
+        HIGHBD_SDSFP_WRAPPER(32, 8, 10);
+        HIGHBD_SDSFP_WRAPPER(16, 64, 10);
+        HIGHBD_SDSFP_WRAPPER(8, 32, 10);
+        HIGHBD_SDSFP_WRAPPER(4, 16, 10);
+#endif
+        break;
+
+      case AOM_BITS_12:
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_BFP_WRAPPER(64, 16, 12)
+        HIGHBD_BFP_WRAPPER(16, 64, 12)
+        HIGHBD_BFP_WRAPPER(32, 8, 12)
+        HIGHBD_BFP_WRAPPER(8, 32, 12)
+        HIGHBD_BFP_WRAPPER(16, 4, 12)
+        HIGHBD_BFP_WRAPPER(4, 16, 12)
+#endif
+        HIGHBD_BFP_WRAPPER(32, 16, 12)
+        HIGHBD_BFP_WRAPPER(16, 32, 12)
+        HIGHBD_BFP_WRAPPER(64, 32, 12)
+        HIGHBD_BFP_WRAPPER(32, 64, 12)
+        HIGHBD_BFP_WRAPPER(32, 32, 12)
+        HIGHBD_BFP_WRAPPER(64, 64, 12)
+        HIGHBD_BFP_WRAPPER(16, 16, 12)
+        HIGHBD_BFP_WRAPPER(16, 8, 12)
+        HIGHBD_BFP_WRAPPER(8, 16, 12)
+        HIGHBD_BFP_WRAPPER(8, 8, 12)
+        HIGHBD_BFP_WRAPPER(8, 4, 12)
+        HIGHBD_BFP_WRAPPER(4, 8, 12)
+        HIGHBD_BFP_WRAPPER(4, 4, 12)
+        HIGHBD_BFP_WRAPPER(128, 128, 12)
+        HIGHBD_BFP_WRAPPER(128, 64, 12)
+        HIGHBD_BFP_WRAPPER(64, 128, 12)
+
+        HIGHBD_MBFP_WRAPPER(128, 128, 12)
+        HIGHBD_MBFP_WRAPPER(128, 64, 12)
+        HIGHBD_MBFP_WRAPPER(64, 128, 12)
+        HIGHBD_MBFP_WRAPPER(64, 64, 12)
+        HIGHBD_MBFP_WRAPPER(64, 32, 12)
+        HIGHBD_MBFP_WRAPPER(32, 64, 12)
+        HIGHBD_MBFP_WRAPPER(32, 32, 12)
+        HIGHBD_MBFP_WRAPPER(32, 16, 12)
+        HIGHBD_MBFP_WRAPPER(16, 32, 12)
+        HIGHBD_MBFP_WRAPPER(16, 16, 12)
+        HIGHBD_MBFP_WRAPPER(8, 16, 12)
+        HIGHBD_MBFP_WRAPPER(16, 8, 12)
+        HIGHBD_MBFP_WRAPPER(8, 8, 12)
+        HIGHBD_MBFP_WRAPPER(4, 8, 12)
+        HIGHBD_MBFP_WRAPPER(8, 4, 12)
+        HIGHBD_MBFP_WRAPPER(4, 4, 12)
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_MBFP_WRAPPER(64, 16, 12)
+        HIGHBD_MBFP_WRAPPER(16, 64, 12)
+        HIGHBD_MBFP_WRAPPER(32, 8, 12)
+        HIGHBD_MBFP_WRAPPER(8, 32, 12)
+        HIGHBD_MBFP_WRAPPER(16, 4, 12)
+        HIGHBD_MBFP_WRAPPER(4, 16, 12)
+#endif
+
+// OBMC excluded from realtime only build.
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_OBFP_WRAPPER(128, 128, 12)
+        HIGHBD_OBFP_WRAPPER(128, 64, 12)
+        HIGHBD_OBFP_WRAPPER(64, 128, 12)
+        HIGHBD_OBFP_WRAPPER(64, 64, 12)
+        HIGHBD_OBFP_WRAPPER(64, 32, 12)
+        HIGHBD_OBFP_WRAPPER(32, 64, 12)
+        HIGHBD_OBFP_WRAPPER(32, 32, 12)
+        HIGHBD_OBFP_WRAPPER(32, 16, 12)
+        HIGHBD_OBFP_WRAPPER(16, 32, 12)
+        HIGHBD_OBFP_WRAPPER(16, 16, 12)
+        HIGHBD_OBFP_WRAPPER(8, 16, 12)
+        HIGHBD_OBFP_WRAPPER(16, 8, 12)
+        HIGHBD_OBFP_WRAPPER(8, 8, 12)
+        HIGHBD_OBFP_WRAPPER(4, 8, 12)
+        HIGHBD_OBFP_WRAPPER(8, 4, 12)
+        HIGHBD_OBFP_WRAPPER(4, 4, 12)
+        HIGHBD_OBFP_WRAPPER(64, 16, 12)
+        HIGHBD_OBFP_WRAPPER(16, 64, 12)
+        HIGHBD_OBFP_WRAPPER(32, 8, 12)
+        HIGHBD_OBFP_WRAPPER(8, 32, 12)
+        HIGHBD_OBFP_WRAPPER(16, 4, 12)
+        HIGHBD_OBFP_WRAPPER(4, 16, 12)
+#endif
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 12);
+        HIGHBD_SDSFP_WRAPPER(128, 64, 12);
+        HIGHBD_SDSFP_WRAPPER(64, 128, 12);
+        HIGHBD_SDSFP_WRAPPER(64, 64, 12);
+        HIGHBD_SDSFP_WRAPPER(64, 32, 12);
+        HIGHBD_SDSFP_WRAPPER(32, 64, 12);
+        HIGHBD_SDSFP_WRAPPER(32, 32, 12);
+        HIGHBD_SDSFP_WRAPPER(32, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(16, 32, 12);
+        HIGHBD_SDSFP_WRAPPER(16, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(16, 8, 12);
+        HIGHBD_SDSFP_WRAPPER(8, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(8, 8, 12);
+        HIGHBD_SDSFP_WRAPPER(4, 8, 12);
+
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_SDSFP_WRAPPER(64, 16, 12);
+        HIGHBD_SDSFP_WRAPPER(32, 8, 12);
+        HIGHBD_SDSFP_WRAPPER(16, 64, 12);
+        HIGHBD_SDSFP_WRAPPER(8, 32, 12);
+        HIGHBD_SDSFP_WRAPPER(4, 16, 12);
+#endif
+        break;
+
+      default:
+        assert(0 &&
+               "cm->seq_params.bit_depth should be AOM_BITS_8, "
+               "AOM_BITS_10 or AOM_BITS_12");
+    }
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) {
+  FrameProbInfo *const frame_probs = &cpi->frame_probs;
+  if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+    av1_copy(frame_probs->tx_type_probs, default_tx_type_probs);
+  }
+  if (!cpi->sf.inter_sf.disable_obmc &&
+      cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) {
+    av1_copy(frame_probs->obmc_probs, default_obmc_probs);
+  }
+  if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+    av1_copy(frame_probs->warped_probs, default_warped_probs);
+  }
+  if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+    av1_copy(frame_probs->switchable_interp_probs,
+             default_switchable_interp_probs);
+  }
+}
+
+// Coding context that only needs to be restored when recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoraton).
+static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  cm->lf = cc->lf;
+  cm->cdef_info = cc->cdef_info;
+  cpi->rc = cc->rc;
+  cpi->mv_stats = cc->mv_stats;
+}
+
+static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                                  const YV12_BUFFER_CONFIG *b) {
+  return a->y_height == b->y_height && a->y_width == b->y_width &&
+         a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+         a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+         a->border == b->border &&
+         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+             (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context,
+                                     bool *ext_refresh_frame_context_pending,
+                                     bool update) {
+  *ext_refresh_frame_context = update;
+  *ext_refresh_frame_context_pending = 1;
+  return 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor,
+                                                   double max_factor,
+                                                   int prior_boost,
+                                                   int tpl_boost,
+                                                   int frames_to_key) {
+  double factor = sqrt((double)frames_to_key);
+  double range = max_factor - min_factor;
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor -= min_factor;
+  int boost =
+      (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
+  return boost;
+}
+#endif
+
+static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) {
+  int i;
+  AV1_COMMON *const cm = &cpi->common;
+  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    cm->global_motion[i] = default_warp_params;
+  }
+  cpi->gm_info.search_done = 0;
+
+  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
+  av1_set_rd_speed_thresholds(cpi);
+  cm->features.interp_filter = SWITCHABLE;
+  cm->features.switchable_motion_mode = 1;
+}
+
+static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) {
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+    if (buf != NULL) {
+      --buf->ref_count;
+      cpi->scaled_ref_buf[i] = NULL;
+    }
+  }
+}
+
+static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) {
+  restore_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+// Refresh reference frame buffers according to refresh_frame_flags.
+static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  // All buffers are refreshed for shown keyframes and S-frames.
+
+  for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) {
+    if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) {
+      assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame);
+    }
+  }
+}
+
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+                                      const AV1EncoderConfig *oxcf);
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+                          const int phase, const int use_optimized_scaler);
+
+void av1_setup_frame(AV1_COMP *cpi);
+
+BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi);
+
+void av1_apply_active_map(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi);
+
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig);
+
+int av1_recode_loop_test_global_motion(WarpedMotionParams *const global_motion,
+                                       const int *const global_motion_used,
+                                       int *const gm_params_cost);
+#endif
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+                                 int *top_index);
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi);
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+                      const YV12_BUFFER_CONFIG *last_picture,
+                      ForceIntegerMVInfo *const force_intpel_info);
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi);
+
+void av1_save_all_coding_context(AV1_COMP *cpi);
+
+#if DUMP_RECON_FRAMES == 1
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODER_UTILS_H_
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 825d52a..2975405 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -23,27 +23,6 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
-#if CONFIG_HTB_TRELLIS
-static int hbt_needs_init = 1;
-static CRC32C crc_calculator;
-static const int HBT_EOB = 16;            // also the length in opt_qcoeff
-static const int HBT_TABLE_SIZE = 65536;  // 16 bit: holds 65536 'arrays'
-static const int HBT_ARRAY_LENGTH = 256;  // 8 bit: 256 entries
-// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type
-static const int HBT_KICKOUT = 3;
-
-typedef struct OptTxbQcoeff {
-  // Use larger type if larger/no kickout value is used in hbt_create_hashes
-  int8_t deltas[16];
-  uint32_t hbt_qc_hash;
-  uint32_t hbt_ctx_hash;
-  int init;
-  int rate_cost;
-} OptTxbQcoeff;
-
-OptTxbQcoeff *hbt_hash_table;
-#endif  // CONFIG_HTB_TRELLIS
-
 typedef struct LevelDownStats {
   int update;
   tran_low_t low_qc;
@@ -73,16 +52,53 @@
 
 void av1_alloc_txb_buf(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
+  CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
   int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) *
              ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1);
+  const int num_planes = av1_num_planes(cm);
+  const int subsampling_x = cm->seq_params.subsampling_x;
+  const int subsampling_y = cm->seq_params.subsampling_y;
+  const int chroma_max_sb_square =
+      MAX_SB_SQUARE >> (subsampling_x + subsampling_y);
+  const int num_tcoeffs =
+      size * (MAX_SB_SQUARE + (num_planes - 1) * chroma_max_sb_square);
+  const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN;
 
   av1_free_txb_buf(cpi);
   // TODO(jingning): This should be further reduced.
-  CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
-                  aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size));
+  cpi->coeff_buffer_base = aom_malloc(sizeof(*cpi->coeff_buffer_base) * size);
+  CHECK_MEM_ERROR(
+      cm, coeff_buf_pool->tcoeff,
+      aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs));
+  coeff_buf_pool->eobs =
+      aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size);
+  coeff_buf_pool->entropy_ctx = aom_malloc(
+      sizeof(*coeff_buf_pool->entropy_ctx) * num_tcoeffs / txb_unit_size);
+
+  tran_low_t *tcoeff_ptr = coeff_buf_pool->tcoeff;
+  uint16_t *eob_ptr = coeff_buf_pool->eobs;
+  uint8_t *entropy_ctx_ptr = coeff_buf_pool->entropy_ctx;
+  for (int i = 0; i < size; i++) {
+    for (int plane = 0; plane < num_planes; plane++) {
+      const int max_sb_square =
+          (plane == AOM_PLANE_Y) ? MAX_SB_SQUARE : chroma_max_sb_square;
+      cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr;
+      cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr;
+      cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr;
+      tcoeff_ptr += max_sb_square;
+      eob_ptr += max_sb_square / txb_unit_size;
+      entropy_ctx_ptr += max_sb_square / txb_unit_size;
+    }
+  }
 }
 
-void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
+void av1_free_txb_buf(AV1_COMP *cpi) {
+  CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+  aom_free(cpi->coeff_buffer_base);
+  aom_free(coeff_buf_pool->tcoeff);
+  aom_free(coeff_buf_pool->eobs);
+  aom_free(coeff_buf_pool->entropy_ctx);
+}
 
 static void write_golomb(aom_writer *w, int level) {
   int x = level + 1;
@@ -100,23 +116,6 @@
   for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
 }
 
-static INLINE tran_low_t get_lower_coeff(tran_low_t qc) {
-  if (qc == 0) {
-    return 0;
-  }
-  return qc > 0 ? qc - 1 : qc + 1;
-}
-
-static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx,
-                                           int dqv, int shift,
-                                           const qm_val_t *iqmatrix) {
-  int sign = qc < 0 ? -1 : 1;
-  if (iqmatrix != NULL)
-    dqv =
-        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-  return sign * ((abs(qc) * dqv) >> shift);
-}
-
 static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
                                      int shift) {
   const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
@@ -265,16 +264,6 @@
   return eob_cost;
 }
 
-static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
-                                    const int (*dc_sign_cost)[2],
-                                    int dc_sign_ctx) {
-  if (coeff_idx == 0) {
-    const int sign = (qc < 0) ? 1 : 0;
-    return dc_sign_cost[dc_sign_ctx][sign];
-  }
-  return av1_cost_literal(1);
-}
-
 static const int golomb_bits_cost[32] = {
   0,       512,     512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
   512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
@@ -321,35 +310,6 @@
   return coeff_lps[base_range] + get_golomb_cost(level);
 }
 
-static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
-                          const int is_eob, const TxbInfo *const txb_info,
-                          const LV_MAP_COEFF_COST *const txb_costs,
-                          const int coeff_ctx, const TX_CLASS tx_class) {
-  const TXB_CTX *const txb_ctx = txb_info->txb_ctx;
-  const int is_nz = (qc != 0);
-  const tran_low_t abs_qc = abs(qc);
-  int cost = 0;
-  const int16_t *const scan = txb_info->scan_order->scan;
-  const int pos = scan[scan_idx];
-
-  if (is_eob) {
-    cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
-  } else {
-    cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
-  }
-  if (is_nz) {
-    cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost,
-                              txb_ctx->dc_sign_ctx);
-
-    if (abs_qc > NUM_BASE_LEVELS) {
-      const int ctx =
-          get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
-      cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]);
-    }
-  }
-  return cost;
-}
-
 static INLINE int get_nz_map_ctx(const uint8_t *const levels,
                                  const int coeff_idx, const int bwl,
                                  const int height, const int scan_idx,
@@ -366,111 +326,6 @@
   return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
 }
 
-static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx,
-                                const int is_eob,
-                                const LV_MAP_COEFF_COST *const txb_costs,
-                                const TxbInfo *const txb_info,
-                                const TX_CLASS tx_class) {
-  const int16_t *const scan = txb_info->scan_order->scan;
-  const int coeff_idx = scan[scan_idx];
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const uint8_t *const levels = txb_info->levels;
-  stats->new_eob = -1;
-  stats->update = 0;
-  stats->rd_low = 0;
-  stats->rd = 0;
-  stats->nz_rd = 0;
-  stats->dist_low = 0;
-  stats->rate_low = 0;
-  stats->low_qc = 0;
-
-  const tran_low_t tqc = txb_info->tcoeff[coeff_idx];
-  const int dqv = txb_info->dequant[coeff_idx != 0];
-  const int coeff_ctx =
-      get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height,
-                     scan_idx, is_eob, txb_info->tx_size, tx_class);
-  const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs,
-                                     coeff_ctx, tx_class);
-  assert(qc != 0);
-  const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift,
-                                           txb_info->iqmatrix);
-  const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift);
-
-  // distortion difference when coefficient is quantized to 0
-  const tran_low_t dqc0 =
-      qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
-
-  stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift);
-  stats->dist = dqc_dist - stats->dist0;
-  stats->rate = qc_cost;
-
-  stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist);
-
-  stats->low_qc = get_lower_coeff(qc);
-
-  if (is_eob && stats->low_qc == 0) {
-    stats->rd_low = stats->rd;  // disable selection of low_qc in this case.
-  } else {
-    if (stats->low_qc == 0) {
-      stats->dist_low = 0;
-    } else {
-      stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv,
-                                         txb_info->shift, txb_info->iqmatrix);
-      const int64_t low_dqc_dist =
-          get_coeff_dist(tqc, stats->low_dqc, txb_info->shift);
-      stats->dist_low = low_dqc_dist - stats->dist0;
-    }
-    const int low_qc_cost =
-        get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs,
-                       coeff_ctx, tx_class);
-    stats->rate_low = low_qc_cost;
-    stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low);
-  }
-}
-
-static void get_dist_cost_stats_with_eob(
-    LevelDownStats *const stats, const int scan_idx,
-    const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info,
-    const TX_CLASS tx_class) {
-  const int is_eob = 0;
-  get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class);
-
-  const int16_t *const scan = txb_info->scan_order->scan;
-  const int coeff_idx = scan[scan_idx];
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const int coeff_ctx_temp = get_nz_map_ctx(
-      txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1,
-      txb_info->tx_size, tx_class);
-  const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs,
-                                         coeff_ctx_temp, tx_class);
-  int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist);
-  if (stats->low_qc != 0) {
-    const int low_qc_eob_cost =
-        get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs,
-                       coeff_ctx_temp, tx_class);
-    int64_t rd_eob_low =
-        RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low);
-    rd_eob = (rd_eob > rd_eob_low) ? rd_eob_low : rd_eob;
-  }
-
-  stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob;
-}
-
-static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc,
-                                 const TxbInfo *const txb_info) {
-  txb_info->qcoeff[coeff_idx] = qc;
-  txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] =
-      (uint8_t)clamp(abs(qc), 0, INT8_MAX);
-}
-
-static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc,
-                                const TxbInfo *const txb_info) {
-  update_qcoeff(coeff_idx, qc, txb_info);
-  const int dqv = txb_info->dequant[coeff_idx != 0];
-  txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff(
-      qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
-}
-
 void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
                            const int height, uint8_t *const levels) {
   const int stride = width + TX_PAD_HOR;
@@ -507,8 +362,9 @@
                           int block, TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
-  const int txb_offset =
-      x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+                         (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
   const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
   const uint16_t eob = eob_txb[block];
   const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
@@ -518,7 +374,6 @@
   aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2);
   if (eob == 0) return;
 
-  const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_TYPE tx_type =
       av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
                       cm->features.reduced_tx_set_used);
@@ -582,7 +437,7 @@
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
   const tran_low_t *tcoeff_txb =
-      cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
+      cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
   const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
   av1_txb_init_levels(tcoeff, width, height, levels);
   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
@@ -647,8 +502,8 @@
   aom_writer *w;
 } ENCODE_TXB_ARGS;
 
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
-                         aom_writer *w, BLOCK_SIZE bsize) {
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                               aom_writer *w, BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   const int num_planes = av1_num_planes(cm);
   int block[MAX_MB_PLANE] = { 0 };
@@ -706,7 +561,8 @@
         get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
     if (is_inter) {
       if (ext_tx_set > 0)
-        return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
+        return x->mode_costs
+            .inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
     } else {
       if (ext_tx_set > 0) {
         PREDICTION_MODE intra_dir;
@@ -715,8 +571,8 @@
                                              .filter_intra_mode];
         else
           intra_dir = mbmi->mode;
-        return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir]
-                                     [tx_type];
+        return x->mode_costs.intra_tx_type_costs[ext_tx_set][square_tx_size]
+                                                [intra_dir][tx_type];
       }
     }
   }
@@ -772,7 +628,7 @@
   DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
   const int eob_multi_size = txsize_log2_minus4[tx_size];
   const LV_MAP_EOB_COST *const eob_costs =
-      &x->eob_costs[eob_multi_size][plane_type];
+      &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
   int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
 
   av1_txb_init_levels(qcoeff, width, height, levels);
@@ -859,7 +715,7 @@
 
   const int eob_multi_size = txsize_log2_minus4[tx_size];
   const LV_MAP_EOB_COST *const eob_costs =
-      &x->eob_costs[eob_multi_size][plane_type];
+      &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
   int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
 
   cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
@@ -922,7 +778,7 @@
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const LV_MAP_COEFF_COST *const coeff_costs =
-      &x->coeff_costs[txs_ctx][plane_type];
+      &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
   if (eob == 0) {
     return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
   }
@@ -949,9 +805,7 @@
     const int16_t *scan = scan_order->scan;
     tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
     tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
-    const MACROBLOCKD *xd = &x->e_mbd;
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+    tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
     update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
                           tcoeff, qcoeff, dqcoeff);
     p->eobs[block] = eob;
@@ -960,7 +814,7 @@
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const LV_MAP_COEFF_COST *const coeff_costs =
-      &x->coeff_costs[txs_ctx][plane_type];
+      &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
   if (eob == 0) {
     return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
   }
@@ -973,443 +827,6 @@
       tx_type, tx_class, reduced_tx_set_used);
 }
 
-static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                        const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) {
-  int update = 0;
-  if (txb_info->eob == 0) return update;
-  const int16_t *const scan = txb_info->scan_order->scan;
-  // forward optimize the nz_map`
-  const int init_eob = txb_info->eob;
-  const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type];
-  const int eob_cost =
-      get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class);
-
-  // backward optimize the level-k map
-  int accu_rate = eob_cost;
-  int64_t accu_dist = 0;
-  int64_t prev_eob_rd_cost = INT64_MAX;
-  int64_t cur_eob_rd_cost = 0;
-
-  {
-    const int si = init_eob - 1;
-    const int coeff_idx = scan[si];
-    LevelDownStats stats;
-    get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info,
-                        tx_class);
-    if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
-      update = 1;
-      update_coeff(coeff_idx, stats.low_qc, txb_info);
-      accu_rate += stats.rate_low;
-      accu_dist += stats.dist_low;
-    } else {
-      accu_rate += stats.rate;
-      accu_dist += stats.dist;
-    }
-  }
-
-  int si = init_eob - 2;
-  int8_t has_nz_tail = 0;
-  // eob is not fixed
-  for (; si >= 0 && has_nz_tail < 2; --si) {
-    assert(si != init_eob - 1);
-    const int coeff_idx = scan[si];
-    tran_low_t qc = txb_info->qcoeff[coeff_idx];
-
-    if (qc == 0) {
-      const int coeff_ctx =
-          get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
-                               txb_info->tx_size, tx_class);
-      accu_rate += txb_costs->base_cost[coeff_ctx][0];
-    } else {
-      LevelDownStats stats;
-      get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class);
-      // check if it is better to make this the last significant coefficient
-      int cur_eob_rate =
-          get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class);
-      cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0);
-      prev_eob_rd_cost =
-          RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd;
-      if (cur_eob_rd_cost <= prev_eob_rd_cost) {
-        update = 1;
-        for (int j = si + 1; j < txb_info->eob; j++) {
-          const int coeff_pos_j = scan[j];
-          update_coeff(coeff_pos_j, 0, txb_info);
-        }
-        txb_info->eob = si + 1;
-
-        // rerun cost calculation due to change of eob
-        accu_rate = cur_eob_rate;
-        accu_dist = 0;
-        get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class);
-        if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
-          update = 1;
-          update_coeff(coeff_idx, stats.low_qc, txb_info);
-          accu_rate += stats.rate_low;
-          accu_dist += stats.dist_low;
-        } else {
-          accu_rate += stats.rate;
-          accu_dist += stats.dist;
-        }
-
-        // reset non zero tail when new eob is found
-        has_nz_tail = 0;
-      } else {
-        int bUpdCoeff = 0;
-        if (stats.rd_low < stats.rd) {
-          if ((si < txb_info->eob - 1)) {
-            bUpdCoeff = 1;
-            update = 1;
-          }
-        } else {
-          ++has_nz_tail;
-        }
-
-        if (bUpdCoeff) {
-          update_coeff(coeff_idx, stats.low_qc, txb_info);
-          accu_rate += stats.rate_low;
-          accu_dist += stats.dist_low;
-        } else {
-          accu_rate += stats.rate;
-          accu_dist += stats.dist;
-        }
-      }
-    }
-  }  // for (si)
-
-  // eob is fixed
-  for (; si >= 0; --si) {
-    assert(si != init_eob - 1);
-    const int coeff_idx = scan[si];
-    tran_low_t qc = txb_info->qcoeff[coeff_idx];
-
-    if (qc == 0) {
-      const int coeff_ctx =
-          get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
-                               txb_info->tx_size, tx_class);
-      accu_rate += txb_costs->base_cost[coeff_ctx][0];
-    } else {
-      LevelDownStats stats;
-      get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class);
-
-      int bUpdCoeff = 0;
-      if (stats.rd_low < stats.rd) {
-        if ((si < txb_info->eob - 1)) {
-          bUpdCoeff = 1;
-          update = 1;
-        }
-      }
-      if (bUpdCoeff) {
-        update_coeff(coeff_idx, stats.low_qc, txb_info);
-        accu_rate += stats.rate_low;
-        accu_dist += stats.dist_low;
-      } else {
-        accu_rate += stats.rate;
-        accu_dist += stats.dist;
-      }
-    }
-  }  // for (si)
-
-  int non_zero_blk_rate =
-      txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0];
-  prev_eob_rd_cost =
-      RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist);
-
-  int zero_blk_rate =
-      txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1];
-  int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0);
-  if (zero_blk_rd_cost <= prev_eob_rd_cost) {
-    update = 1;
-    for (int j = 0; j < txb_info->eob; j++) {
-      const int coeff_pos_j = scan[j];
-      update_coeff(coeff_pos_j, 0, txb_info);
-    }
-    txb_info->eob = 0;
-  }
-
-  // record total rate cost
-  *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost
-                   ? zero_blk_rate
-                   : accu_rate + non_zero_blk_rate;
-
-  if (txb_info->eob > 0) {
-    *rate_cost += txb_info->tx_type_cost;
-  }
-
-  return update;
-}
-
-#if CONFIG_HTB_TRELLIS
-static void hbt_init() {
-  hbt_hash_table =
-      aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
-  memset(hbt_hash_table, 0,
-         sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
-  av1_crc32c_calculator_init(&crc_calculator);  // 31 bit: qc & ctx
-
-  hbt_needs_init = 0;
-}
-
-void hbt_destroy() { aom_free(hbt_hash_table); }
-
-static int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
-                         TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                         const LV_MAP_EOB_COST *txb_eob_costs,
-                         const struct macroblock_plane *p, int block,
-                         int fast_mode, int *rate_cost) {
-  (void)fast_mode;
-  const int16_t *scan = txb_info->scan_order->scan;
-  int prev_eob = txb_info->eob;
-  assert(HBT_EOB <= 16);  // Lengthen array if allowing longer eob.
-  int32_t prev_coeff[16];
-  for (int i = 0; i < prev_eob; i++) {
-    prev_coeff[i] = txb_info->qcoeff[scan[i]];
-  }
-  for (int i = prev_eob; i < HBT_EOB; i++) {
-    prev_coeff[i] = 0;  // For compiler piece of mind.
-  }
-
-  av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
-                      txb_info->levels);
-
-  const int update =
-      optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
-
-  // Overwrite old entry
-  uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
-  uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
-  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-      .rate_cost = *rate_cost;
-  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1;
-  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-      .hbt_qc_hash = hbt_qc_hash;
-  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-      .hbt_ctx_hash = hbt_ctx_hash;
-  assert(prev_eob >= txb_info->eob);  // eob can't get longer
-  for (int i = 0; i < txb_info->eob; i++) {
-    // Record how coeff changed. Convention: towards zero is negative.
-    if (txb_info->qcoeff[scan[i]] > 0)
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i];
-    else
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]];
-  }
-  for (int i = txb_info->eob; i < prev_eob; i++) {
-    // If eob got shorter, record that all after it changed to zero.
-    if (prev_coeff[i] > 0)
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .deltas[i] = -prev_coeff[i];
-    else
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .deltas[i] = prev_coeff[i];
-  }
-  for (int i = prev_eob; i < HBT_EOB; i++) {
-    // Record 'no change' after optimized coefficients run out.
-    hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-        .deltas[i] = 0;
-  }
-
-  if (update) {
-    p->eobs[block] = txb_info->eob;
-    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
-        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
-  }
-  return txb_info->eob;
-}
-
-static int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
-                        TxbInfo *txb_info, const struct macroblock_plane *p,
-                        int block, int *rate_cost) {
-  const int16_t *scan = txb_info->scan_order->scan;
-  int new_eob = 0;
-  int update = 0;
-
-  for (int i = 0; i < txb_info->eob; i++) {
-    // Delta convention is negatives go towards zero, so only apply those ones.
-    if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-            .deltas[i] < 0) {
-      if (txb_info->qcoeff[scan[i]] > 0)
-        txb_info->qcoeff[scan[i]] +=
-            hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-                .deltas[i];
-      else
-        txb_info->qcoeff[scan[i]] -=
-            hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-                .deltas[i];
-
-      update = 1;
-      update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info);
-    }
-    if (txb_info->qcoeff[scan[i]]) new_eob = i + 1;
-  }
-
-  // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but
-  // it is expensive and gives little benefit as long as qc_hash is high bit
-  *rate_cost =
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .rate_cost;
-
-  if (update) {
-    txb_info->eob = new_eob;
-    p->eobs[block] = txb_info->eob;
-    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
-        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
-  }
-
-  return txb_info->eob;
-}
-
-static int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
-                            TxbInfo *txb_info,
-                            const LV_MAP_COEFF_COST *txb_costs,
-                            const LV_MAP_EOB_COST *txb_eob_costs,
-                            const struct macroblock_plane *p, int block,
-                            int fast_mode, int *rate_cost) {
-  // Check for qcoeff match
-  int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
-  int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
-
-  if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-              .hbt_qc_hash == hbt_qc_hash &&
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-              .hbt_ctx_hash == hbt_ctx_hash &&
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .init) {
-    return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block,
-                        rate_cost);
-  } else {
-    return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
-                         txb_eob_costs, p, block, fast_mode, rate_cost);
-  }
-}
-
-static int hbt_create_hashes(TxbInfo *txb_info,
-                             const LV_MAP_COEFF_COST *txb_costs,
-                             const LV_MAP_EOB_COST *txb_eob_costs,
-                             const struct macroblock_plane *p, int block,
-                             int fast_mode, int *rate_cost) {
-  // Initialize hash table if needed.
-  if (hbt_needs_init) {
-    hbt_init();
-  }
-
-  //// Hash creation
-  uint8_t txb_hash_data[256];  // Asserts below to ensure enough space.
-  const int16_t *scan = txb_info->scan_order->scan;
-  uint8_t chunk = 0;
-  int hash_data_index = 0;
-
-  // Make qc_hash.
-  int packing_index = 0;  // needed for packing.
-  for (int i = 0; i < txb_info->eob; i++) {
-    tran_low_t prechunk = txb_info->qcoeff[scan[i]];
-
-    // Softening: Improves speed. Aligns with signed deltas.
-    if (prechunk < 0) prechunk *= -1;
-
-    // Early kick out: Don't apply feature if there are large coeffs:
-    // If this kickout value is removed or raised beyond int8_t,
-    // widen deltas type in OptTxbQcoeff struct.
-    assert((int8_t)HBT_KICKOUT == HBT_KICKOUT);  // If not, widen types.
-    if (prechunk > HBT_KICKOUT) {
-      av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
-                          txb_info->levels);
-
-      const int update =
-          optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
-
-      if (update) {
-        p->eobs[block] = txb_info->eob;
-        p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
-            txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
-      }
-      return txb_info->eob;
-    }
-
-    // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes
-    if (packing_index == 0) txb_hash_data[hash_data_index] = 0;
-    chunk = prechunk << packing_index;
-    packing_index += 2;
-    txb_hash_data[hash_data_index] |= chunk;
-
-    // Full byte:
-    if (packing_index == 8) {
-      packing_index = 0;
-      hash_data_index++;
-    }
-  }
-  // Needed when packing_index != 0, to include final byte.
-  hash_data_index++;
-  assert(hash_data_index <= 64);
-  // 31 bit qc_hash: index to array
-  uint32_t hbt_qc_hash =
-      av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
-
-  // Make ctx_hash.
-  hash_data_index = 0;
-  tran_low_t prechunk;
-
-  for (int i = 0; i < txb_info->eob; i++) {
-    // Save as magnitudes towards or away from zero.
-    if (txb_info->tcoeff[scan[i]] >= 0)
-      prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]];
-    else
-      prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]];
-
-    chunk = prechunk & 0xff;
-    txb_hash_data[hash_data_index++] = chunk;
-  }
-
-  // Extra ctx data:
-  // Include dequants.
-  txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff;
-  txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff;
-  chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  // eob
-  chunk = txb_info->eob & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  // rdmult (int64)
-  chunk = txb_info->rdmult & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  // tx_type
-  chunk = txb_info->tx_type & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  // base_eob_cost
-  for (int i = 1; i < 3; i++) {  // i = 0 are softened away
-    for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) {
-      chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8;
-      txb_hash_data[hash_data_index++] = chunk;
-    }
-  }
-  // eob_cost
-  for (int i = 0; i < 11; i++) {
-    for (int j = 0; j < 2; j++) {
-      chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8;
-      txb_hash_data[hash_data_index++] = chunk;
-    }
-  }
-  // dc_sign_cost
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < DC_SIGN_CONTEXTS; j++) {
-      chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8;
-      txb_hash_data[hash_data_index++] = chunk;
-    }
-  }
-
-  assert(hash_data_index <= 256);
-  // 31 bit ctx_hash: used to index table
-  uint32_t hbt_ctx_hash =
-      av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
-  //// End hash creation
-
-  return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
-                          txb_eob_costs, p, block, fast_mode, rate_cost);
-}
-#endif  // CONFIG_HTB_TRELLIS
-
 static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
     int ci, tran_low_t abs_qc, int coeff_ctx,
     const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
@@ -1741,9 +1158,8 @@
 int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                          int block, TX_SIZE tx_size, TX_TYPE tx_type,
                          const TXB_CTX *const txb_ctx, int *rate_cost,
-                         int sharpness, int fast_mode) {
+                         int sharpness) {
   MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *pd = &xd->plane[plane];
   const struct macroblock_plane *p = &x->plane[plane];
   const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
   const int16_t *scan = scan_order->scan;
@@ -1754,21 +1170,13 @@
       av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
   const int block_offset = BLOCK_OFFSET(block);
   tran_low_t *qcoeff = p->qcoeff + block_offset;
-  tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
+  tran_low_t *dqcoeff = p->dqcoeff + block_offset;
   const tran_low_t *tcoeff = p->coeff + block_offset;
+  const CoeffCosts *coeff_costs = &x->coeff_costs;
 
   // This function is not called if eob = 0.
   assert(eob > 0);
 
-  if (fast_mode) {
-    update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff);
-    p->eobs[block] = eob;
-    if (eob == 0) {
-      *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
-      return eob;
-    }
-  }
-
   const AV1_COMMON *cm = &cpi->common;
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
@@ -1779,18 +1187,19 @@
   const int height = get_txb_high(tx_size);
   assert(width == (1 << bwl));
   const int is_inter = is_inter_block(mbmi);
-  const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
+  const LV_MAP_COEFF_COST *txb_costs =
+      &coeff_costs->coeff_costs[txs_ctx][plane_type];
   const int eob_multi_size = txsize_log2_minus4[tx_size];
   const LV_MAP_EOB_COST *txb_eob_costs =
-      &x->eob_costs[eob_multi_size][plane_type];
+      &coeff_costs->eob_costs[eob_multi_size][plane_type];
 
   const int rshift =
       (sharpness +
-       (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
+       (cpi->oxcf.q_cfg.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
             ? 7 - mbmi->segment_id
             : 2) +
-       (cpi->oxcf.aq_mode != VARIANCE_AQ &&
-                cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL &&
+       (cpi->oxcf.q_cfg.aq_mode != VARIANCE_AQ &&
+                cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL &&
                 cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0
             ? (3 - x->sb_energy_level)
             : 0));
@@ -1842,7 +1251,7 @@
 
 #define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
   case tx_class_literal:                                                   \
-    for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) {          \
+    for (; si >= 0 && nz_num <= max_nz_num; --si) {                        \
       update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
                        tx_size, tx_class_literal, bwl, height,             \
                        txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
@@ -1904,83 +1313,8 @@
   return eob;
 }
 
-// This function is deprecated, but we keep it here because hash trellis
-// is not integrated with av1_optimize_txb_new yet
-int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                     int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                     TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
-  const AV1_COMMON *cm = &cpi->common;
-  const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
-                                          tx_size, reduced_tx_set_used);
-  const MB_MODE_INFO *mbmi = xd->mi[0];
-  const struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  const int eob = p->eobs[block];
-  const int block_offset = BLOCK_OFFSET(block);
-  tran_low_t *qcoeff = p->qcoeff + block_offset;
-  tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
-  const tran_low_t *tcoeff = p->coeff + block_offset;
-  const int16_t *dequant = p->dequant_QTX;
-  const int seg_eob = av1_get_max_eob(tx_size);
-  const int bwl = get_txb_bwl(tx_size);
-  const int width = get_txb_wide(tx_size);
-  const int height = get_txb_high(tx_size);
-  const int is_inter = is_inter_block(mbmi);
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
-  const int eob_multi_size = txsize_log2_minus4[tx_size];
-  const LV_MAP_EOB_COST txb_eob_costs =
-      x->eob_costs[eob_multi_size][plane_type];
-
-  const int shift = av1_get_tx_scale(tx_size);
-  const int64_t rdmult =
-      (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type]
-        << (2 * (xd->bd - 8))) +
-       2) >>
-      2;
-  uint8_t levels_buf[TX_PAD_2D];
-  uint8_t *const levels = set_levels(levels_buf, width);
-  const qm_val_t *iqmatrix =
-      av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
-  assert(width == (1 << bwl));
-  const int tx_type_cost =
-      get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
-  TxbInfo txb_info = {
-    qcoeff,     levels,  dqcoeff, tcoeff,   dequant,      shift, tx_size,
-    txs_ctx,    tx_type, bwl,     width,    height,       eob,   seg_eob,
-    scan_order, txb_ctx, rdmult,  iqmatrix, tx_type_cost,
-  };
-
-#if CONFIG_HTB_TRELLIS
-  // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
-  // by storing the coefficient deltas in a hash table.
-  // Currently disabled in speedfeatures.c
-  if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) {
-    return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block,
-                             fast_mode, rate_cost);
-  }
-#else
-  (void)fast_mode;
-#endif  // CONFIG_HTB_TRELLIS
-  av1_txb_init_levels(qcoeff, width, height, levels);
-
-  const int update =
-      optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost);
-
-  if (update) {
-    p->eobs[block] = txb_info.eob;
-    p->txb_entropy_ctx[block] =
-        av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob);
-  }
-  return txb_info.eob;
-}
-
-int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
-                                const SCAN_ORDER *scan_order, int eob) {
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+                                    const SCAN_ORDER *scan_order, int eob) {
   const int16_t *const scan = scan_order->scan;
   int cul_level = 0;
   int c;
@@ -1994,7 +1328,7 @@
   cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
   set_dc_sign(&cul_level, qcoeff[0]);
 
-  return cul_level;
+  return (uint8_t)cul_level;
 }
 
 static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
@@ -2015,22 +1349,22 @@
   const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col,
                                           tx_size, reduced_tx_set_used);
   if (is_inter) {
-    if (cpi->oxcf.use_inter_dct_only) {
+    if (cpi->oxcf.txfm_cfg.use_inter_dct_only) {
       assert(tx_type == DCT_DCT);
     }
   } else {
-    if (cpi->oxcf.use_intra_dct_only) {
+    if (cpi->oxcf.txfm_cfg.use_intra_dct_only) {
       assert(tx_type == DCT_DCT);
-    } else if (cpi->oxcf.use_intra_default_tx_only) {
+    } else if (cpi->oxcf.txfm_cfg.use_intra_default_tx_only) {
       const TX_TYPE default_type = get_default_tx_type(
-          PLANE_TYPE_Y, xd, tx_size, cpi->is_screen_content_type);
+          PLANE_TYPE_Y, xd, tx_size, cpi->use_screen_content_tools);
       (void)default_type;
       assert(tx_type == default_type);
     }
   }
 
   if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
-      cm->quant_params.base_qindex > 0 && !mbmi->skip &&
+      cm->quant_params.base_qindex > 0 && !mbmi->skip_txfm &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
     if (eset > 0) {
@@ -2111,8 +1445,8 @@
     }
 
     CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
-    const int txb_offset =
-        x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+    const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+                           (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
     uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
     uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
     entropy_ctx[block] = txb_ctx.txb_skip_ctx;
@@ -2126,7 +1460,7 @@
     const int segment_id = mbmi->segment_id;
     const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
     tran_low_t *tcoeff_txb =
-        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
+        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
     tcoeff = tcoeff_txb + block_offset;
     memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
 
@@ -2220,21 +1554,22 @@
   } else {
     tcoeff = qcoeff;
   }
-  const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+  const uint8_t cul_level =
+      av1_get_txb_entropy_context(tcoeff, scan_order, eob);
   av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
                            blk_col, blk_row);
 }
 
-void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
-                            RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                            uint8_t allow_update_cdf) {
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+                                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                                     uint8_t allow_update_cdf) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
-  if (mbmi->skip) {
+  if (mbmi->skip_txfm) {
     av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h
index 7122895..2cd3160 100644
--- a/av1/encoder/encodetxb.h
+++ b/av1/encoder/encodetxb.h
@@ -24,6 +24,7 @@
 extern "C" {
 #endif
 
+/*!\cond */
 #define TXB_SKIP_CTX_MASK 15
 #define DC_SIGN_CTX_SHIFT 4
 #define DC_SIGN_CTX_MASK 3
@@ -50,49 +51,341 @@
   int tx_type_cost;
 } TxbInfo;
 
+/*!\endcond */
+/*!\brief Allocate the memory resources for all the macro blocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * Each macro block will need a \ref CB_COEFF_BUFFER to store information for
+ * rate-distortion optimization and entropy coding of transform coefficients.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ */
 void av1_alloc_txb_buf(AV1_COMP *cpi);
+/*!\brief Free the memory resources for all the macro blocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * See \ref av1_alloc_txb_buf and \ref CB_COEFF_BUFFER for more details.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ */
 void av1_free_txb_buf(AV1_COMP *cpi);
+/*!\brief Compute the entropy cost of coding coefficients in a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in]    x                    Pointer to structure holding the data for
+ the current encoding macroblock.
+ * \param[in]    plane                The index of the current plane.
+ * \param[in]    block                The index of the current transform block
+ in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block.
+ * \param[in]    tx_size              The transform size.
+ * \param[in]    tx_type              The transform type.
+ * \param[in]    txb_ctx              Context info for entropy coding transform
+ block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in]    reduced_tx_set_used  Whether the transform type is chosen from
+ * a reduced set.
+ */
 int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
                         const TX_SIZE tx_size, const TX_TYPE tx_type,
                         const TXB_CTX *const txb_ctx, int reduced_tx_set_used);
+
+/*!\brief Estimate the entropy cost of coding a transform block using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function compute the entropy costs of the end of block position (eob)
+ * and the transform type (tx_type) precisely.
+ *
+ * Then using \ref av1_cost_coeffs_txb_estimate to estimate the entropy costs
+ * of coefficients in the transform block.
+ *
+ * In the end, the function returns the sum of entropy costs of end of block
+ * position (eob), transform type (tx_type) and coefficients.
+ *
+ * Compared to \ref av1_cost_coeffs_txb, this function is much faster but less
+ * accurate.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \param[in]    txb_ctx        Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in]    reduced_tx_set_used  Whether the transform type is chosen from
+ * a reduced set.
+ * \param[in]    adjust_eob     Whether to adjust the end of block position
+ (eob)
+ * or not.
+ * \return       int            Estimated entropy cost of coding the transform
+ block.
+ */
 int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
                                   const int block, const TX_SIZE tx_size,
                                   const TX_TYPE tx_type,
                                   const TXB_CTX *const txb_ctx,
                                   const int reduced_tx_set_used,
                                   const int adjust_eob);
+
+/*!\brief Estimate the entropy cost of transform coefficients using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function assumes each transform coefficient is of its own Laplacian
+ * distribution and the coefficient is the only observation of the Laplacian
+ * distribution.
+ *
+ * Based on that, each coefficient's coding cost can be estimated by computing
+ * the entropy of the corresponding Laplacian distribution.
+ *
+ * This function then return the sum of the estimated entropy cost for all
+ * coefficients in the transform block.
+ *
+ * Note that the entropy cost of end of block (eob) and transform type (tx_type)
+ * are not included.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \return       int            Estimated entropy cost of coefficients in the
+ * transform block.
+ */
 int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
                                  const int block, const TX_SIZE tx_size,
                                  const TX_TYPE tx_type);
+
+/*!\brief Write quantized coefficients in a transform block into bitstream using
+ * entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will write the quantized coefficients in a transform block into
+ * the bitstream using entropy coding.
+ *
+ * The coding steps are as follows.
+ *
+ * 1) Code the end of block position "eob", which is the scan index of the
+ * last non-zero coefficient plus one.
+ *
+ * 2) Code the lower magnitude level (<= COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ * for each coefficient in reversed scan order.
+ *
+ * 3) Code the sign and higher magnitude level
+ * (> COEFF_BASE_RANGE + NUM_BASE_LEVELS) in forward scan order.
+ *
+ * \param[in]    cm             Top-level structure shared by encoder and
+ * decoder
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    w              Entropy coding write pointer
+ * \param[in]    blk_row      The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in]    blk_col      The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block
+ * \param[in]    tx_size        The given transform size
+ */
 void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
                           aom_writer *w, int blk_row, int blk_col, int plane,
                           int block, TX_SIZE tx_size);
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
-                         aom_writer *w, BLOCK_SIZE bsize);
-int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
-                                const SCAN_ORDER *scan_order, int eob);
-void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
-                            RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                            uint8_t allow_update_cdf);
+
+/*!\brief Write quantized coefficients of all transform blocks in an intra
+ * macroblock into the bitstream using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * All transform blocks in the intra macroblock share the same transform size.
+ *
+ * This function use \ref av1_write_coeffs_txb() to code each transform block in
+ * raster order.
+ *
+ * \param[in]    cm             Top-level structure shared by encoder and
+ * decoder
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    w              Entropy coding write pointer
+ * \param[in]    bsize          Block size of the current macroblock
+ */
+
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                               aom_writer *w, BLOCK_SIZE bsize);
+
+/*!\brief Pack the context info of the current transform block into an uint8_t.
+ * \ingroup coefficient_coding
+ *
+ * This context info will be collected and consolidated by its neighbor
+ * transform blocks for coding transform block skip flag (tx_skip) and
+ * the sign of DC coefficient (dc_sign).
+ *
+ * \param[in]    qcoeff         Buffer of quantized coefficients
+ * \param[in]    scan_order     Coding order of coefficients in the transform
+ * block
+ * \param[in]    eob            The scan index of last non-zero coefficient plus
+ * one
+ */
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+                                    const SCAN_ORDER *scan_order, int eob);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for all transform blocks in the intra macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will go through each transform block in the intra macorblock
+ * and call \ref av1_update_and_record_txb_context to update the probability
+ * model and entropy context properly.
+ *
+ * \param[in]    cpi               Top-level encoder structure
+ * \param[in]    td                Top-level multithreading structure
+ * \param[in]    dry_run           Whether this is a dry run.
+ * \param[in]    bsize             Block size of the current macroblock
+ * \param[in]    allow_update_cdf  Allowed to update probability model (cdf) or
+ * not.
+ */
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+                                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                                     uint8_t allow_update_cdf);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * There are regular mode and dry run for this funtion.
+ *
+ * Regular mode:
+ *
+ * The probability model (cdf) for each coding symbol in the
+ * transform block will be updated.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in]    plane        The index of the current plane.
+ * \param[in]    block        The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block.
+ * \param[in]    blk_row      The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in]    blk_col      The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in]    plane_bsize  Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in]    tx_size      The given transform size.
+ * \param[in]    arg          This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
 void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                                        int blk_col, BLOCK_SIZE plane_bsize,
                                        TX_SIZE tx_size, void *arg);
-#if CONFIG_HTB_TRELLIS
-void hbt_destroy();
-#endif  // CONFIG_HTB_TRELLIS
+
+/*!\brief Adjust the magnitude of quantized coefficients to achieve better
+ * rate-distortion (RD) trade-off.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function goes through each coefficient and greedily choose to lower
+ * the coefficient magnitude by 1 or not based on the RD score.
+ *
+ * The coefficients are processing in reversed scan order.
+ *
+ * Note that, the end of block position (eob) may change if the original last
+ * coefficient is lowered to zero.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \param[in]    txb_ctx        Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[out]   rate_cost      The entropy cost of coding the transform block
+ * after adjustment of coefficients.
+ * \param[in]    sharpness      When sharpness == 1, the function will be less
+ * aggressive toward lowering the magnitude of coefficients.
+ * In this way, the transform block will contain more high-frequency
+ coefficients
+ * and therefore preserve the sharpness of the reconstructed block.
+ */
 int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                          int block, TX_SIZE tx_size, TX_TYPE tx_type,
                          const TXB_CTX *const txb_ctx, int *rate_cost,
-                         int sharpness, int fast_mode);
+                         int sharpness);
 
+/*!\brief Get the corresponding \ref CB_COEFF_BUFFER of the current macro block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * The macroblock's location is described by mi_row and mi_col, row and column
+ * mi indexes in the coding frame.
+ *
+ * Each mi unit is a 4x4 pixel block.
+ *
+ * \param[in]    cpi               Top-level encoder structure.
+ * \param[in]    mi_row            Row mi index of the current transform block
+ * in the frame.
+ * \param[in]    mi_col           Column mi index of the current transform
+ * block in the frame.
+ * \return       CB_COEFF_BUFFER*  Pointer of \ref CB_COEFF_BUFFER associated
+ * to this macroblock.
+ */
 CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
                                          int mi_col);
 
+/*!\brief Returns the entropy cost associated with skipping the current
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in]    coeff_costs    Table of entropy cost for coefficient coding.
+ * \param[in]    txb_ctx        Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    tx_size        The transform size
+ */
+static INLINE int av1_cost_skip_txb(const CoeffCosts *coeff_costs,
+                                    const TXB_CTX *const txb_ctx, int plane,
+                                    TX_SIZE tx_size) {
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs_ =
+      &coeff_costs->coeff_costs[txs_ctx][plane_type];
+  return coeff_costs_->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+}
+
+/*!\cond */
 // These numbers are empirically obtained.
 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
   { 17, 13 },
   { 16, 10 },
 };
+/*!\endcond */
 
 #ifdef __cplusplus
 }
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 693270b..bf22743 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -9,12 +9,21 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/encoder/av1_multi_thread.h"
+#include "av1/common/warped_motion.h"
+
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
 #include "av1/encoder/ethread.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/firstpass.h"
+#endif
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/global_motion_facade.h"
 #include "av1/encoder/rdopt.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
 
 static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   for (int i = 0; i < REFERENCE_MODES; i++)
@@ -63,7 +72,7 @@
           const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
           MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
           MB_MODE_INFO *mbmi = mi[0];
-          if (mbmi->skip == 1 && (mbmi->sb_type == cm->seq_params.sb_size)) {
+          if (mbmi->skip_txfm == 1 && (mbmi->bsize == cm->seq_params.sb_size)) {
             for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
               mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
             mbmi->delta_lf_from_base = xd->delta_lf_from_base;
@@ -81,16 +90,16 @@
   }
 }
 
-void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync,
-                                int r, int c) {
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                int c) {
   (void)row_mt_sync;
   (void)r;
   (void)c;
   return;
 }
 
-void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync,
-                                 int r, int c, const int cols) {
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                 int c, int cols) {
   (void)row_mt_sync;
   (void)r;
   (void)c;
@@ -98,7 +107,7 @@
   return;
 }
 
-void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c) {
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) {
 #if CONFIG_MULTITHREAD
   const int nsync = row_mt_sync->sync_range;
 
@@ -106,7 +115,7 @@
     pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
     pthread_mutex_lock(mutex);
 
-    while (c > row_mt_sync->cur_col[r - 1] - nsync) {
+    while (c > row_mt_sync->num_finished_cols[r - 1] - nsync) {
       pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
     }
     pthread_mutex_unlock(mutex);
@@ -118,8 +127,8 @@
 #endif  // CONFIG_MULTITHREAD
 }
 
-void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c,
-                           const int cols) {
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+                           int cols) {
 #if CONFIG_MULTITHREAD
   const int nsync = row_mt_sync->sync_range;
   int cur;
@@ -136,7 +145,7 @@
   if (sig) {
     pthread_mutex_lock(&row_mt_sync->mutex_[r]);
 
-    row_mt_sync->cur_col[r] = cur;
+    row_mt_sync->num_finished_cols[r] = cur;
 
     pthread_cond_signal(&row_mt_sync->cond_[r]);
     pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
@@ -150,40 +159,38 @@
 }
 
 // Allocate memory for row synchronization
-void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, AV1_COMMON *cm,
-                               int rows) {
-  row_mt_sync->rows = rows;
+static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync,
+                                  AV1_COMMON *cm, int rows) {
 #if CONFIG_MULTITHREAD
-  {
-    int i;
+  int i;
 
-    CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
-                    aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
-    if (row_mt_sync->mutex_) {
-      for (i = 0; i < rows; ++i) {
-        pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
-      }
+  CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+                  aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+  if (row_mt_sync->mutex_) {
+    for (i = 0; i < rows; ++i) {
+      pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
     }
+  }
 
-    CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
-                    aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
-    if (row_mt_sync->cond_) {
-      for (i = 0; i < rows; ++i) {
-        pthread_cond_init(&row_mt_sync->cond_[i], NULL);
-      }
+  CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+                  aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
+  if (row_mt_sync->cond_) {
+    for (i = 0; i < rows; ++i) {
+      pthread_cond_init(&row_mt_sync->cond_[i], NULL);
     }
   }
 #endif  // CONFIG_MULTITHREAD
 
-  CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
-                  aom_malloc(sizeof(*row_mt_sync->cur_col) * rows));
+  CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols,
+                  aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows));
 
+  row_mt_sync->rows = rows;
   // Set up nsync.
   row_mt_sync->sync_range = 1;
 }
 
 // Deallocate row based multi-threading synchronization related mutex and data
-void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync) {
+static void row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
   if (row_mt_sync != NULL) {
 #if CONFIG_MULTITHREAD
     int i;
@@ -201,7 +208,8 @@
       aom_free(row_mt_sync->cond_);
     }
 #endif  // CONFIG_MULTITHREAD
-    aom_free(row_mt_sync->cur_col);
+    aom_free(row_mt_sync->num_finished_cols);
+
     // clear the structure as the source of this call may be dynamic change
     // in tiles in which case this call will be followed by an _alloc()
     // which may fail.
@@ -209,37 +217,89 @@
   }
 }
 
-static AOM_INLINE void assign_tile_to_thread(
-    MultiThreadHandle *multi_thread_ctxt, int num_tiles, int num_workers) {
+static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
+                             int alloc_row_ctx) {
+  struct AV1Common *cm = &cpi->common;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int tile_col, tile_row;
+
+  // Allocate memory for row based multi-threading
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      int tile_index = tile_row * tile_cols + tile_col;
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+      row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows);
+
+      this_tile->row_ctx = NULL;
+      if (alloc_row_ctx) {
+        assert(max_cols > 0);
+        const int num_row_ctx = AOMMAX(1, (max_cols - 1));
+        CHECK_MEM_ERROR(cm, this_tile->row_ctx,
+                        (FRAME_CONTEXT *)aom_memalign(
+                            16, num_row_ctx * sizeof(*this_tile->row_ctx)));
+      }
+    }
+  }
+  enc_row_mt->allocated_tile_cols = tile_cols;
+  enc_row_mt->allocated_tile_rows = tile_rows;
+  enc_row_mt->allocated_rows = max_rows;
+  enc_row_mt->allocated_cols = max_cols - 1;
+}
+
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int tile_cols = enc_row_mt->allocated_tile_cols;
+  const int tile_rows = enc_row_mt->allocated_tile_rows;
+  int tile_col, tile_row;
+
+  // Free row based multi-threading sync memory
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      int tile_index = tile_row * tile_cols + tile_col;
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+      row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+
+      if (cpi->oxcf.algo_cfg.cdf_update_mode) aom_free(this_tile->row_ctx);
+    }
+  }
+  enc_row_mt->allocated_rows = 0;
+  enc_row_mt->allocated_cols = 0;
+  enc_row_mt->allocated_tile_cols = 0;
+  enc_row_mt->allocated_tile_rows = 0;
+}
+
+static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id,
+                                             int num_tiles, int num_workers) {
   int tile_id = 0;
   int i;
 
   for (i = 0; i < num_workers; i++) {
-    multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++;
+    thread_id_to_tile_id[i] = tile_id++;
     if (tile_id == num_tiles) tile_id = 0;
   }
 }
 
-static int get_next_job(AV1_COMP *const cpi, int *current_mi_row,
-                        int cur_tile_id) {
-  AV1_COMMON *const cm = &cpi->common;
-  TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
-  AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info;
+static AOM_INLINE int get_next_job(TileDataEnc *const tile_data,
+                                   int *current_mi_row, int mib_size) {
+  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+  const int mi_row_end = tile_data->tile_info.mi_row_end;
 
-  if (row_mt_info->current_mi_row < this_tile->tile_info.mi_row_end) {
-    *current_mi_row = row_mt_info->current_mi_row;
-    row_mt_info->num_threads_working++;
-    row_mt_info->current_mi_row += cm->seq_params.mib_size;
+  if (row_mt_sync->next_mi_row < mi_row_end) {
+    *current_mi_row = row_mt_sync->next_mi_row;
+    row_mt_sync->num_threads_working++;
+    row_mt_sync->next_mi_row += mib_size;
     return 1;
   }
   return 0;
 }
 
-static AOM_INLINE void switch_tile_and_get_next_job(AV1_COMP *const cpi,
-                                                    int *cur_tile_id,
-                                                    int *current_mi_row,
-                                                    int *end_of_frame) {
-  AV1_COMMON *const cm = &cpi->common;
+static AOM_INLINE void switch_tile_and_get_next_job(
+    AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id,
+    int *current_mi_row, int *end_of_frame, int is_firstpass) {
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
 
@@ -250,18 +310,29 @@
   for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
       int tile_index = tile_row * tile_cols + tile_col;
-      TileDataEnc *this_tile = &cpi->tile_data[tile_index];
-      AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info;
-      int num_sb_rows_in_tile =
+      TileDataEnc *const this_tile = &tile_data[tile_index];
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+#if CONFIG_REALTIME_ONLY
+      int num_b_rows_in_tile =
           av1_get_sb_rows_in_tile(cm, this_tile->tile_info);
-      int num_sb_cols_in_tile =
+      int num_b_cols_in_tile =
           av1_get_sb_cols_in_tile(cm, this_tile->tile_info);
+#else
+      int num_b_rows_in_tile =
+          is_firstpass ? av1_get_mb_rows_in_tile(this_tile->tile_info)
+                       : av1_get_sb_rows_in_tile(cm, this_tile->tile_info);
+      int num_b_cols_in_tile =
+          is_firstpass ? av1_get_mb_cols_in_tile(this_tile->tile_info)
+                       : av1_get_sb_cols_in_tile(cm, this_tile->tile_info);
+#endif
       int theoretical_limit_on_threads =
-          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
-      int num_threads_working = row_mt_info->num_threads_working;
+          AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile);
+      int num_threads_working = row_mt_sync->num_threads_working;
+
       if (num_threads_working < theoretical_limit_on_threads) {
         int num_mis_to_encode =
-            this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row;
+            this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row;
 
         // Tile to be processed by this thread is selected on the basis of
         // availability of jobs:
@@ -287,21 +358,25 @@
   if (tile_id == -1) {
     *end_of_frame = 1;
   } else {
-    // Update the cur ID to the next tile ID that will be processed,
-    // which will be the least processed tile
+    // Update the current tile id to the tile id that will be processed next,
+    // which will be the least processed tile.
     *cur_tile_id = tile_id;
-    get_next_job(cpi, current_mi_row, *cur_tile_id);
+    get_next_job(&tile_data[tile_id], current_mi_row,
+                 is_firstpass ? FP_MIB_SIZE : cm->seq_params.mib_size);
   }
 }
 
-static int enc_row_mt_worker_hook(void *arg1, void *unused) {
+#if !CONFIG_REALTIME_ONLY
+static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
   EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   AV1_COMP *const cpi = thread_data->cpi;
   AV1_COMMON *const cm = &cpi->common;
-
-  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
   int thread_id = thread_data->thread_id;
-  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
   (void)unused;
 
   assert(cur_tile_id != -1);
@@ -310,33 +385,88 @@
   while (1) {
     int current_mi_row = -1;
 #if CONFIG_MULTITHREAD
-    pthread_mutex_lock(cpi->row_mt_mutex_);
+    pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
-    if (!get_next_job(cpi, &current_mi_row, cur_tile_id)) {
+    if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+                      FP_MIB_SIZE)) {
       // No jobs are available for the current tile. Query for the status of
       // other tiles and get the next job if available
-      switch_tile_and_get_next_job(cpi, &cur_tile_id, &current_mi_row,
-                                   &end_of_frame);
+      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+                                   &current_mi_row, &end_of_frame, 1);
     }
 #if CONFIG_MULTITHREAD
-    pthread_mutex_unlock(cpi->row_mt_mutex_);
+    pthread_mutex_unlock(enc_row_mt_mutex_);
 #endif
     if (end_of_frame == 1) break;
 
     TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
-    int tile_row = this_tile->tile_info.tile_row;
-    int tile_col = this_tile->tile_info.tile_col;
+    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+    ThreadData *td = thread_data->td;
 
     assert(current_mi_row != -1 &&
            current_mi_row <= this_tile->tile_info.mi_row_end);
 
+    av1_first_pass_row(cpi, td, this_tile, current_mi_row >> FP_MIB_SIZE_LOG2);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+    row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+  }
+
+  return 1;
+}
+#endif
+
+static int enc_row_mt_worker_hook(void *arg1, void *unused) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *const cpi = thread_data->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  int thread_id = thread_data->thread_id;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+  (void)unused;
+
+  assert(cur_tile_id != -1);
+
+  int end_of_frame = 0;
+  while (1) {
+    int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+    if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+                      cm->seq_params.mib_size)) {
+      // No jobs are available for the current tile. Query for the status of
+      // other tiles and get the next job if available
+      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+                                   &current_mi_row, &end_of_frame, 0);
+    }
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    if (end_of_frame == 1) break;
+
+    TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+    const TileInfo *const tile_info = &this_tile->tile_info;
+    const int tile_row = tile_info->tile_row;
+    const int tile_col = tile_info->tile_col;
     ThreadData *td = thread_data->td;
 
+    assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);
+
     td->mb.e_mbd.tile_ctx = td->tctx;
     td->mb.tile_pb_ctx = &this_tile->tctx;
+
     if (this_tile->allow_update_cdf) {
       td->mb.row_ctx = this_tile->row_ctx;
-      if (current_mi_row == this_tile->tile_info.mi_row_start)
+      if (current_mi_row == tile_info->mi_row_start)
         memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
     } else {
       memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
@@ -346,15 +476,16 @@
                            &td->mb.e_mbd);
 
     cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
-    av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
+    av1_crc32c_calculator_init(
+        &td->mb.txfm_search_info.mb_rd_record.crc_calculator);
 
     av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
 #if CONFIG_MULTITHREAD
-    pthread_mutex_lock(cpi->row_mt_mutex_);
+    pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
-    this_tile->row_mt_info.num_threads_working--;
+    row_mt_sync->num_threads_working--;
 #if CONFIG_MULTITHREAD
-    pthread_mutex_unlock(cpi->row_mt_mutex_);
+    pthread_mutex_unlock(enc_row_mt_mutex_);
 #endif
   }
 
@@ -372,7 +503,7 @@
   (void)unused;
 
   for (t = thread_data->start; t < tile_rows * tile_cols;
-       t += cpi->num_workers) {
+       t += cpi->mt_info.num_workers) {
     int tile_row = t / tile_cols;
     int tile_col = t % tile_cols;
 
@@ -389,58 +520,49 @@
 static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
   AV1_COMMON *const cm = &cpi->common;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  int sb_mi_size = av1_get_sb_mi_size(cm);
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
 
-  CHECK_MEM_ERROR(cm, cpi->workers,
-                  aom_malloc(num_workers * sizeof(*cpi->workers)));
-
-  CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
-                  aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+  assert(mt_info->workers != NULL);
+  assert(mt_info->tile_thr_data != NULL);
 
 #if CONFIG_MULTITHREAD
   if (cpi->oxcf.row_mt == 1) {
-    if (cpi->row_mt_mutex_ == NULL) {
-      CHECK_MEM_ERROR(cm, cpi->row_mt_mutex_,
-                      aom_malloc(sizeof(*(cpi->row_mt_mutex_))));
-      if (cpi->row_mt_mutex_) pthread_mutex_init(cpi->row_mt_mutex_, NULL);
+    AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
+    if (enc_row_mt->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
+                      aom_malloc(sizeof(*(enc_row_mt->mutex_))));
+      if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
     }
   }
+  AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
+  if (gm_sync->mutex_ == NULL) {
+    CHECK_MEM_ERROR(cm, gm_sync->mutex_,
+                    aom_malloc(sizeof(*(gm_sync->mutex_))));
+    if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
+  }
+  AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
+  if (tf_sync->mutex_ == NULL) {
+    CHECK_MEM_ERROR(cm, tf_sync->mutex_, aom_malloc(sizeof(*tf_sync->mutex_)));
+    if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
+  }
 #endif
 
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
 
-    ++cpi->num_workers;
-    winterface->init(worker);
-    worker->thread_name = "aom enc worker";
+    ++mt_info->num_enc_workers;
 
     thread_data->cpi = cpi;
     thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
 
     if (i > 0) {
-      // Allocate thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td,
-                      aom_memalign(32, sizeof(*thread_data->td)));
-      av1_zero(*thread_data->td);
+      // Set up sms_tree.
+      av1_setup_sms_tree(cpi, thread_data->td);
 
-      // Set up pc_tree.
-      thread_data->td->pc_tree = NULL;
-      av1_setup_pc_tree(cpi, thread_data->td);
-
-      CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
-                      (uint8_t *)aom_memalign(
-                          16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                  sizeof(*thread_data->td->above_pred_buf)));
-      CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
-                      (uint8_t *)aom_memalign(
-                          16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                  sizeof(*thread_data->td->left_pred_buf)));
-
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->wsrc_buf,
-          (int32_t *)aom_memalign(
-              16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
+      alloc_obmc_buffers(&thread_data->td->obmc_buffer, cm);
 
       CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info,
                       (InterModesInfo *)aom_malloc(
@@ -454,10 +576,6 @@
                   AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
                   sizeof(*thread_data->td->hash_value_buffer[0][0])));
 
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->mask_buf,
-          (int32_t *)aom_memalign(
-              16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
       // Allocate frame counters in thread data.
       CHECK_MEM_ERROR(cm, thread_data->td->counts,
                       aom_calloc(1, sizeof(*thread_data->td->counts)));
@@ -467,7 +585,7 @@
           cm, thread_data->td->palette_buffer,
           aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
 
-      av1_alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer);
+      alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer);
 
       CHECK_MEM_ERROR(
           cm, thread_data->td->tmp_conv_dst,
@@ -475,15 +593,11 @@
                                sizeof(*thread_data->td->tmp_conv_dst)));
       for (int j = 0; j < 2; ++j) {
         CHECK_MEM_ERROR(
-            cm, thread_data->td->tmp_obmc_bufs[j],
+            cm, thread_data->td->tmp_pred_bufs[j],
             aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                 sizeof(*thread_data->td->tmp_obmc_bufs[j])));
+                                 sizeof(*thread_data->td->tmp_pred_bufs[j])));
       }
 
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->mbmi_ext,
-          aom_calloc(sb_mi_size, sizeof(*thread_data->td->mbmi_ext)));
-
       if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
         const int num_64x64_blocks =
             (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
@@ -508,16 +622,89 @@
   }
 }
 
-static AOM_INLINE void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
+void av1_create_workers(AV1_COMP *cpi, int num_workers) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  // Encode a frame
-  for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
 
+  CHECK_MEM_ERROR(cm, mt_info->workers,
+                  aom_malloc(num_workers * sizeof(*mt_info->workers)));
+
+  CHECK_MEM_ERROR(cm, mt_info->tile_thr_data,
+                  aom_calloc(num_workers, sizeof(*mt_info->tile_thr_data)));
+
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+    winterface->init(worker);
+    worker->thread_name = "aom enc worker";
+
+    if (i > 0) {
+      // Allocate thread data.
+      CHECK_MEM_ERROR(cm, thread_data->td,
+                      aom_memalign(32, sizeof(*thread_data->td)));
+      av1_zero(*thread_data->td);
+
+      // Set up shared coeff buffers.
+      av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf);
+    }
+    ++mt_info->num_workers;
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void fp_create_enc_workers(AV1_COMP *cpi, int num_workers) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+  assert(mt_info->workers != NULL);
+  assert(mt_info->tile_thr_data != NULL);
+
+#if CONFIG_MULTITHREAD
+  AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
+  if (enc_row_mt->mutex_ == NULL) {
+    CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
+                    aom_malloc(sizeof(*(enc_row_mt->mutex_))));
+    if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
+  }
+#endif
+
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+    ++mt_info->num_fp_workers;
+
+    thread_data->cpi = cpi;
+    thread_data->thread_id = i;
     // Set the starting tile for each thread.
     thread_data->start = i;
 
+    if (i > 0) {
+      // Set up firstpass PICK_MODE_CONTEXT.
+      thread_data->td->firstpass_ctx =
+          av1_alloc_pmc(cm, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
+
+      // Create threads
+      if (!winterface->reset(worker))
+        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                           "Tile encoder thread creation failed");
+    } else {
+      // Main thread acts as a worker and uses the thread data in cpi.
+      thread_data->td = &cpi->td;
+    }
+    winterface->sync(worker);
+  }
+}
+#endif
+
+static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info,
+                                      int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
     if (i == 0)
       winterface->execute(worker);
     else
@@ -525,25 +712,26 @@
   }
 }
 
-static AOM_INLINE void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
+static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info,
+                                        AV1_COMMON *const cm, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int had_error = 0;
 
   // Encoding ends.
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
+    AVxWorker *const worker = &mt_info->workers[i];
     had_error |= !winterface->sync(worker);
   }
 
   if (had_error)
-    aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
                        "Failed to encode tile data");
 }
 
 static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
                                                        int num_workers) {
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
+    AVxWorker *const worker = &cpi->mt_info.workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
     cpi->intrabc_used |= thread_data->td->intrabc_used;
     cpi->deltaq_used |= thread_data->td->deltaq_used;
@@ -552,9 +740,11 @@
     if (i > 0) {
       av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
       accumulate_rd_opt(&cpi->td, thread_data->td);
-      cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
+      cpi->td.mb.txfm_search_info.txb_split_count +=
+          thread_data->td->mb.txfm_search_info.txb_split_count;
 #if CONFIG_SPEED_STATS
-      cpi->td.mb.tx_search_count += thread_data->td->mb.tx_search_count;
+      cpi->td.mb.txfm_search_info.tx_search_count +=
+          thread_data->td->mb.txfm_search_info.tx_search_count;
 #endif  // CONFIG_SPEED_STATS
     }
   }
@@ -562,14 +752,20 @@
 
 static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                            int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
 
     worker->hook = hook;
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+    }
+
     thread_data->td->intrabc_used = 0;
     thread_data->td->deltaq_used = 0;
 
@@ -577,9 +773,7 @@
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
       thread_data->td->rd_counts = cpi->td.rd_counts;
-      thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
-      thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
-      thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+      thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
 
       thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info;
       for (int x = 0; x < 2; x++) {
@@ -592,8 +786,6 @@
               thread_data->td->hash_value_buffer[x][y];
         }
       }
-      thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
-      thread_data->td->mb.mbmi_ext = thread_data->td->mbmi_ext;
     }
     if (thread_data->td->counts != &cpi->counts) {
       memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
@@ -604,38 +796,101 @@
       thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {
-        thread_data->td->mb.tmp_obmc_bufs[j] =
-            thread_data->td->tmp_obmc_bufs[j];
+        thread_data->td->mb.tmp_pred_bufs[j] =
+            thread_data->td->tmp_pred_bufs[j];
       }
 
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {
         thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
-            thread_data->td->mb.tmp_obmc_bufs[j];
+            thread_data->td->mb.tmp_pred_bufs[j];
       }
     }
   }
 }
 
-void av1_encode_tiles_mt(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                              int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = NULL;
+
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+    }
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+    }
+  }
+}
+#endif
+
+// Computes the number of workers for row multi-threading of encoding stage
+static AOM_INLINE int compute_num_enc_row_mt_workers(AV1_COMMON *const cm,
+                                                     int max_threads) {
+  TileInfo tile_info;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
-  int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows);
+  int total_num_threads_row_mt = 0;
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      av1_tile_init(&tile_info, cm, row, col);
+      const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, tile_info);
+      const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+      total_num_threads_row_mt +=
+          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
+    }
+  }
+  return AOMMIN(max_threads, total_num_threads_row_mt);
+}
 
-  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
-    av1_alloc_tile_data(cpi);
+// Computes the number of workers for tile multi-threading of encoding stage
+static AOM_INLINE int compute_num_enc_tile_mt_workers(AV1_COMMON *const cm,
+                                                      int max_threads) {
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  return AOMMIN(max_threads, tile_cols * tile_rows);
+}
+
+// Computes the number of workers for encoding stage (row/tile multi-threading)
+int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
+  if (max_workers <= 1) return 1;
+  if (cpi->oxcf.row_mt)
+    return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
+  else
+    return compute_num_enc_tile_mt_workers(&cpi->common, max_workers);
+}
+
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int num_workers = av1_compute_num_enc_workers(cpi, mt_info->num_workers);
+
+  assert(IMPLIES(cpi->tile_data == NULL,
+                 cpi->allocated_tiles < tile_cols * tile_rows));
+  if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
 
   av1_init_tile_data(cpi);
   // Only run once to create threads and allocate thread data.
-  if (cpi->num_workers == 0) {
+  if (mt_info->num_enc_workers == 0) {
     create_enc_workers(cpi, num_workers);
   } else {
-    num_workers = AOMMIN(num_workers, cpi->num_workers);
+    num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
   }
   prepare_enc_workers(cpi, enc_worker_hook, num_workers);
-  launch_enc_workers(cpi, num_workers);
-  sync_enc_workers(cpi, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
   accumulate_counters_enc_workers(cpi, num_workers);
 }
 
@@ -651,61 +906,117 @@
   for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i];
 }
 
-void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
+// Computes the maximum number of sb_rows for row multi-threading of encoding
+// stage
+static AOM_INLINE void compute_max_sb_rows_cols(AV1_COMP *cpi, int *max_sb_rows,
+                                                int *max_sb_cols) {
   AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
-  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
-  int num_workers = 0;
-  int total_num_threads_row_mt = 0;
-  int max_sb_rows = 0;
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      const int tile_index = row * cm->tiles.cols + col;
+      TileInfo tile_info = cpi->tile_data[tile_index].tile_info;
+      const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, tile_info);
+      const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+      *max_sb_rows = AOMMAX(*max_sb_rows, num_sb_rows_in_tile);
+      *max_sb_cols = AOMMAX(*max_sb_cols, num_sb_cols_in_tile);
+    }
+  }
+}
 
-  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
+#if !CONFIG_REALTIME_ONLY
+// Computes the number of workers for firstpass stage (row/tile multi-threading)
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int total_num_threads_row_mt = 0;
+  TileInfo tile_info;
+
+  if (cpi->oxcf.max_threads <= 1) return 1;
+
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      av1_tile_init(&tile_info, cm, row, col);
+      const int num_mb_rows_in_tile = av1_get_mb_rows_in_tile(tile_info);
+      const int num_mb_cols_in_tile = av1_get_mb_cols_in_tile(tile_info);
+      total_num_threads_row_mt +=
+          AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile);
+    }
+  }
+  return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
+}
+
+// Computes the maximum number of mb_rows for row multi-threading of firstpass
+// stage
+static AOM_INLINE int fp_compute_max_mb_rows(
+    const AV1_COMMON *const cm, const TileDataEnc *const tile_data) {
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int max_mb_rows = 0;
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      const int tile_index = row * cm->tiles.cols + col;
+      TileInfo tile_info = tile_data[tile_index].tile_info;
+      const int num_mb_rows_in_tile = av1_get_mb_rows_in_tile(tile_info);
+      max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile);
+    }
+  }
+  return max_mb_rows;
+}
+#endif
+
+void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+  int max_sb_rows = 0, max_sb_cols = 0;
+
+  // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
+  // post-processing stages in encoder is quiet low, so limiting the number of
+  // threads to the theoretical limit in row-mt does not have much impact on
+  // post-processing multi-threading stage. Need to revisit this when
+  // post-processing time starts shooting up.
+  int num_workers = av1_compute_num_enc_workers(cpi, mt_info->num_workers);
+
+  assert(IMPLIES(cpi->tile_data == NULL,
+                 cpi->allocated_tiles < tile_cols * tile_rows));
+  if (cpi->allocated_tiles < tile_cols * tile_rows) {
     av1_row_mt_mem_dealloc(cpi);
     av1_alloc_tile_data(cpi);
   }
 
   av1_init_tile_data(cpi);
 
-  for (int row = 0; row < tile_rows; row++) {
-    for (int col = 0; col < tile_cols; col++) {
-      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col];
-      int num_sb_rows_in_tile =
-          av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
-      int num_sb_cols_in_tile =
-          av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
-      total_num_threads_row_mt +=
-          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
-      max_sb_rows = AOMMAX(max_sb_rows, num_sb_rows_in_tile);
-    }
-  }
-  // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
-  // post-processing stages in encoder is quiet low, so limiting the number of
-  // threads to the theoretical limit in row-mt does not have much impact on
-  // post-processing multi-threading stage. Need to revisit this when
-  // post-processing time starts shooting up.
-  num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
+  compute_max_sb_rows_cols(cpi, &max_sb_rows, &max_sb_cols);
 
-  if (multi_thread_ctxt->allocated_tile_cols != tile_cols ||
-      multi_thread_ctxt->allocated_tile_rows != tile_rows ||
-      multi_thread_ctxt->allocated_sb_rows != max_sb_rows) {
+  if (enc_row_mt->allocated_tile_cols != tile_cols ||
+      enc_row_mt->allocated_tile_rows != tile_rows ||
+      enc_row_mt->allocated_rows != max_sb_rows ||
+      enc_row_mt->allocated_cols != (max_sb_cols - 1)) {
     av1_row_mt_mem_dealloc(cpi);
-    av1_row_mt_mem_alloc(cpi, max_sb_rows);
+    row_mt_mem_alloc(cpi, max_sb_rows, max_sb_cols,
+                     cpi->oxcf.algo_cfg.cdf_update_mode);
   }
 
-  memset(multi_thread_ctxt->thread_id_to_tile_id, -1,
-         sizeof(*multi_thread_ctxt->thread_id_to_tile_id) * MAX_NUM_THREADS);
+  memset(thread_id_to_tile_id, -1,
+         sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
 
   for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
-      int tile_id = tile_row * tile_cols + tile_col;
-      TileDataEnc *this_tile = &cpi->tile_data[tile_id];
+      int tile_index = tile_row * tile_cols + tile_col;
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
 
-      // Initialize cur_col to -1 for all rows.
-      memset(this_tile->row_mt_sync.cur_col, -1,
-             sizeof(*this_tile->row_mt_sync.cur_col) * max_sb_rows);
-      this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start;
-      this_tile->row_mt_info.num_threads_working = 0;
+      // Initialize num_finished_cols to -1 for all rows.
+      memset(row_mt_sync->num_finished_cols, -1,
+             sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows);
+      row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+      row_mt_sync->num_threads_working = 0;
 
       av1_inter_mode_data_init(this_tile);
       av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
@@ -715,15 +1026,639 @@
   }
 
   // Only run once to create threads and allocate thread data.
-  if (cpi->num_workers == 0) {
+  if (mt_info->num_enc_workers == 0) {
     create_enc_workers(cpi, num_workers);
   } else {
-    num_workers = AOMMIN(num_workers, cpi->num_workers);
+    num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
   }
-  assign_tile_to_thread(multi_thread_ctxt, tile_cols * tile_rows, num_workers);
+  assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+                        num_workers);
   prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers);
-  launch_enc_workers(cpi, num_workers);
-  sync_enc_workers(cpi, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
   if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi);
   accumulate_counters_enc_workers(cpi, num_workers);
 }
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+  int num_workers = 0;
+  int max_mb_rows = 0;
+
+  assert(IMPLIES(cpi->tile_data == NULL,
+                 cpi->allocated_tiles < tile_cols * tile_rows));
+  if (cpi->allocated_tiles < tile_cols * tile_rows) {
+    av1_row_mt_mem_dealloc(cpi);
+    av1_alloc_tile_data(cpi);
+  }
+
+  av1_init_tile_data(cpi);
+
+  max_mb_rows = fp_compute_max_mb_rows(cm, cpi->tile_data);
+
+  // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
+  // post-processing stages in encoder is quiet low, so limiting the number of
+  // threads to the theoretical limit in row-mt does not have much impact on
+  // post-processing multi-threading stage. Need to revisit this when
+  // post-processing time starts shooting up.
+  num_workers = av1_fp_compute_num_enc_workers(cpi);
+
+  if (enc_row_mt->allocated_tile_cols != tile_cols ||
+      enc_row_mt->allocated_tile_rows != tile_rows ||
+      enc_row_mt->allocated_rows != max_mb_rows) {
+    av1_row_mt_mem_dealloc(cpi);
+    row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
+  }
+
+  memset(thread_id_to_tile_id, -1,
+         sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+
+  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+      int tile_index = tile_row * tile_cols + tile_col;
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+      // Initialize num_finished_cols to -1 for all rows.
+      memset(row_mt_sync->num_finished_cols, -1,
+             sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows);
+      row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+      row_mt_sync->num_threads_working = 0;
+    }
+  }
+
+  num_workers = AOMMIN(num_workers, mt_info->num_workers);
+  // Only run once to create threads and allocate thread data.
+  if (mt_info->num_fp_workers == 0) fp_create_enc_workers(cpi, num_workers);
+  assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+                        num_workers);
+  fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
+}
+
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                    int r, int c) {
+  (void)tpl_mt_sync;
+  (void)r;
+  (void)c;
+  return;
+}
+
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                     int r, int c, int cols) {
+  (void)tpl_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+  return;
+}
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+                              int c) {
+#if CONFIG_MULTITHREAD
+  int nsync = tpl_row_mt_sync->sync_range;
+
+  if (r) {
+    pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1];
+    pthread_mutex_lock(mutex);
+
+    while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync)
+      pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex);
+    pthread_mutex_unlock(mutex);
+  }
+#else
+  (void)tpl_row_mt_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+                               int c, int cols) {
+#if CONFIG_MULTITHREAD
+  int nsync = tpl_row_mt_sync->sync_range;
+  int cur;
+  // Only signal when there are enough encoded blocks for next row to run.
+  int sig = 1;
+
+  if (c < cols - 1) {
+    cur = c;
+    if (c % nsync) sig = 0;
+  } else {
+    cur = cols + nsync;
+  }
+
+  if (sig) {
+    pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]);
+
+    tpl_row_mt_sync->num_finished_cols[r] = cur;
+
+    pthread_cond_signal(&tpl_row_mt_sync->cond_[r]);
+    pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]);
+  }
+#else
+  (void)tpl_row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Each worker calls tpl_worker_hook() and computes the tpl data.
+static int tpl_worker_hook(void *arg1, void *unused) {
+  (void)unused;
+  EncWorkerData *thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *cpi = thread_data->cpi;
+  AV1_COMMON *cm = &cpi->common;
+  MACROBLOCK *x = &thread_data->td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  CommonModeInfoParams *mi_params = &cm->mi_params;
+  BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
+  TX_SIZE tx_size = max_txsize_lookup[bsize];
+  int mi_height = mi_size_high[bsize];
+  int num_active_workers = cpi->tpl_data.tpl_mt_sync.num_threads_working;
+  for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows;
+       mi_row += num_active_workers * mi_height) {
+    // Motion estimation row boundary
+    av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
+                          cpi->oxcf.border_in_pixels);
+    xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+    xd->mb_to_bottom_edge =
+        GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+    av1_mc_flow_dispenser_row(cpi, x, mi_row, bsize, tx_size);
+  }
+  return 1;
+}
+
+// Deallocate tpl synchronization related mutex and data.
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) {
+  assert(tpl_sync != NULL);
+
+#if CONFIG_MULTITHREAD
+  if (tpl_sync->mutex_ != NULL) {
+    for (int i = 0; i < tpl_sync->rows; ++i)
+      pthread_mutex_destroy(&tpl_sync->mutex_[i]);
+    aom_free(tpl_sync->mutex_);
+  }
+  if (tpl_sync->cond_ != NULL) {
+    for (int i = 0; i < tpl_sync->rows; ++i)
+      pthread_cond_destroy(&tpl_sync->cond_[i]);
+    aom_free(tpl_sync->cond_);
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  aom_free(tpl_sync->num_finished_cols);
+  // clear the structure as the source of this call may be a resize in which
+  // case this call will be followed by an _alloc() which may fail.
+  av1_zero(*tpl_sync);
+}
+
+// Allocate memory for tpl row synchronization.
+void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm,
+                   int mb_rows) {
+  tpl_sync->rows = mb_rows;
+#if CONFIG_MULTITHREAD
+  {
+    CHECK_MEM_ERROR(cm, tpl_sync->mutex_,
+                    aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows));
+    if (tpl_sync->mutex_) {
+      for (int i = 0; i < mb_rows; ++i)
+        pthread_mutex_init(&tpl_sync->mutex_[i], NULL);
+    }
+
+    CHECK_MEM_ERROR(cm, tpl_sync->cond_,
+                    aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows));
+    if (tpl_sync->cond_) {
+      for (int i = 0; i < mb_rows; ++i)
+        pthread_cond_init(&tpl_sync->cond_[i], NULL);
+    }
+  }
+#endif  // CONFIG_MULTITHREAD
+  CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols,
+                  aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows));
+
+  // Set up nsync.
+  tpl_sync->sync_range = 1;
+}
+
+// Each worker is prepared by assigning the hook function and individual thread
+// data.
+static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                           int num_workers) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = NULL;
+
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+    }
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+    }
+  }
+}
+
+// Computes num_workers for tpl multi-threading.
+static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
+  return av1_compute_num_enc_workers(cpi, cpi->mt_info.num_workers);
+}
+
+// Implements multi-threading for tpl.
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  CommonModeInfoParams *mi_params = &cm->mi_params;
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  TplParams *tpl_data = &cpi->tpl_data;
+  AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
+  int mb_rows = mi_params->mb_rows;
+  int num_workers = compute_num_tpl_workers(cpi);
+
+  if (mt_info->num_enc_workers == 0)
+    create_enc_workers(cpi, num_workers);
+  else
+    num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+
+  if (mb_rows != tpl_sync->rows) {
+    av1_tpl_dealloc(tpl_sync);
+    av1_tpl_alloc(tpl_sync, cm, mb_rows);
+  }
+  tpl_sync->num_threads_working = num_workers;
+
+  // Initialize cur_mb_col to -1 for all MB rows.
+  memset(tpl_sync->num_finished_cols, -1,
+         sizeof(*tpl_sync->num_finished_cols) * mb_rows);
+
+  prepare_tpl_workers(cpi, tpl_worker_hook, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
+}
+
+// Deallocate memory for temporal filter multi-thread synchronization.
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) {
+  assert(tf_sync != NULL);
+#if CONFIG_MULTITHREAD
+  if (tf_sync->mutex_ != NULL) {
+    pthread_mutex_destroy(tf_sync->mutex_);
+    aom_free(tf_sync->mutex_);
+  }
+#endif  // CONFIG_MULTITHREAD
+  tf_sync->next_tf_row = 0;
+}
+
+// Checks if a job is available. If job is available,
+// populates next_tf_row and returns 1, else returns 0.
+static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync,
+                                      int *current_mb_row, int mb_rows) {
+  int do_next_row = 0;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
+  pthread_mutex_lock(tf_mutex_);
+#endif
+  if (tf_mt_sync->next_tf_row < mb_rows) {
+    *current_mb_row = tf_mt_sync->next_tf_row;
+    tf_mt_sync->next_tf_row++;
+    do_next_row = 1;
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(tf_mutex_);
+#endif
+  return do_next_row;
+}
+
+// Hook function for each thread in temporal filter multi-threading.
+static int tf_worker_hook(void *arg1, void *unused) {
+  (void)unused;
+  EncWorkerData *thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *cpi = thread_data->cpi;
+  ThreadData *td = thread_data->td;
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
+  const struct scale_factors *scale = &cpi->tf_ctx.sf;
+  const int num_planes = av1_num_planes(&cpi->common);
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  MACROBLOCKD *mbd = &td->mb.e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  MB_MODE_INFO **input_mb_mode_info;
+  tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+  tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+  int current_mb_row = -1;
+
+  while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows))
+    av1_tf_do_filtering_row(cpi, td, current_mb_row);
+
+  tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+
+  return 1;
+}
+
+// Assigns temporal filter hook function and thread data to each worker.
+static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                               int num_workers, int is_highbitdepth) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  mt_info->tf_sync.next_tf_row = 0;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = NULL;
+
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+    }
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+      tf_alloc_and_reset_data(&thread_data->td->tf_data, cpi->tf_ctx.num_pels,
+                              is_highbitdepth);
+    }
+  }
+}
+
+// Deallocate thread specific data for temporal filter.
+static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers,
+                                   int is_highbitdepth) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+    ThreadData *td = thread_data->td;
+    if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth);
+  }
+}
+
+// Accumulate sse and sum after temporal filtering.
+static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) {
+  FRAME_DIFF *total_diff = &cpi->td.tf_data.diff;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &cpi->mt_info.workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+    ThreadData *td = thread_data->td;
+    FRAME_DIFF *diff = &td->tf_data.diff;
+    if (td != &cpi->td) {
+      total_diff->sse += diff->sse;
+      total_diff->sum += diff->sum;
+    }
+  }
+}
+
+// Implements multi-threading for temporal filter.
+void av1_tf_do_filtering_mt(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+
+  int num_workers = mt_info->num_workers;
+  if (mt_info->num_enc_workers == 0)
+    create_enc_workers(cpi, num_workers);
+  else
+    num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+
+  prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth);
+  launch_workers(mt_info, num_workers);
+  sync_enc_workers(mt_info, cm, num_workers);
+  tf_accumulate_frame_diff(cpi, num_workers);
+  tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth);
+}
+
+// Checks if a job is available in the current direction. If a job is available,
+// frame_idx will be populated and returns 1, else returns 0.
+static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx,
+                                      int cur_dir) {
+  GlobalMotionInfo *gm_info = &cpi->gm_info;
+  JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+
+  int total_refs = gm_info->num_ref_frames[cur_dir];
+  int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir];
+
+  if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) {
+    *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame;
+    job_info->next_frame_to_process[cur_dir] += 1;
+    return 1;
+  }
+  return 0;
+}
+
+// Switches the current direction and calls the function get_next_gm_job() if
+// the speed feature 'prune_ref_frame_for_gm_search' is not set.
+static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx,
+                                        int *cur_dir) {
+  if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return;
+  // Switch the direction and get next job
+  *cur_dir = !(*cur_dir);
+  get_next_gm_job(cpi, frame_idx, *(cur_dir));
+}
+
+// Initializes inliers, num_inliers and segment_map.
+static AOM_INLINE void init_gm_thread_data(
+    const GlobalMotionInfo *gm_info, GlobalMotionThreadData *thread_data) {
+  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+    MotionModel motion_params = thread_data->params_by_motion[m];
+    av1_zero(motion_params.params);
+    motion_params.num_inliers = 0;
+  }
+
+  av1_zero_array(thread_data->segment_map,
+                 gm_info->segment_map_w * gm_info->segment_map_h);
+}
+
+// Hook function for each thread in global motion multi-threading.
+static int gm_mt_worker_hook(void *arg1, void *unused) {
+  (void)unused;
+
+  EncWorkerData *thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *cpi = thread_data->cpi;
+  GlobalMotionInfo *gm_info = &cpi->gm_info;
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  JobInfo *job_info = &mt_info->gm_sync.job_info;
+  int thread_id = thread_data->thread_id;
+  GlobalMotionThreadData *gm_thread_data =
+      &mt_info->gm_sync.thread_data[thread_id];
+  int cur_dir = job_info->thread_id_to_dir[thread_id];
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+#endif
+
+  while (1) {
+    int ref_buf_idx = -1;
+    int ref_frame_idx = -1;
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(gm_mt_mutex_);
+#endif
+
+    // Populates ref_buf_idx(the reference frame type) for which global motion
+    // estimation will be done.
+    if (!get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
+      // No jobs are available for the current direction. Switch
+      // to other direction and get the next job, if available.
+      switch_direction(cpi, &ref_buf_idx, &cur_dir);
+    }
+
+    // 'ref_frame_idx' holds the index of the current reference frame type in
+    // gm_info->reference_frames. job_info->next_frame_to_process will be
+    // incremented in get_next_gm_job() and hence subtracting by 1.
+    ref_frame_idx = job_info->next_frame_to_process[cur_dir] - 1;
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+
+    if (ref_buf_idx == -1) break;
+
+    init_gm_thread_data(gm_info, gm_thread_data);
+
+    // Compute global motion for the given ref_buf_idx.
+    av1_compute_gm_for_valid_ref_frames(
+        cpi, gm_info->ref_buf, ref_buf_idx, gm_info->num_src_corners,
+        gm_info->src_corners, gm_info->src_buffer,
+        gm_thread_data->params_by_motion, gm_thread_data->segment_map,
+        gm_info->segment_map_w, gm_info->segment_map_h);
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(gm_mt_mutex_);
+#endif
+    assert(ref_frame_idx != -1);
+    // If global motion w.r.t. current ref frame is
+    // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
+    // the remaining ref frames in that direction. The below exit is disabled
+    // when ref frame distance w.r.t. current frame is zero. E.g.:
+    // source_alt_ref_frame w.r.t. ARF frames.
+    if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+        gm_info->reference_frames[cur_dir][ref_frame_idx].distance != 0 &&
+        cpi->common.global_motion[ref_buf_idx].wmtype != ROTZOOM)
+      job_info->early_exit[cur_dir] = 1;
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+  }
+  return 1;
+}
+
+// Assigns global motion hook function and thread data to each worker.
+static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                          int num_workers) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = NULL;
+
+    thread_data->cpi = cpi;
+  }
+}
+
+// Assigns available threads to past/future direction.
+static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir,
+                                            int num_workers) {
+  int8_t frame_dir_idx = 0;
+
+  for (int i = 0; i < num_workers; i++) {
+    thread_id_to_dir[i] = frame_dir_idx++;
+    if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0;
+  }
+}
+
+// Computes number of workers for global motion multi-threading.
+static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
+  int total_refs =
+      cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
+  int max_num_workers = cpi->mt_info.num_workers;
+  int max_allowed_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
+                                ? AOMMIN(MAX_DIRECTIONS, max_num_workers)
+                                : max_num_workers;
+
+  return (AOMMIN(total_refs, max_allowed_workers));
+}
+
+// Frees the memory allocated for each worker in global motion multi-threading.
+void av1_gm_dealloc(AV1GlobalMotionSync *gm_sync_data) {
+  if (gm_sync_data->thread_data != NULL) {
+    for (int j = 0; j < gm_sync_data->allocated_workers; j++) {
+      GlobalMotionThreadData *thread_data = &gm_sync_data->thread_data[j];
+      aom_free(thread_data->segment_map);
+
+      for (int m = 0; m < RANSAC_NUM_MOTIONS; m++)
+        aom_free(thread_data->params_by_motion[m].inliers);
+    }
+    aom_free(gm_sync_data->thread_data);
+  }
+}
+
+// Allocates memory for inliers and segment_map for each worker in global motion
+// multi-threading.
+static AOM_INLINE void gm_alloc(AV1_COMP *cpi, int num_workers) {
+  AV1_COMMON *cm = &cpi->common;
+  AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+  GlobalMotionInfo *gm_info = &cpi->gm_info;
+
+  gm_sync->allocated_workers = num_workers;
+  gm_sync->allocated_width = cpi->source->y_width;
+  gm_sync->allocated_height = cpi->source->y_height;
+
+  CHECK_MEM_ERROR(cm, gm_sync->thread_data,
+                  aom_malloc(sizeof(*gm_sync->thread_data) * num_workers));
+
+  for (int i = 0; i < num_workers; i++) {
+    GlobalMotionThreadData *thread_data = &gm_sync->thread_data[i];
+    CHECK_MEM_ERROR(
+        cm, thread_data->segment_map,
+        aom_malloc(sizeof(*thread_data->segment_map) * gm_info->segment_map_w *
+                   gm_info->segment_map_h));
+
+    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+      CHECK_MEM_ERROR(
+          cm, thread_data->params_by_motion[m].inliers,
+          aom_malloc(sizeof(*thread_data->params_by_motion[m].inliers) * 2 *
+                     MAX_CORNERS));
+    }
+  }
+}
+
+// Implements multi-threading for global motion.
+void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
+  AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+  JobInfo *job_info = &gm_sync->job_info;
+
+  av1_zero(*job_info);
+
+  int num_workers = compute_gm_workers(cpi);
+
+  if (num_workers > gm_sync->allocated_workers ||
+      cpi->source->y_width != gm_sync->allocated_width ||
+      cpi->source->y_height != gm_sync->allocated_height) {
+    av1_gm_dealloc(gm_sync);
+    gm_alloc(cpi, num_workers);
+  }
+
+  assign_thread_to_dir(job_info->thread_id_to_dir, num_workers);
+  prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
+}
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index 1830759..ab8e1bb 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -18,7 +18,6 @@
 
 struct AV1_COMP;
 struct ThreadData;
-struct AV1RowMTSyncData;
 
 typedef struct EncWorkerData {
   struct AV1_COMP *cpi;
@@ -27,26 +26,58 @@
   int thread_id;
 } EncWorkerData;
 
-void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c);
-void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c,
-                           const int cols);
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c);
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+                           int cols);
 
-void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync,
-                                int r, int c);
-void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync,
-                                 int r, int c, const int cols);
-
-void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync);
-// Allocate memory for row based multi-threading synchronization.
-void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, struct AV1Common *cm,
-                               int rows);
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                int c);
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                 int c, int cols);
 
 void av1_encode_tiles_mt(struct AV1_COMP *cpi);
 void av1_encode_tiles_row_mt(struct AV1_COMP *cpi);
 
+#if !CONFIG_REALTIME_ONLY
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi);
+
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi);
+#endif
+
 void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
                                  const struct FRAME_COUNTS *counts);
 
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
+
+void av1_global_motion_estimation_mt(AV1_COMP *cpi);
+
+void av1_gm_dealloc(AV1GlobalMotionSync *gm_sync_data);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                    int r, int c);
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                     int r, int c, int cols);
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+                              int c);
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+                               int c, int cols);
+
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi);
+
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync);
+
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_tf_do_filtering_mt(AV1_COMP *cpi);
+
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync);
+
+int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers);
+
+void av1_create_workers(AV1_COMP *cpi, int num_workers);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/extend.c b/av1/encoder/extend.c
index 934cf56..381aec8 100644
--- a/av1/encoder/extend.c
+++ b/av1/encoder/extend.c
@@ -122,12 +122,10 @@
     highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                                  dst->y_stride, src->y_crop_width,
                                  src->y_crop_height, et_y, el_y, eb_y, er_y);
-    if (src->u_buffer) {
+    if (!src->monochrome) {
       highbd_copy_and_extend_plane(
           src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
           src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
-    }
-    if (src->v_buffer) {
       highbd_copy_and_extend_plane(
           src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
           src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
@@ -138,12 +136,10 @@
   copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                         dst->y_stride, src->y_crop_width, src->y_crop_height,
                         et_y, el_y, eb_y, er_y);
-  if (src->u_buffer) {
+  if (!src->monochrome) {
     copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
                           dst->uv_stride, src->uv_crop_width,
                           src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
-  }
-  if (src->v_buffer) {
     copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
                           dst->uv_stride, src->uv_crop_width,
                           src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 0955510..24cb6c4 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -37,6 +37,7 @@
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/mcomp.h"
@@ -109,8 +110,8 @@
   section->duration = 1.0;
 }
 
-static AOM_INLINE void accumulate_stats(FIRSTPASS_STATS *section,
-                                        const FIRSTPASS_STATS *frame) {
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+                          const FIRSTPASS_STATS *frame) {
   section->frame += frame->frame;
   section->weight += frame->weight;
   section->intra_error += frame->intra_error;
@@ -203,9 +204,9 @@
 
 // Refine the motion search range according to the frame dimension
 // for first pass test.
-static int get_search_range(const AV1_COMP *cpi) {
+static int get_search_range(const InitialDimensions *initial_dimensions) {
   int sr = 0;
-  const int dim = AOMMIN(cpi->initial_width, cpi->initial_height);
+  const int dim = AOMMIN(initial_dimensions->width, initial_dimensions->height);
 
   while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
   return sr;
@@ -218,25 +219,33 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
   int tmp_err;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
-  const int sr = get_search_range(cpi);
+  const int sr = get_search_range(&cpi->initial_dimensions);
   const int step_param = 3 + sr;
 
   const search_site_config *first_pass_search_sites =
-      &cpi->mv_search_params.ss_cfg[SS_CFG_FPF];
+      cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+  const int fine_search_interval =
+      cpi->is_screen_content_type && cpi->common.features.allow_intrabc;
+  if (fine_search_interval) {
+    av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+  }
   FULLPEL_MOTION_SEARCH_PARAMS ms_params;
   av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
-                                     first_pass_search_sites);
-  ms_params.search_method = NSTEP;
+                                     first_pass_search_sites,
+                                     fine_search_interval);
+  av1_set_mv_search_method(&ms_params, first_pass_search_sites, NSTEP);
 
   FULLPEL_MV this_best_mv;
   tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL,
                                   &this_best_mv, NULL);
 
   if (tmp_err < INT_MAX) {
-    tmp_err = av1_get_mvpred_sse(x, &this_best_mv, ref_mv, &v_fn_ptr) +
+    aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+    const MSBuffers *ms_buffers = &ms_params.ms_buffers;
+    tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv,
+                                 &v_fn_ptr, ms_buffers->src, ms_buffers->ref) +
               new_mv_mode_penalty;
   }
 
@@ -263,6 +272,7 @@
 }
 
 static int find_fp_qindex(aom_bit_depth_t bit_depth) {
+  aom_clear_system_state();
   return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1);
 }
 
@@ -289,56 +299,6 @@
   return raw_err_stdev;
 }
 
-// This structure contains several key parameters to be accumulate for this
-// frame.
-typedef struct {
-  // Intra prediction error.
-  int64_t intra_error;
-  // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
-  int64_t frame_avg_wavelet_energy;
-  // Best of intra pred error and inter pred error using last frame as ref.
-  int64_t coded_error;
-  // Best of intra pred error and inter pred error using golden frame as ref.
-  int64_t sr_coded_error;
-  // Best of intra pred error and inter pred error using altref frame as ref.
-  int64_t tr_coded_error;
-  // Count of motion vector.
-  int mv_count;
-  // Count of blocks that pick inter prediction (inter pred error is smaller
-  // than intra pred error).
-  int inter_count;
-  // Count of blocks that pick second ref (golden frame).
-  int second_ref_count;
-  // Count of blocks that pick third ref (altref frame).
-  int third_ref_count;
-  // Count of blocks where the inter and intra are very close and very low.
-  double neutral_count;
-  // Count of blocks where intra error is very small.
-  int intra_skip_count;
-  // Start row.
-  int image_data_start_row;
-  // Count of unique non-zero motion vectors.
-  int new_mv_count;
-  // Sum of inward motion vectors.
-  int sum_in_vectors;
-  // Sum of motion vector row.
-  int sum_mvr;
-  // Sum of motion vector column.
-  int sum_mvc;
-  // Sum of absolute value of motion vector row.
-  int sum_mvr_abs;
-  // Sum of absolute value of motion vector column.
-  int sum_mvc_abs;
-  // Sum of the square of motion vector row.
-  int64_t sum_mvrs;
-  // Sum of the square of motion vector column.
-  int64_t sum_mvcs;
-  // A factor calculated using intra pred error.
-  double intra_factor;
-  // A factor that measures brightness.
-  double brightness_factor;
-} FRAME_STATS;
-
 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
 // Computes and returns the intra pred error of a block.
@@ -366,14 +326,14 @@
 // Returns:
 //   this_intra_error.
 static int firstpass_intra_prediction(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *const this_frame,
+    AV1_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const this_frame,
     const TileInfo *const tile, const int mb_row, const int mb_col,
     const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size,
     const int qindex, FRAME_STATS *const stats) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const SequenceHeader *const seq_params = &cm->seq_params;
-  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mb_scale = mi_size_wide[fp_block_size];
   const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
@@ -386,7 +346,7 @@
   xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
   xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset;
   xd->left_available = (mb_col != 0);
-  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->bsize = bsize;
   xd->mi[0]->ref_frame[0] = INTRA_FRAME;
   set_mi_row_col(xd, tile, mb_row * mb_scale, mi_size_high[bsize],
                  mb_col * mb_scale, mi_size_wide[bsize], mi_params->mi_rows,
@@ -400,13 +360,6 @@
 
   av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
   int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
-
-  if (this_intra_error < UL_INTRA_THRESH) {
-    ++stats->intra_skip_count;
-  } else if ((mb_col > 0) && (stats->image_data_start_row == INVALID_ROW)) {
-    stats->image_data_start_row = mb_row;
-  }
-
   if (seq_params->use_highbitdepth) {
     switch (seq_params->bit_depth) {
       case AOM_BITS_8: break;
@@ -420,6 +373,12 @@
     }
   }
 
+  if (this_intra_error < UL_INTRA_THRESH) {
+    ++stats->intra_skip_count;
+  } else if ((mb_col > 0) && (stats->image_data_start_row == INVALID_ROW)) {
+    stats->image_data_start_row = mb_row;
+  }
+
   aom_clear_system_state();
   double log_intra = log(this_intra_error + 1.0);
   if (log_intra < 10.0) {
@@ -434,6 +393,19 @@
   } else {
     level_sample = x->plane[0].src.buf[0];
   }
+
+  if (seq_params->use_highbitdepth) {
+    switch (seq_params->bit_depth) {
+      case AOM_BITS_8: break;
+      case AOM_BITS_10: level_sample >>= 2; break;
+      case AOM_BITS_12: level_sample >>= 4; break;
+      default:
+        assert(0 &&
+               "seq_params->bit_depth should be AOM_BITS_8, "
+               "AOM_BITS_10 or AOM_BITS_12");
+        return -1;
+    }
+  }
   if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
     stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
   } else {
@@ -557,7 +529,7 @@
 //  Returns:
 //    this_inter_error
 static int firstpass_inter_prediction(
-    AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const last_frame,
+    AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame,
     const YV12_BUFFER_CONFIG *const golden_frame,
     const YV12_BUFFER_CONFIG *const alt_ref_frame, const int mb_row,
     const int mb_col, const int recon_yoffset, const int recon_uvoffset,
@@ -569,7 +541,7 @@
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CurrentFrame *const current_frame = &cm->current_frame;
-  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int is_high_bitdepth = is_cur_buf_hbd(xd);
   const int bitdepth = xd->bd;
@@ -582,7 +554,7 @@
   xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
   // Set up limit values for motion vectors to prevent them extending
   // outside the UMV borders.
-  av1_set_mv_col_limits(mi_params, &x->mv_limits, (mb_col << 2),
+  av1_set_mv_col_limits(mi_params, &x->mv_limits, (mb_col << FP_MIB_SIZE_LOG2),
                         (fp_block_size_height >> MI_SIZE_LOG2),
                         cpi->oxcf.border_in_pixels);
 
@@ -757,8 +729,9 @@
   // where the typical "real" energy per MB also falls.
   // Initial estimate here uses sqrt(mbs) to define the min_err, where the
   // number of mbs is proportional to the image area.
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : mi_params->MBs;
+  const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                          ? cpi->initial_mbs
+                          : mi_params->MBs;
   const double min_err = 200 * sqrt(num_mbs);
 
   fps.weight = stats->intra_factor * stats->brightness_factor;
@@ -814,7 +787,7 @@
   *this_frame_stats = fps;
   output_stats(this_frame_stats, cpi->output_pkt_list);
   if (cpi->twopass.stats_buf_ctx->total_stats != NULL) {
-    accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps);
+    av1_accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps);
   }
   /*In the case of two pass, first pass uses it as a circular buffer,
    * when LAP is enabled it is used as a linear buffer*/
@@ -845,38 +818,128 @@
   fclose(recon_file);
 }
 
+static FRAME_STATS accumulate_frame_stats(FRAME_STATS *mb_stats, int mb_rows,
+                                          int mb_cols) {
+  FRAME_STATS stats = { 0 };
+  int i, j;
+
+  stats.image_data_start_row = INVALID_ROW;
+  for (j = 0; j < mb_rows; j++) {
+    for (i = 0; i < mb_cols; i++) {
+      FRAME_STATS mb_stat = mb_stats[j * mb_cols + i];
+      stats.brightness_factor += mb_stat.brightness_factor;
+      stats.coded_error += mb_stat.coded_error;
+      stats.frame_avg_wavelet_energy += mb_stat.frame_avg_wavelet_energy;
+      if (stats.image_data_start_row == INVALID_ROW &&
+          mb_stat.image_data_start_row != INVALID_ROW) {
+        stats.image_data_start_row = mb_stat.image_data_start_row;
+      }
+      stats.inter_count += mb_stat.inter_count;
+      stats.intra_error += mb_stat.intra_error;
+      stats.intra_factor += mb_stat.intra_factor;
+      stats.intra_skip_count += mb_stat.intra_skip_count;
+      stats.mv_count += mb_stat.mv_count;
+      stats.neutral_count += mb_stat.neutral_count;
+      stats.new_mv_count += mb_stat.new_mv_count;
+      stats.second_ref_count += mb_stat.second_ref_count;
+      stats.sr_coded_error += mb_stat.sr_coded_error;
+      stats.sum_in_vectors += mb_stat.sum_in_vectors;
+      stats.sum_mvc += mb_stat.sum_mvc;
+      stats.sum_mvc_abs += mb_stat.sum_mvc_abs;
+      stats.sum_mvcs += mb_stat.sum_mvcs;
+      stats.sum_mvr += mb_stat.sum_mvr;
+      stats.sum_mvr_abs += mb_stat.sum_mvr_abs;
+      stats.sum_mvrs += mb_stat.sum_mvrs;
+      stats.third_ref_count += mb_stat.third_ref_count;
+      stats.tr_coded_error += mb_stat.tr_coded_error;
+    }
+  }
+  return stats;
+}
+
+static void setup_firstpass_data(AV1_COMMON *const cm,
+                                 FirstPassData *firstpass_data) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  CHECK_MEM_ERROR(cm, firstpass_data->raw_motion_err_list,
+                  aom_calloc(mi_params->mb_rows * mi_params->mb_cols,
+                             sizeof(*firstpass_data->raw_motion_err_list)));
+  CHECK_MEM_ERROR(cm, firstpass_data->mb_stats,
+                  aom_calloc(mi_params->mb_rows * mi_params->mb_cols,
+                             sizeof(*firstpass_data->mb_stats)));
+  for (int j = 0; j < mi_params->mb_rows; j++) {
+    for (int i = 0; i < mi_params->mb_cols; i++) {
+      firstpass_data->mb_stats[j * mi_params->mb_cols + i]
+          .image_data_start_row = INVALID_ROW;
+    }
+  }
+}
+
+static void free_firstpass_data(FirstPassData *firstpass_data) {
+  aom_free(firstpass_data->raw_motion_err_list);
+  aom_free(firstpass_data->mb_stats);
+}
+
+int av1_get_mb_rows_in_tile(TileInfo tile) {
+  int mi_rows_aligned_to_mb =
+      ALIGN_POWER_OF_TWO(tile.mi_row_end - tile.mi_row_start, FP_MIB_SIZE_LOG2);
+  int mb_rows = mi_rows_aligned_to_mb >> FP_MIB_SIZE_LOG2;
+
+  return mb_rows;
+}
+
+int av1_get_mb_cols_in_tile(TileInfo tile) {
+  int mi_cols_aligned_to_mb =
+      ALIGN_POWER_OF_TWO(tile.mi_col_end - tile.mi_col_start, FP_MIB_SIZE_LOG2);
+  int mb_cols = mi_cols_aligned_to_mb >> FP_MIB_SIZE_LOG2;
+
+  return mb_cols;
+}
+
 #define FIRST_PASS_ALT_REF_DISTANCE 16
-void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
-  MACROBLOCK *const x = &cpi->td.mb;
+static void first_pass_tile(AV1_COMP *cpi, ThreadData *td,
+                            TileDataEnc *tile_data) {
+  TileInfo *tile = &tile_data->tile_info;
+  for (int mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+       mi_row += FP_MIB_SIZE) {
+    av1_first_pass_row(cpi, td, tile_data, mi_row >> FP_MIB_SIZE_LOG2);
+  }
+}
+
+static void first_pass_tiles(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      first_pass_tile(cpi, &cpi->td, tile_data);
+    }
+  }
+}
+
+void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                        int mb_row) {
+  MACROBLOCK *const x = &td->mb;
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CurrentFrame *const current_frame = &cm->current_frame;
   const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
-  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
-  MV last_mv = kZeroMv;
+  TileInfo *tile = &tile_data->tile_info;
   const int qindex = find_fp_qindex(seq_params->bit_depth);
-  // Detect if the key frame is screen content type.
-  if (frame_is_intra_only(cm)) {
-    FeatureFlags *const features = &cm->features;
-    av1_set_screen_content_options(cpi, features);
-    cpi->is_screen_content_type = features->allow_screen_content_tools;
-  }
   // First pass coding proceeds in raster scan order with unit size of 16x16.
   const BLOCK_SIZE fp_block_size = BLOCK_16X16;
   const int fp_block_size_width = block_size_high[fp_block_size];
   const int fp_block_size_height = block_size_wide[fp_block_size];
-  int *raw_motion_err_list;
   int raw_motion_err_counts = 0;
-  CHECK_MEM_ERROR(cm, raw_motion_err_list,
-                  aom_calloc(mi_params->mb_rows * mi_params->mb_cols,
-                             sizeof(*raw_motion_err_list)));
-  // Tiling is ignored in the first pass.
-  TileInfo tile;
-  av1_tile_init(&tile, cm, 0, 0);
-  FRAME_STATS stats = { 0 };
-  stats.image_data_start_row = INVALID_ROW;
+  int mb_row_in_tile = mb_row - (tile->mi_row_start >> FP_MIB_SIZE_LOG2);
+  int mb_col_start = tile->mi_col_start >> FP_MIB_SIZE_LOG2;
+  int mb_cols_in_tile = av1_get_mb_cols_in_tile(*tile);
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
 
   const YV12_BUFFER_CONFIG *const last_frame =
       get_ref_frame_yv12_buf(cm, LAST_FRAME);
@@ -895,6 +958,144 @@
     }
   }
   YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+
+  PICK_MODE_CONTEXT *ctx = td->firstpass_ctx;
+  FRAME_STATS *mb_stats =
+      cpi->firstpass_data.mb_stats + mb_row * mi_params->mb_cols + mb_col_start;
+  int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list +
+                             mb_row * mi_params->mb_cols + mb_col_start;
+  MV *first_top_mv = &tile_data->firstpass_top_mv;
+
+  for (int i = 0; i < num_planes; ++i) {
+    x->plane[i].coeff = ctx->coeff[i];
+    x->plane[i].qcoeff = ctx->qcoeff[i];
+    x->plane[i].eobs = ctx->eobs[i];
+    x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+    x->plane[i].dqcoeff = ctx->dqcoeff[i];
+  }
+
+  const int src_y_stride = cpi->source->y_stride;
+  const int recon_y_stride = this_frame->y_stride;
+  const int recon_uv_stride = this_frame->uv_stride;
+  const int uv_mb_height =
+      fp_block_size_height >> (this_frame->y_height > this_frame->uv_height);
+
+  MV best_ref_mv = kZeroMv;
+  MV last_mv;
+
+  // Reset above block coeffs.
+  xd->up_available = (mb_row_in_tile != 0);
+  int recon_yoffset = (mb_row * recon_y_stride * fp_block_size_height) +
+                      (mb_col_start * fp_block_size_width);
+  int src_yoffset = (mb_row * src_y_stride * fp_block_size_height) +
+                    (mb_col_start * fp_block_size_width);
+  int recon_uvoffset =
+      (mb_row * recon_uv_stride * uv_mb_height) + (mb_col_start * uv_mb_height);
+  int alt_ref_frame_yoffset =
+      (alt_ref_frame != NULL)
+          ? (mb_row * alt_ref_frame->y_stride * fp_block_size_height) +
+                (mb_col_start * fp_block_size_width)
+          : -1;
+
+  // Set up limit values for motion vectors to prevent them extending
+  // outside the UMV borders.
+  av1_set_mv_row_limits(mi_params, &x->mv_limits, (mb_row << FP_MIB_SIZE_LOG2),
+                        (fp_block_size_height >> MI_SIZE_LOG2),
+                        cpi->oxcf.border_in_pixels);
+
+  av1_setup_src_planes(x, cpi->source, mb_row << FP_MIB_SIZE_LOG2,
+                       tile->mi_col_start, num_planes, fp_block_size);
+
+  // Fix - zero the 16x16 block first. This ensures correct this_intra_error for
+  // block sizes smaller than 16x16.
+  av1_zero_array(x->plane[0].src_diff, 256);
+
+  for (int mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+       mi_col += FP_MIB_SIZE) {
+    int mb_col = mi_col >> FP_MIB_SIZE_LOG2;
+    int mb_col_in_tile = mb_col - mb_col_start;
+
+    (*(enc_row_mt->sync_read_ptr))(row_mt_sync, mb_row_in_tile, mb_col_in_tile);
+
+    if (mb_col_in_tile == 0) {
+      last_mv = *first_top_mv;
+    }
+    int this_intra_error = firstpass_intra_prediction(
+        cpi, td, this_frame, tile, mb_row, mb_col, recon_yoffset,
+        recon_uvoffset, fp_block_size, qindex, mb_stats);
+
+    if (!frame_is_intra_only(cm)) {
+      const int this_inter_error = firstpass_inter_prediction(
+          cpi, td, last_frame, golden_frame, alt_ref_frame, mb_row, mb_col,
+          recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset,
+          fp_block_size, this_intra_error, raw_motion_err_counts,
+          raw_motion_err_list, &best_ref_mv, &last_mv, mb_stats);
+      if (mb_col_in_tile == 0) {
+        *first_top_mv = last_mv;
+      }
+      mb_stats->coded_error += this_inter_error;
+      ++raw_motion_err_counts;
+    } else {
+      mb_stats->sr_coded_error += this_intra_error;
+      mb_stats->tr_coded_error += this_intra_error;
+      mb_stats->coded_error += this_intra_error;
+    }
+
+    // Adjust to the next column of MBs.
+    x->plane[0].src.buf += fp_block_size_width;
+    x->plane[1].src.buf += uv_mb_height;
+    x->plane[2].src.buf += uv_mb_height;
+
+    recon_yoffset += fp_block_size_width;
+    src_yoffset += fp_block_size_width;
+    recon_uvoffset += uv_mb_height;
+    alt_ref_frame_yoffset += fp_block_size_width;
+    mb_stats++;
+
+    (*(enc_row_mt->sync_write_ptr))(row_mt_sync, mb_row_in_tile, mb_col_in_tile,
+                                    mb_cols_in_tile);
+  }
+}
+
+void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int qindex = find_fp_qindex(seq_params->bit_depth);
+  // Detect if the key frame is screen content type.
+  if (frame_is_intra_only(cm)) {
+    FeatureFlags *const features = &cm->features;
+    av1_set_screen_content_options(cpi, features);
+  }
+  // First pass coding proceeds in raster scan order with unit size of 16x16.
+  const BLOCK_SIZE fp_block_size = BLOCK_16X16;
+
+  setup_firstpass_data(cm, &cpi->firstpass_data);
+  int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list;
+  FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+
+  // multi threading info
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  if (cpi->allocated_tiles < tile_cols * tile_rows) {
+    av1_row_mt_mem_dealloc(cpi);
+    av1_alloc_tile_data(cpi);
+  }
+
+  av1_init_tile_data(cpi);
+
+  const YV12_BUFFER_CONFIG *const last_frame =
+      get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  const YV12_BUFFER_CONFIG *golden_frame =
+      get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
   // First pass code requires valid last and new frame buffers.
   assert(this_frame != NULL);
   assert(frame_is_intra_only(cm) || (last_frame != NULL));
@@ -903,12 +1104,14 @@
   aom_clear_system_state();
 
   set_mi_offsets(mi_params, xd, 0, 0);
-  xd->mi[0]->sb_type = fp_block_size;
+  xd->mi[0]->bsize = fp_block_size;
 
   // Do not use periodic key frames.
   cpi->rc.frames_to_key = INT_MAX;
 
-  av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, qindex);
+  av1_set_quantizer(cm, cpi->oxcf.q_cfg.qm_minlevel,
+                    cpi->oxcf.q_cfg.qm_maxlevel, qindex,
+                    cpi->oxcf.q_cfg.enable_chroma_deltaq);
 
   av1_setup_block_planes(xd, seq_params->subsampling_x,
                          seq_params->subsampling_y, num_planes);
@@ -927,82 +1130,27 @@
   xd->cfl.store_y = 0;
   av1_frame_init_quantizer(cpi);
 
-  for (int i = 0; i < num_planes; ++i) {
-    x->plane[i].coeff = ctx->coeff[i];
-    x->plane[i].qcoeff = ctx->qcoeff[i];
-    x->plane[i].eobs = ctx->eobs[i];
-    x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-    xd->plane[i].dqcoeff = ctx->dqcoeff[i];
-  }
-
   av1_init_mv_probs(cm);
   av1_initialize_rd_consts(cpi);
 
-  const int src_y_stride = cpi->source->y_stride;
-  const int recon_y_stride = this_frame->y_stride;
-  const int recon_uv_stride = this_frame->uv_stride;
-  const int uv_mb_height =
-      fp_block_size_height >> (this_frame->y_height > this_frame->uv_height);
+  enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+  enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
 
-  for (int mb_row = 0; mb_row < mi_params->mb_rows; ++mb_row) {
-    MV best_ref_mv = kZeroMv;
-
-    // Reset above block coeffs.
-    xd->up_available = (mb_row != 0);
-    int recon_yoffset = (mb_row * recon_y_stride * fp_block_size_height);
-    int src_yoffset = (mb_row * src_y_stride * fp_block_size_height);
-    int recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
-    int alt_ref_frame_yoffset =
-        (alt_ref_frame != NULL)
-            ? mb_row * alt_ref_frame->y_stride * fp_block_size_height
-            : -1;
-
-    // Set up limit values for motion vectors to prevent them extending
-    // outside the UMV borders.
-    av1_set_mv_row_limits(mi_params, &x->mv_limits, (mb_row << 2),
-                          (fp_block_size_height >> MI_SIZE_LOG2),
-                          cpi->oxcf.border_in_pixels);
-
-    for (int mb_col = 0; mb_col < mi_params->mb_cols; ++mb_col) {
-      int this_intra_error = firstpass_intra_prediction(
-          cpi, this_frame, &tile, mb_row, mb_col, recon_yoffset, recon_uvoffset,
-          fp_block_size, qindex, &stats);
-
-      if (!frame_is_intra_only(cm)) {
-        const int this_inter_error = firstpass_inter_prediction(
-            cpi, last_frame, golden_frame, alt_ref_frame, mb_row, mb_col,
-            recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset,
-            fp_block_size, this_intra_error, raw_motion_err_counts,
-            raw_motion_err_list, &best_ref_mv, &last_mv, &stats);
-        stats.coded_error += this_inter_error;
-        ++raw_motion_err_counts;
-      } else {
-        stats.sr_coded_error += this_intra_error;
-        stats.tr_coded_error += this_intra_error;
-        stats.coded_error += this_intra_error;
-      }
-
-      // Adjust to the next column of MBs.
-      x->plane[0].src.buf += fp_block_size_width;
-      x->plane[1].src.buf += uv_mb_height;
-      x->plane[2].src.buf += uv_mb_height;
-
-      recon_yoffset += fp_block_size_width;
-      src_yoffset += fp_block_size_width;
-      recon_uvoffset += uv_mb_height;
-      alt_ref_frame_yoffset += fp_block_size_width;
-    }
-    // Adjust to the next row of MBs.
-    x->plane[0].src.buf += fp_block_size_height * x->plane[0].src.stride -
-                           fp_block_size_width * mi_params->mb_cols;
-    x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride -
-                           uv_mb_height * mi_params->mb_cols;
-    x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride -
-                           uv_mb_height * mi_params->mb_cols;
+  if (mt_info->num_workers > 1) {
+    enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+    enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+    av1_fp_encode_tiles_row_mt(cpi);
+  } else {
+    first_pass_tiles(cpi);
   }
+
+  FRAME_STATS stats =
+      accumulate_frame_stats(mb_stats, mi_params->mb_rows, mi_params->mb_cols);
+  int total_raw_motion_err_count =
+      frame_is_intra_only(cm) ? 0 : mi_params->mb_rows * mi_params->mb_cols;
   const double raw_err_stdev =
-      raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts);
-  aom_free(raw_motion_err_list);
+      raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count);
+  free_firstpass_data(&cpi->firstpass_data);
 
   // Clamp the image start to rows/2. This number of rows is discarded top
   // and bottom as dead data so rows / 2 means the frame is blank.
@@ -1018,8 +1166,9 @@
   }
 
   TWO_PASS *twopass = &cpi->twopass;
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : mi_params->MBs;
+  const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                          ? cpi->initial_mbs
+                          : mi_params->MBs;
   stats.intra_factor = stats.intra_factor / (double)num_mbs;
   stats.brightness_factor = stats.brightness_factor / (double)num_mbs;
   FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h
index 99d4445..8764e77 100644
--- a/av1/encoder/firstpass.h
+++ b/av1/encoder/firstpass.h
@@ -30,75 +30,136 @@
 
 #define VLOW_MOTION_THRESHOLD 950
 
+// size of firstpass macroblocks in terms of MIs.
+#define FP_MIB_SIZE 4
+#define FP_MIB_SIZE_LOG2 2
+
+/*!
+ * \brief The stucture of acummulated frame stats in the first pass.
+ */
 typedef struct {
-  // Frame number in display order, if stats are for a single frame.
-  // No real meaning for a collection of frames.
+  /*!
+   * Frame number in display order, if stats are for a single frame.
+   * No real meaning for a collection of frames.
+   */
   double frame;
-  // Weight assigned to this frame (or total weight for the collection of
-  // frames) currently based on intra factor and brightness factor. This is used
-  // to distribute bits betweeen easier and harder frames.
+  /*!
+   * Weight assigned to this frame (or total weight for the collection of
+   * frames) currently based on intra factor and brightness factor. This is used
+   * to distribute bits betweeen easier and harder frames.
+   */
   double weight;
-  // Intra prediction error.
+  /*!
+   * Intra prediction error.
+   */
   double intra_error;
-  // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+  /*!
+   * Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+   */
   double frame_avg_wavelet_energy;
-  // Best of intra pred error and inter pred error using last frame as ref.
+  /*!
+   * Best of intra pred error and inter pred error using last frame as ref.
+   */
   double coded_error;
-  // Best of intra pred error and inter pred error using golden frame as ref.
+  /*!
+   * Best of intra pred error and inter pred error using golden frame as ref.
+   */
   double sr_coded_error;
-  // Best of intra pred error and inter pred error using altref frame as ref.
+  /*!
+   * Best of intra pred error and inter pred error using altref frame as ref.
+   */
   double tr_coded_error;
-  // Percentage of blocks with inter pred error < intra pred error.
+  /*!
+   * Percentage of blocks with inter pred error < intra pred error.
+   */
   double pcnt_inter;
-  // Percentage of blocks using (inter prediction and) non-zero motion vectors.
+  /*!
+   * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+   */
   double pcnt_motion;
-  // Percentage of blocks where golden frame was better than last or intra:
-  // inter pred error using golden frame < inter pred error using last frame and
-  // inter pred error using golden frame < intra pred error
+  /*!
+   * Percentage of blocks where golden frame was better than last or intra:
+   * inter pred error using golden frame < inter pred error using last frame and
+   * inter pred error using golden frame < intra pred error
+   */
   double pcnt_second_ref;
-  // Percentage of blocks where altref frame was better than intra, last, golden
+  /*!
+   * Percentage of blocks where altref frame was better than intra, last, golden
+   */
   double pcnt_third_ref;
-  // Percentage of blocks where intra and inter prediction errors were very
-  // close. Note that this is a 'weighted count', that is, the so blocks may be
-  // weighted by how close the two errors were.
+  /*!
+   * Percentage of blocks where intra and inter prediction errors were very
+   * close. Note that this is a 'weighted count', that is, the so blocks may be
+   * weighted by how close the two errors were.
+   */
   double pcnt_neutral;
-  // Percentage of blocks that have almost no intra error residual
-  // (i.e. are in effect completely flat and untextured in the intra
-  // domain). In natural videos this is uncommon, but it is much more
-  // common in animations, graphics and screen content, so may be used
-  // as a signal to detect these types of content.
+  /*!
+   * Percentage of blocks that have almost no intra error residual
+   * (i.e. are in effect completely flat and untextured in the intra
+   * domain). In natural videos this is uncommon, but it is much more
+   * common in animations, graphics and screen content, so may be used
+   * as a signal to detect these types of content.
+   */
   double intra_skip_pct;
-  // Image mask rows top and bottom.
+  /*!
+   * Image mask rows top and bottom.
+   */
   double inactive_zone_rows;
-  // Image mask columns at left and right edges.
+  /*!
+   * Image mask columns at left and right edges.
+   */
   double inactive_zone_cols;
-  // Average of row motion vectors.
+  /*!
+   * Average of row motion vectors.
+   */
   double MVr;
-  // Mean of absolute value of row motion vectors.
+  /*!
+   * Mean of absolute value of row motion vectors.
+   */
   double mvr_abs;
-  // Mean of column motion vectors.
+  /*!
+   * Mean of column motion vectors.
+   */
   double MVc;
-  // Mean of absolute value of column motion vectors.
+  /*!
+   * Mean of absolute value of column motion vectors.
+   */
   double mvc_abs;
-  // Variance of row motion vectors.
+  /*!
+   * Variance of row motion vectors.
+   */
   double MVrv;
-  // Variance of column motion vectors.
+  /*!
+   * Variance of column motion vectors.
+   */
   double MVcv;
-  // Value in range [-1,1] indicating fraction of row and column motion vectors
-  // that point inwards (negative MV value) or outwards (positive MV value).
-  // For example, value of 1 indicates, all row/column MVs are inwards.
+  /*!
+   * Value in range [-1,1] indicating fraction of row and column motion vectors
+   * that point inwards (negative MV value) or outwards (positive MV value).
+   * For example, value of 1 indicates, all row/column MVs are inwards.
+   */
   double mv_in_out_count;
-  // Count of unique non-zero motion vectors.
+  /*!
+   * Count of unique non-zero motion vectors.
+   */
   double new_mv_count;
-  // Duration of the frame / collection of frames.
+  /*!
+   * Duration of the frame / collection of frames.
+   */
   double duration;
-  // 1.0 if stats are for a single frame, OR
-  // Number of frames in this collection for which the stats are accumulated.
+  /*!
+   * 1.0 if stats are for a single frame, OR
+   * Number of frames in this collection for which the stats are accumulated.
+   */
   double count;
-  // standard deviation for (0, 0) motion prediction error
+  /*!
+   * standard deviation for (0, 0) motion prediction error
+   */
   double raw_error_stdev;
 } FIRSTPASS_STATS;
 
+/*!\cond */
+
 #define FC_ANIMATION_THRESH 0.15
 enum {
   FC_NORMAL = 0,
@@ -106,19 +167,21 @@
   FRAME_CONTENT_TYPES = 2
 } UENUM1BYTE(FRAME_CONTENT_TYPE);
 
+/*!\endcond */
+/*!
+ * \brief  Data related to the current GF/ARF group and the
+ * individual frames within the group
+ */
 typedef struct {
+  /*!\cond */
+  // The frame processing order within a GOP
   unsigned char index;
+  // Frame update type, e.g. ARF/GF/LF/Overlay
   FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH];
   unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH];
   // The number of frames displayed so far within the GOP at a given coding
   // frame.
   unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH];
-  unsigned char frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH];
-  int ref_frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
-  int ref_frame_gop_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
-
-  // TODO(jingning): Unify the data structure used here after the new control
-  // mechanism is in place.
   int layer_depth[MAX_STATIC_GF_GROUP_LENGTH];
   int arf_boost[MAX_STATIC_GF_GROUP_LENGTH];
   int max_layer_depth;
@@ -126,8 +189,20 @@
   // This is currently only populated for AOM_Q mode
   unsigned char q_val[MAX_STATIC_GF_GROUP_LENGTH];
   int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
-  int size;
+  // The frame coding type - inter/intra frame
+  FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH];
+  // The reference frame buffer control - update or reset
+  REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH];
+  int arf_index;  // the index in the gf group of ARF, if no arf, then -1
+  int size;       // The total length of a GOP
+  /*!\endcond */
 } GF_GROUP;
+/*!\cond */
+
+typedef struct {
+  // Track if the last frame in a GOP has higher quality.
+  int arf_gf_boost_lst;
+} GF_STATE;
 
 typedef struct {
   FIRSTPASS_STATS *stats_in_start;
@@ -137,7 +212,13 @@
   FIRSTPASS_STATS *total_left_stats;
 } STATS_BUFFER_CTX;
 
+/*!\endcond */
+
+/*!
+ * \brief Two pass status and control data.
+ */
 typedef struct {
+  /*!\cond */
   unsigned int section_intra_rating;
   // Circular queue of first pass stats stored for most recent frames.
   // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored
@@ -177,17 +258,111 @@
   int extend_minq;
   int extend_maxq;
   int extend_minq_fast;
+  /*!\endcond */
 } TWO_PASS;
 
+/*!\cond */
+
+// This structure contains several key parameters to be accumulated for this
+// frame.
+typedef struct {
+  // Intra prediction error.
+  int64_t intra_error;
+  // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+  int64_t frame_avg_wavelet_energy;
+  // Best of intra pred error and inter pred error using last frame as ref.
+  int64_t coded_error;
+  // Best of intra pred error and inter pred error using golden frame as ref.
+  int64_t sr_coded_error;
+  // Best of intra pred error and inter pred error using altref frame as ref.
+  int64_t tr_coded_error;
+  // Count of motion vector.
+  int mv_count;
+  // Count of blocks that pick inter prediction (inter pred error is smaller
+  // than intra pred error).
+  int inter_count;
+  // Count of blocks that pick second ref (golden frame).
+  int second_ref_count;
+  // Count of blocks that pick third ref (altref frame).
+  int third_ref_count;
+  // Count of blocks where the inter and intra are very close and very low.
+  double neutral_count;
+  // Count of blocks where intra error is very small.
+  int intra_skip_count;
+  // Start row.
+  int image_data_start_row;
+  // Count of unique non-zero motion vectors.
+  int new_mv_count;
+  // Sum of inward motion vectors.
+  int sum_in_vectors;
+  // Sum of motion vector row.
+  int sum_mvr;
+  // Sum of motion vector column.
+  int sum_mvc;
+  // Sum of absolute value of motion vector row.
+  int sum_mvr_abs;
+  // Sum of absolute value of motion vector column.
+  int sum_mvc_abs;
+  // Sum of the square of motion vector row.
+  int64_t sum_mvrs;
+  // Sum of the square of motion vector column.
+  int64_t sum_mvcs;
+  // A factor calculated using intra pred error.
+  double intra_factor;
+  // A factor that measures brightness.
+  double brightness_factor;
+} FRAME_STATS;
+
+// This structure contains first pass data.
+typedef struct {
+  // Buffer holding frame stats for all MACROBLOCKs.
+  // mb_stats[i] stores the FRAME_STATS of the ith
+  // MB in raster scan order.
+  FRAME_STATS *mb_stats;
+  // Buffer to store the prediction error of the (0,0) motion
+  // vector using the last source frame as the reference.
+  // raw_motion_err_list[i] stores the raw_motion_err of
+  // the ith MB in raster scan order.
+  int *raw_motion_err_list;
+} FirstPassData;
+
 struct AV1_COMP;
 struct EncodeFrameParams;
 struct AV1EncoderConfig;
+struct TileDataEnc;
+
+int av1_get_mb_rows_in_tile(TileInfo tile);
+int av1_get_mb_cols_in_tile(TileInfo tile);
 
 void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
-void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
+void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td,
+                        struct TileDataEnc *tile_data, int mb_row);
 void av1_end_first_pass(struct AV1_COMP *cpi);
 
 void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+                          const FIRSTPASS_STATS *frame);
+/*!\endcond */
+
+/*!\brief AV1 first pass encoding.
+ *
+ * \ingroup rate_control
+ * This function is the first encoding pass for the two pass encoding mode.
+ * It encodes the whole video and collect essential information.
+ * Two pass encoding is an encoding mode in the reference software (libaom)
+ * of AV1 for high performance encoding. The first pass is a fast encoding
+ * process to collect essential information to help the second pass make
+ * encoding decisions and improve coding quality. The collected stats is used
+ * in rate control, for example, to determine frame cut, the position of
+ * alternative reference frame (ARF), etc.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    ts_duration    Duration of the frame / collection of frames
+ *
+ * \return Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi"
+ * is modified to store information computed in this function.
+ */
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index 9623ec3..e299f1d 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -64,11 +64,9 @@
   double *level_dy_buffer;
 } ImagePyramid;
 
-int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost,
-                                 int erroradv_type) {
-  assert(erroradv_type < GM_ERRORADV_TR_TYPES);
-  return best_erroradvantage < erroradv_tr[erroradv_type] &&
-         best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type];
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
+  return best_erroradvantage < erroradv_tr &&
+         best_erroradvantage * params_cost < erroradv_prod_tr;
 }
 
 static void convert_to_params(const double *params, int32_t *model) {
@@ -421,8 +419,8 @@
 }
 
 static int compute_global_motion_feature_based(
-    TransformationType type, unsigned char *frm_buffer, int frm_width,
-    int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
+    TransformationType type, unsigned char *src_buffer, int src_width,
+    int src_height, int src_stride, int *src_corners, int num_src_corners,
     YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
     MotionModel *params_by_motion, int num_motions) {
   int i;
@@ -443,10 +441,10 @@
 
   // find correspondences between the two images
   correspondences =
-      (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+      (int *)malloc(num_src_corners * 4 * sizeof(*correspondences));
   num_correspondences = av1_determine_correspondence(
-      frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
-      (int *)ref_corners, num_ref_corners, frm_width, frm_height, frm_stride,
+      src_buffer, (int *)src_corners, num_src_corners, ref_buffer,
+      (int *)ref_corners, num_ref_corners, src_width, src_height, src_stride,
       ref->y_stride, correspondences);
 
   ransac(correspondences, num_correspondences, num_inliers_by_motion,
@@ -990,9 +988,9 @@
 }
 
 int av1_compute_global_motion(TransformationType type,
-                              unsigned char *frm_buffer, int frm_width,
-                              int frm_height, int frm_stride, int *frm_corners,
-                              int num_frm_corners, YV12_BUFFER_CONFIG *ref,
+                              unsigned char *src_buffer, int src_width,
+                              int src_height, int src_stride, int *src_corners,
+                              int num_src_corners, YV12_BUFFER_CONFIG *ref,
                               int bit_depth,
                               GlobalMotionEstimationType gm_estimation_type,
                               int *num_inliers_by_motion,
@@ -1000,13 +998,13 @@
   switch (gm_estimation_type) {
     case GLOBAL_MOTION_FEATURE_BASED:
       return compute_global_motion_feature_based(
-          type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners,
-          num_frm_corners, ref, bit_depth, num_inliers_by_motion,
+          type, src_buffer, src_width, src_height, src_stride, src_corners,
+          num_src_corners, ref, bit_depth, num_inliers_by_motion,
           params_by_motion, num_motions);
     case GLOBAL_MOTION_DISFLOW_BASED:
       return compute_global_motion_disflow_based(
-          type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners,
-          num_frm_corners, ref, bit_depth, num_inliers_by_motion,
+          type, src_buffer, src_width, src_height, src_stride, src_corners,
+          num_src_corners, ref, bit_depth, num_inliers_by_motion,
           params_by_motion, num_motions);
     default: assert(0 && "Unknown global motion estimation type");
   }
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h
index 0a6d0ec..a70bfa8 100644
--- a/av1/encoder/global_motion.h
+++ b/av1/encoder/global_motion.h
@@ -14,6 +14,8 @@
 
 #include "aom/aom_integer.h"
 #include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
 #include "av1/common/mv.h"
 #include "av1/common/warped_motion.h"
 
@@ -24,6 +26,7 @@
 #define MAX_CORNERS 4096
 #define RANSAC_NUM_MOTIONS 1
 #define GM_REFINEMENT_COUNT 5
+#define MAX_DIRECTIONS 2
 
 typedef enum {
   GLOBAL_MOTION_FEATURE_BASED,
@@ -38,16 +41,70 @@
   int num_inliers;
 } MotionModel;
 
+// The structure holds a valid reference frame type and its temporal distance
+// from the source frame.
+typedef struct {
+  int distance;
+  MV_REFERENCE_FRAME frame;
+} FrameDistPair;
+
+typedef struct {
+  // Array of structure which holds the global motion parameters for a given
+  // motion model. params_by_motion[i] holds the parameters for a given motion
+  // model for the ith ransac motion.
+  MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
+
+  // Pointer to hold inliers from motion model.
+  uint8_t *segment_map;
+} GlobalMotionThreadData;
+
+typedef struct {
+  // Holds the mapping of each thread to past/future direction.
+  // thread_id_to_dir[i] indicates the direction id (past - 0/future - 1)
+  // assigned to the ith thread.
+  int8_t thread_id_to_dir[MAX_NUM_THREADS];
+
+  // A flag which holds the early exit status based on the speed feature
+  // 'prune_ref_frame_for_gm_search'. early_exit[i] will be set if the speed
+  // feature based early exit happens in the direction 'i'.
+  int8_t early_exit[MAX_DIRECTIONS];
+
+  // Counter for the next reference frame to be processed.
+  // next_frame_to_process[i] will hold the count of next reference frame to be
+  // processed in the direction 'i'.
+  int8_t next_frame_to_process[MAX_DIRECTIONS];
+} JobInfo;
+
+typedef struct {
+  // Data related to assigning jobs for global motion multi-threading.
+  JobInfo job_info;
+
+  // Data specific to each worker in global motion multi-threading.
+  // thread_data[i] stores the thread specific data for worker 'i'.
+  GlobalMotionThreadData *thread_data;
+
+#if CONFIG_MULTITHREAD
+  // Mutex lock used while dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif
+
+  // Width and height for which segment_map is allocated for each thread.
+  int allocated_width;
+  int allocated_height;
+
+  // Number of workers for which thread_data is allocated.
+  int8_t allocated_workers;
+} AV1GlobalMotionSync;
+
 void av1_convert_model_to_params(const double *params,
                                  WarpedMotionParams *model);
 
 // TODO(sarahparker) These need to be retuned for speed 0 and 1 to
 // maximize gains from segmented error metric
-static const double erroradv_tr[] = { 0.65, 0.60, 0.65 };
-static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
+static const double erroradv_tr = 0.65;
+static const double erroradv_prod_tr = 20000;
 
-int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost,
-                                 int erroradv_type);
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost);
 
 void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
                                           int height, int *inliers,
@@ -88,9 +145,9 @@
   num_inliers entry is 0 should be ignored by the caller.
 */
 int av1_compute_global_motion(TransformationType type,
-                              unsigned char *frm_buffer, int frm_width,
-                              int frm_height, int frm_stride, int *frm_corners,
-                              int num_frm_corners, YV12_BUFFER_CONFIG *ref,
+                              unsigned char *src_buffer, int src_width,
+                              int src_height, int src_stride, int *src_corners,
+                              int num_src_corners, YV12_BUFFER_CONFIG *ref,
                               int bit_depth,
                               GlobalMotionEstimationType gm_estimation_type,
                               int *num_inliers_by_motion,
diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c
new file mode 100644
index 0000000..4cb4c43
--- /dev/null
+++ b/av1/encoder/global_motion_facade.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/rdopt.h"
+
+// Highest motion model to search.
+#define GLOBAL_TRANS_TYPES_ENC 3
+
+// Computes the cost for the warp parameters.
+static int gm_get_params_cost(const WarpedMotionParams *gm,
+                              const WarpedMotionParams *ref_gm, int allow_hp) {
+  int params_cost = 0;
+  int trans_bits, trans_prec_diff;
+  switch (gm->wmtype) {
+    case AFFINE:
+    case ROTZOOM:
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
+          (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+          (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+      if (gm->wmtype >= AFFINE) {
+        params_cost += aom_count_signed_primitive_refsubexpfin(
+            GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+            (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+            (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+        params_cost += aom_count_signed_primitive_refsubexpfin(
+            GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+            (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+                (1 << GM_ALPHA_PREC_BITS),
+            (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+      }
+      AOM_FALLTHROUGH_INTENDED;
+    case TRANSLATION:
+      trans_bits = (gm->wmtype == TRANSLATION)
+                       ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+                       : GM_ABS_TRANS_BITS;
+      trans_prec_diff = (gm->wmtype == TRANSLATION)
+                            ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+                            : GM_TRANS_PREC_DIFF;
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          (1 << trans_bits) + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[0] >> trans_prec_diff),
+          (gm->wmmat[0] >> trans_prec_diff));
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          (1 << trans_bits) + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[1] >> trans_prec_diff),
+          (gm->wmmat[1] >> trans_prec_diff));
+      AOM_FALLTHROUGH_INTENDED;
+    case IDENTITY: break;
+    default: assert(0);
+  }
+  return (params_cost << AV1_PROB_COST_SHIFT);
+}
+
+// Calculates the threshold to be used for warp error computation.
+static AOM_INLINE int64_t calc_erroradv_threshold(int64_t ref_frame_error) {
+  return (int64_t)(ref_frame_error * erroradv_tr + 0.5);
+}
+
+// For the given reference frame, computes the global motion parameters for
+// different motion models and finds the best.
+static AOM_INLINE void compute_global_motion_for_ref_frame(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    int num_src_corners, int *src_corners, unsigned char *src_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map,
+    const int segment_map_w, const int segment_map_h,
+    const WarpedMotionParams *ref_params) {
+  ThreadData *const td = &cpi->td;
+  MACROBLOCK *const x = &td->mb;
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int i;
+  int src_width = cpi->source->y_width;
+  int src_height = cpi->source->y_height;
+  int src_stride = cpi->source->y_stride;
+  // clang-format off
+  static const double kIdentityParams[MAX_PARAMDIM - 1] = {
+     0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
+  };
+  // clang-format on
+  WarpedMotionParams tmp_wm_params;
+  const double *params_this_motion;
+  int inliers_by_motion[RANSAC_NUM_MOTIONS];
+  assert(ref_buf[frame] != NULL);
+  TransformationType model;
+
+  aom_clear_system_state();
+
+  // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
+  const int do_adaptive_gm_estimation = 0;
+
+  const int ref_frame_dist = get_relative_dist(
+      &cm->seq_params.order_hint_info, cm->current_frame.order_hint,
+      cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
+  const GlobalMotionEstimationType gm_estimation_type =
+      cm->seq_params.order_hint_info.enable_order_hint &&
+              abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
+          ? GLOBAL_MOTION_DISFLOW_BASED
+          : GLOBAL_MOTION_FEATURE_BASED;
+  for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
+    int64_t best_warp_error = INT64_MAX;
+    // Initially set all params to identity.
+    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+      memcpy(params_by_motion[i].params, kIdentityParams,
+             (MAX_PARAMDIM - 1) * sizeof(*(params_by_motion[i].params)));
+      params_by_motion[i].num_inliers = 0;
+    }
+
+    av1_compute_global_motion(model, src_buffer, src_width, src_height,
+                              src_stride, src_corners, num_src_corners,
+                              ref_buf[frame], cpi->common.seq_params.bit_depth,
+                              gm_estimation_type, inliers_by_motion,
+                              params_by_motion, RANSAC_NUM_MOTIONS);
+    int64_t ref_frame_error = 0;
+    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+      if (inliers_by_motion[i] == 0) continue;
+
+      params_this_motion = params_by_motion[i].params;
+      av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
+
+      if (tmp_wm_params.wmtype != IDENTITY) {
+        av1_compute_feature_segmentation_map(
+            segment_map, segment_map_w, segment_map_h,
+            params_by_motion[i].inliers, params_by_motion[i].num_inliers);
+
+        ref_frame_error = av1_segmented_frame_error(
+            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+            ref_buf[frame]->y_stride, cpi->source->y_buffer, src_width,
+            src_height, src_stride, segment_map, segment_map_w);
+
+        const int64_t erroradv_threshold =
+            calc_erroradv_threshold(ref_frame_error);
+
+        const int64_t warp_error = av1_refine_integerized_param(
+            &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
+            ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
+            ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
+            cpi->source->y_buffer, src_width, src_height, src_stride,
+            GM_REFINEMENT_COUNT, best_warp_error, segment_map, segment_map_w,
+            erroradv_threshold);
+
+        if (warp_error < best_warp_error) {
+          best_warp_error = warp_error;
+          // Save the wm_params modified by
+          // av1_refine_integerized_param() rather than motion index to
+          // avoid rerunning refine() below.
+          memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+                 sizeof(WarpedMotionParams));
+        }
+      }
+    }
+    if (cm->global_motion[frame].wmtype <= AFFINE)
+      if (!av1_get_shear_params(&cm->global_motion[frame]))
+        cm->global_motion[frame] = default_warp_params;
+
+    if (cm->global_motion[frame].wmtype == TRANSLATION) {
+      cm->global_motion[frame].wmmat[0] =
+          convert_to_trans_prec(cm->features.allow_high_precision_mv,
+                                cm->global_motion[frame].wmmat[0]) *
+          GM_TRANS_ONLY_DECODE_FACTOR;
+      cm->global_motion[frame].wmmat[1] =
+          convert_to_trans_prec(cm->features.allow_high_precision_mv,
+                                cm->global_motion[frame].wmmat[1]) *
+          GM_TRANS_ONLY_DECODE_FACTOR;
+    }
+
+    if (cm->global_motion[frame].wmtype == IDENTITY) continue;
+
+    if (ref_frame_error == 0) continue;
+
+    // If the best error advantage found doesn't meet the threshold for
+    // this motion type, revert to IDENTITY.
+    if (!av1_is_enough_erroradvantage(
+            (double)best_warp_error / ref_frame_error,
+            gm_get_params_cost(&cm->global_motion[frame], ref_params,
+                               cm->features.allow_high_precision_mv))) {
+      cm->global_motion[frame] = default_warp_params;
+    }
+
+    if (cm->global_motion[frame].wmtype != IDENTITY) break;
+  }
+
+  aom_clear_system_state();
+}
+
+// Computes global motion for the given reference frame.
+void av1_compute_gm_for_valid_ref_frames(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    int num_src_corners, int *src_corners, unsigned char *src_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map, int segment_map_w,
+    int segment_map_h) {
+  AV1_COMMON *const cm = &cpi->common;
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  const WarpedMotionParams *ref_params =
+      cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+                     : &default_warp_params;
+
+  compute_global_motion_for_ref_frame(
+      cpi, ref_buf, frame, num_src_corners, src_corners, src_buffer,
+      params_by_motion, segment_map, segment_map_w, segment_map_h, ref_params);
+
+  gm_info->params_cost[frame] =
+      gm_get_params_cost(&cm->global_motion[frame], ref_params,
+                         cm->features.allow_high_precision_mv) +
+      gm_info->type_cost[cm->global_motion[frame].wmtype] -
+      gm_info->type_cost[IDENTITY];
+}
+
+// Loops over valid reference frames and computes global motion estimation.
+static AOM_INLINE void compute_global_motion_for_references(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+    FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames,
+    int num_src_corners, int *src_corners, unsigned char *src_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map,
+    const int segment_map_w, const int segment_map_h) {
+  // Computation of frame corners for the source frame will be done already.
+  assert(num_src_corners != -1);
+  AV1_COMMON *const cm = &cpi->common;
+  // Compute global motion w.r.t. reference frames starting from the nearest ref
+  // frame in a given direction.
+  for (int frame = 0; frame < num_ref_frames; frame++) {
+    int ref_frame = reference_frame[frame].frame;
+    av1_compute_gm_for_valid_ref_frames(
+        cpi, ref_buf, ref_frame, num_src_corners, src_corners, src_buffer,
+        params_by_motion, segment_map, segment_map_w, segment_map_h);
+    // If global motion w.r.t. current ref frame is
+    // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
+    // the remaining ref frames in that direction. The below exit is disabled
+    // when ref frame distance w.r.t. current frame is zero. E.g.:
+    // source_alt_ref_frame w.r.t. ARF frames.
+    if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+        reference_frame[frame].distance != 0 &&
+        cm->global_motion[ref_frame].wmtype != ROTZOOM)
+      break;
+  }
+}
+
+// Compares the distance in 'a' and 'b'. Returns 1 if the frame corresponding to
+// 'a' is farther, -1 if the frame corresponding to 'b' is farther, 0 otherwise.
+static int compare_distance(const void *a, const void *b) {
+  const int diff =
+      ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance;
+  if (diff > 0)
+    return 1;
+  else if (diff < 0)
+    return -1;
+  return 0;
+}
+
+// Function to decide if we can skip the global motion parameter computation
+// for a particular ref frame.
+static AOM_INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
+  if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) &&
+      cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) {
+    return get_relative_dist(
+               &cm->seq_params.order_hint_info,
+               cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME],
+               cm->cur_frame->ref_order_hints[GOLDEN_FRAME - LAST_FRAME]) <= 0;
+  }
+  return 0;
+}
+
+// Prunes reference frames for global motion estimation based on the speed
+// feature 'gm_search_type'.
+static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) {
+  (void)frame;
+  switch (sf->gm_sf.gm_search_type) {
+    case GM_FULL_SEARCH: return 1;
+    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
+      return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
+    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
+      return !(frame == LAST2_FRAME || frame == LAST3_FRAME ||
+               (frame == ALTREF2_FRAME));
+    case GM_DISABLE_SEARCH: return 0;
+    default: assert(0);
+  }
+  return 1;
+}
+
+// Populates valid reference frames in past/future directions in
+// 'reference_frames' and their count in 'num_ref_frames'.
+static AOM_INLINE void update_valid_ref_frames_for_gm(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+    FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1],
+    int *num_ref_frames) {
+  AV1_COMMON *const cm = &cpi->common;
+  int *num_past_ref_frames = &num_ref_frames[0];
+  int *num_future_ref_frames = &num_ref_frames[1];
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+      gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, gf_group->index);
+
+  for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
+    const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME };
+    RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
+    const int ref_disabled =
+        !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
+    ref_buf[frame] = NULL;
+    cm->global_motion[frame] = default_warp_params;
+    // Skip global motion estimation for invalid ref frames
+    if (buf == NULL ||
+        (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) {
+      cpi->gm_info.params_cost[frame] = 0;
+      continue;
+    } else {
+      ref_buf[frame] = &buf->buf;
+    }
+
+    int prune_ref_frames =
+        ref_pruning_enabled &&
+        prune_ref_by_selective_ref_frame(cpi, NULL, ref_frame,
+                                         cm->cur_frame->ref_display_order_hint);
+
+    if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
+        ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
+        do_gm_search_logic(&cpi->sf, frame) && !prune_ref_frames &&
+        !(cpi->sf.gm_sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
+      assert(ref_buf[frame] != NULL);
+      const int relative_frame_dist = av1_encoder_get_relative_dist(
+          buf->display_order_hint, cm->cur_frame->display_order_hint);
+      // Populate past and future ref frames.
+      // reference_frames[0][] indicates past direction and
+      // reference_frames[1][] indicates future direction.
+      if (relative_frame_dist <= 0) {
+        reference_frames[0][*num_past_ref_frames].distance =
+            abs(relative_frame_dist);
+        reference_frames[0][*num_past_ref_frames].frame = frame;
+        (*num_past_ref_frames)++;
+      } else {
+        reference_frames[1][*num_future_ref_frames].distance =
+            abs(relative_frame_dist);
+        reference_frames[1][*num_future_ref_frames].frame = frame;
+        (*num_future_ref_frames)++;
+      }
+    }
+  }
+}
+
+// Allocates and initializes memory for segment_map and MotionModel.
+static AOM_INLINE void alloc_global_motion_data(MotionModel *params_by_motion,
+                                                uint8_t **segment_map,
+                                                const int segment_map_w,
+                                                const int segment_map_h) {
+  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+    av1_zero(params_by_motion[m]);
+    params_by_motion[m].inliers =
+        aom_malloc(sizeof(*(params_by_motion[m].inliers)) * 2 * MAX_CORNERS);
+  }
+
+  *segment_map = (uint8_t *)aom_malloc(sizeof(*segment_map) * segment_map_w *
+                                       segment_map_h);
+  av1_zero_array(*segment_map, segment_map_w * segment_map_h);
+}
+
+// Deallocates segment_map and inliers.
+static AOM_INLINE void dealloc_global_motion_data(MotionModel *params_by_motion,
+                                                  uint8_t *segment_map) {
+  aom_free(segment_map);
+
+  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+    aom_free(params_by_motion[m].inliers);
+  }
+}
+
+// Initializes parameters used for computing global motion.
+static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) {
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  YV12_BUFFER_CONFIG *source = cpi->source;
+
+  gm_info->src_buffer = source->y_buffer;
+  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // The source buffer is 16-bit, so we need to convert to 8 bits for the
+    // following code. We cache the result until the source frame is released.
+    gm_info->src_buffer =
+        av1_downconvert_frame(source, cpi->common.seq_params.bit_depth);
+  }
+
+  gm_info->segment_map_w =
+      (source->y_width + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
+  gm_info->segment_map_h =
+      (source->y_height + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
+
+  memset(gm_info->reference_frames, -1,
+         sizeof(gm_info->reference_frames[0][0]) * MAX_DIRECTIONS *
+             (REF_FRAMES - 1));
+  av1_zero(gm_info->num_ref_frames);
+
+  // Populate ref_buf for valid ref frames in global motion
+  update_valid_ref_frames_for_gm(cpi, gm_info->ref_buf,
+                                 gm_info->reference_frames,
+                                 gm_info->num_ref_frames);
+
+  // Sort the past and future ref frames in the ascending order of their
+  // distance from the current frame. reference_frames[0] => past direction
+  // and reference_frames[1] => future direction.
+  qsort(gm_info->reference_frames[0], gm_info->num_ref_frames[0],
+        sizeof(gm_info->reference_frames[0][0]), compare_distance);
+  qsort(gm_info->reference_frames[1], gm_info->num_ref_frames[1],
+        sizeof(gm_info->reference_frames[1][0]), compare_distance);
+
+  gm_info->num_src_corners = -1;
+  // If atleast one valid reference frame exists in past/future directions,
+  // compute interest points of source frame using FAST features.
+  if (gm_info->num_ref_frames[0] > 0 || gm_info->num_ref_frames[1] > 0) {
+    gm_info->num_src_corners = av1_fast_corner_detect(
+        gm_info->src_buffer, source->y_width, source->y_height,
+        source->y_stride, gm_info->src_corners, MAX_CORNERS);
+  }
+}
+
+// Computes global motion w.r.t. valid reference frames.
+static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) {
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
+  uint8_t *segment_map = NULL;
+
+  alloc_global_motion_data(params_by_motion, &segment_map,
+                           gm_info->segment_map_w, gm_info->segment_map_h);
+
+  // Compute global motion w.r.t. past reference frames and future reference
+  // frames
+  for (int dir = 0; dir < MAX_DIRECTIONS; dir++) {
+    if (gm_info->num_ref_frames[dir] > 0)
+      compute_global_motion_for_references(
+          cpi, gm_info->ref_buf, gm_info->reference_frames[dir],
+          gm_info->num_ref_frames[dir], gm_info->num_src_corners,
+          gm_info->src_corners, gm_info->src_buffer, params_by_motion,
+          segment_map, gm_info->segment_map_w, gm_info->segment_map_h);
+  }
+
+  dealloc_global_motion_data(params_by_motion, segment_map);
+}
+
+// Global motion estimation for the current frame is computed.This computation
+// happens once per frame and the winner motion model parameters are stored in
+// cm->cur_frame->global_motion.
+void av1_compute_global_motion_facade(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+
+  av1_zero(cpi->td.rd_counts.global_motion_used);
+  av1_zero(gm_info->params_cost);
+
+  if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
+      cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done) {
+    setup_global_motion_info_params(cpi);
+    if (cpi->mt_info.num_workers > 1)
+      av1_global_motion_estimation_mt(cpi);
+    else
+      global_motion_estimation(cpi);
+    gm_info->search_done = 1;
+  }
+  memcpy(cm->cur_frame->global_motion, cm->global_motion,
+         sizeof(cm->cur_frame->global_motion));
+}
diff --git a/av1/encoder/global_motion_facade.h b/av1/encoder/global_motion_facade.h
new file mode 100644
index 0000000..52df19d
--- /dev/null
+++ b/av1/encoder/global_motion_facade.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+void av1_compute_gm_for_valid_ref_frames(
+    struct AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    int num_src_corners, int *src_corners, unsigned char *src_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map, int segment_map_w,
+    int segment_map_h);
+void av1_compute_global_motion_facade(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
diff --git a/av1/encoder/gop_structure.c b/av1/encoder/gop_structure.c
index 1ed71a0..0e4968a 100644
--- a/av1/encoder/gop_structure.c
+++ b/av1/encoder/gop_structure.c
@@ -11,6 +11,7 @@
 
 #include <stdint.h>
 
+#include "av1/common/blockd.h"
 #include "config/aom_config.h"
 #include "config/aom_scale_rtcd.h"
 
@@ -30,38 +31,39 @@
                                    GF_GROUP *const gf_group, RATE_CONTROL *rc,
                                    FRAME_INFO *frame_info, int start, int end,
                                    int *cur_frame_idx, int *frame_ind,
-                                   int arf_ind, int layer_depth) {
-  const int num_frames_to_process = end - start - 1;
-  assert(num_frames_to_process >= 0);
-  if (num_frames_to_process == 0) return;
+                                   int layer_depth) {
+  const int num_frames_to_process = end - start;
 
   // Either we are at the last level of the pyramid, or we don't have enough
   // frames between 'l' and 'r' to create one more level.
   if (layer_depth > gf_group->max_layer_depth_allowed ||
       num_frames_to_process < 3) {
     // Leaf nodes.
-    while (++start < end) {
+    while (start < end) {
       gf_group->update_type[*frame_ind] = LF_UPDATE;
       gf_group->arf_src_offset[*frame_ind] = 0;
-      ++*cur_frame_idx;
       gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
-      gf_group->frame_disp_idx[*frame_ind] = start;
       gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
       gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
           twopass, rc, frame_info, start, end - start, 0, NULL, NULL);
+      gf_group->frame_type[*frame_ind] = INTER_FRAME;
+      gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
       gf_group->max_layer_depth =
           AOMMAX(gf_group->max_layer_depth, layer_depth);
       ++(*frame_ind);
+      ++(*cur_frame_idx);
+      ++start;
     }
   } else {
-    const int m = (start + end) / 2;
+    const int m = (start + end - 1) / 2;
 
     // Internal ARF.
     gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
-    gf_group->arf_src_offset[*frame_ind] = m - start - 1;
+    gf_group->arf_src_offset[*frame_ind] = m - start;
     gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
-    gf_group->frame_disp_idx[*frame_ind] = m;
     gf_group->layer_depth[*frame_ind] = layer_depth;
+    gf_group->frame_type[*frame_ind] = INTER_FRAME;
+    gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
 
     // Get the boost factor for intermediate ARF frames.
     gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
@@ -70,20 +72,22 @@
 
     // Frames displayed before this internal ARF.
     set_multi_layer_params(twopass, gf_group, rc, frame_info, start, m,
-                           cur_frame_idx, frame_ind, 1, layer_depth + 1);
+                           cur_frame_idx, frame_ind, layer_depth + 1);
 
     // Overlay for internal ARF.
     gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
     gf_group->arf_src_offset[*frame_ind] = 0;
     gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
-    gf_group->frame_disp_idx[*frame_ind] = m;
     gf_group->arf_boost[*frame_ind] = 0;
     gf_group->layer_depth[*frame_ind] = layer_depth;
+    gf_group->frame_type[*frame_ind] = INTER_FRAME;
+    gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
     ++(*frame_ind);
+    ++(*cur_frame_idx);
 
     // Frames displayed after this internal ARF.
-    set_multi_layer_params(twopass, gf_group, rc, frame_info, m, end,
-                           cur_frame_idx, frame_ind, arf_ind, layer_depth + 1);
+    set_multi_layer_params(twopass, gf_group, rc, frame_info, m + 1, end,
+                           cur_frame_idx, frame_ind, layer_depth + 1);
   }
 }
 
@@ -92,220 +96,110 @@
     RATE_CONTROL *rc, FRAME_INFO *const frame_info, int gf_interval,
     FRAME_UPDATE_TYPE first_frame_update_type) {
   int frame_index = 0;
+  int cur_frame_index = 0;
 
   // Keyframe / Overlay frame / Golden frame.
-  assert(gf_interval >= 1);
   assert(first_frame_update_type == KF_UPDATE ||
          first_frame_update_type == OVERLAY_UPDATE ||
          first_frame_update_type == GF_UPDATE);
 
-  gf_group->update_type[frame_index] = first_frame_update_type;
-  gf_group->arf_src_offset[frame_index] = 0;
-  gf_group->cur_frame_idx[frame_index] = 0;
-  gf_group->layer_depth[frame_index] =
-      first_frame_update_type == OVERLAY_UPDATE ? MAX_ARF_LAYERS + 1 : 0;
-  gf_group->max_layer_depth = 0;
-  ++frame_index;
+  if (first_frame_update_type == KF_UPDATE &&
+      cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1) {
+    gf_group->update_type[frame_index] = ARF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 0;
+    gf_group->frame_type[frame_index] = KEY_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_RESET;
+    gf_group->max_layer_depth = 0;
+    ++frame_index;
+
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 0;
+    gf_group->frame_type[frame_index] = INTER_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = 0;
+    ++frame_index;
+    cur_frame_index++;
+  } else if (first_frame_update_type != OVERLAY_UPDATE) {
+    gf_group->update_type[frame_index] = first_frame_update_type;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] =
+        first_frame_update_type == OVERLAY_UPDATE ? MAX_ARF_LAYERS + 1 : 0;
+    gf_group->frame_type[frame_index] =
+        (first_frame_update_type == KF_UPDATE) ? KEY_FRAME : INTER_FRAME;
+    gf_group->refbuf_state[frame_index] =
+        (first_frame_update_type == KF_UPDATE) ? REFBUF_RESET : REFBUF_UPDATE;
+    gf_group->max_layer_depth = 0;
+    ++frame_index;
+    ++cur_frame_index;
+  }
 
   // ALTREF.
   const int use_altref = gf_group->max_layer_depth_allowed > 0;
+  int is_fwd_kf = (gf_interval == cpi->rc.frames_to_key);
   if (use_altref) {
     gf_group->update_type[frame_index] = ARF_UPDATE;
-    gf_group->arf_src_offset[frame_index] = gf_interval - 1;
-    gf_group->cur_frame_idx[frame_index] = 0;
-    gf_group->frame_disp_idx[frame_index] = gf_interval;
+    gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
     gf_group->layer_depth[frame_index] = 1;
     gf_group->arf_boost[frame_index] = cpi->rc.gfu_boost;
+    gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
     gf_group->max_layer_depth = 1;
+    gf_group->arf_index = frame_index;
     ++frame_index;
+  } else {
+    gf_group->arf_index = -1;
   }
 
-  int cur_frame_index = 0;
   // Rest of the frames.
-  set_multi_layer_params(twopass, gf_group, rc, frame_info, 0, gf_interval,
-                         &cur_frame_index, &frame_index, 0, use_altref + 1);
+  set_multi_layer_params(twopass, gf_group, rc, frame_info, cur_frame_index,
+                         gf_interval, &cur_frame_index, &frame_index,
+                         use_altref + 1);
 
-  // The end frame will be Overlay frame for an ARF GOP; otherwise set it to
-  // be GF, for consistency, which will be updated in the next GOP.
-  gf_group->update_type[frame_index] = use_altref ? OVERLAY_UPDATE : GF_UPDATE;
-  gf_group->arf_src_offset[frame_index] = 0;
+  if (use_altref) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+    gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+    gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+    gf_group->refbuf_state[frame_index] =
+        is_fwd_kf ? REFBUF_RESET : REFBUF_UPDATE;
+    ++frame_index;
+  } else {
+    for (; cur_frame_index <= gf_interval; ++cur_frame_index) {
+      gf_group->update_type[frame_index] = LF_UPDATE;
+      gf_group->arf_src_offset[frame_index] = 0;
+      gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+      gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+      gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+      gf_group->frame_type[frame_index] = INTER_FRAME;
+      gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+      gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+      ++frame_index;
+    }
+  }
   return frame_index;
 }
 
-#define CHECK_GF_PARAMETER 0
-#if CHECK_GF_PARAMETER
-void check_frame_params(GF_GROUP *const gf_group, int gf_interval) {
-  static const char *update_type_strings[FRAME_UPDATE_TYPES] = {
-    "KF_UPDATE",       "LF_UPDATE",      "GF_UPDATE",
-    "ARF_UPDATE",      "OVERLAY_UPDATE", "INTNL_OVERLAY_UPDATE",
-    "INTNL_ARF_UPDATE"
-  };
-  FILE *fid = fopen("GF_PARAMS.txt", "a");
-
-  fprintf(fid, "\ngf_interval = {%d}\n", gf_interval);
-  for (int i = 0; i < gf_group->size; ++i) {
-    fprintf(fid, "#%2d : %s %d %d %d %d\n", i,
-            update_type_strings[gf_group->update_type[i]],
-            gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
-            gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
-  }
-
-  fprintf(fid, "number of nodes in each level: \n");
-  for (int i = 0; i < gf_group->pyramid_height; ++i) {
-    fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
-  }
-  fprintf(fid, "\n");
-  fclose(fid);
-}
-#endif  // CHECK_GF_PARAMETER
-
-#define REF_IDX(ref) ((ref)-LAST_FRAME)
-
-static INLINE void reset_ref_frame_idx(int *ref_idx, int reset_value) {
-  for (int i = 0; i < REF_FRAMES; ++i) ref_idx[i] = reset_value;
-}
-
-static INLINE void set_ref_frame_disp_idx(GF_GROUP *const gf_group) {
-  for (int i = 0; i < gf_group->size; ++i) {
-    for (int ref = 0; ref < INTER_REFS_PER_FRAME + 1; ++ref) {
-      int ref_gop_idx = gf_group->ref_frame_gop_idx[i][ref];
-      if (ref_gop_idx == -1) {
-        gf_group->ref_frame_disp_idx[i][ref] = -1;
-      } else {
-        gf_group->ref_frame_disp_idx[i][ref] =
-            gf_group->frame_disp_idx[ref_gop_idx];
-      }
-    }
-  }
-}
-
-static void set_gop_ref_frame_map(GF_GROUP *const gf_group) {
-  // Initialize the reference slots as all -1.
-  for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx)
-    reset_ref_frame_idx(gf_group->ref_frame_gop_idx[frame_idx], -1);
-
-  // Set the map for frames in the current gop
-  for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx) {
-    const FRAME_UPDATE_TYPE update_type = gf_group->update_type[frame_idx];
-    // TODO(yuec): need to figure out how to determine
-    // (1) whether a KEY_FRAME has show_frame on
-    // (2) whether a frame with INTNL_OVERLAY_UPDATE type has
-    //     show_existing_frame on
-    const int show_frame =
-        update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE;
-    const int show_existing_frame =
-        update_type == OVERLAY_UPDATE || update_type == INTNL_OVERLAY_UPDATE;
-
-    int this_ref_map[INTER_REFS_PER_FRAME + 1];
-    memcpy(this_ref_map, gf_group->ref_frame_gop_idx[frame_idx],
-           sizeof(this_ref_map));
-    int *next_ref_map = &gf_group->ref_frame_gop_idx[frame_idx + 1][0];
-
-    switch (update_type) {
-      case KF_UPDATE:
-        if (show_frame) {
-          reset_ref_frame_idx(this_ref_map, frame_idx);
-        } else {
-          this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
-          this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx;
-          this_ref_map[REF_IDX(ALTREF2_FRAME)] = frame_idx;
-          this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx;
-          this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx;
-        }
-        break;
-      case LF_UPDATE: this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; break;
-      case GF_UPDATE:
-        this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
-        this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx;
-        break;
-      case OVERLAY_UPDATE:
-        this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx;
-        break;
-      case ARF_UPDATE: this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; break;
-      case INTNL_OVERLAY_UPDATE:
-        if (!show_existing_frame)
-          this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
-        break;
-      case INTNL_ARF_UPDATE:
-        this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx;
-        break;
-      default: assert(0); break;
-    }
-
-    memcpy(next_ref_map, this_ref_map, sizeof(this_ref_map));
-
-    switch (update_type) {
-      case LF_UPDATE:
-      case GF_UPDATE:
-        next_ref_map[REF_IDX(LAST3_FRAME)] = this_ref_map[REF_IDX(LAST2_FRAME)];
-        next_ref_map[REF_IDX(LAST2_FRAME)] = this_ref_map[REF_IDX(LAST_FRAME)];
-        next_ref_map[REF_IDX(LAST_FRAME)] = this_ref_map[REF_IDX(LAST3_FRAME)];
-        break;
-      case INTNL_OVERLAY_UPDATE:
-        if (!show_existing_frame) {
-          next_ref_map[REF_IDX(LAST3_FRAME)] =
-              this_ref_map[REF_IDX(LAST2_FRAME)];
-          next_ref_map[REF_IDX(LAST2_FRAME)] =
-              this_ref_map[REF_IDX(LAST_FRAME)];
-          next_ref_map[REF_IDX(LAST_FRAME)] =
-              this_ref_map[REF_IDX(LAST3_FRAME)];
-        } else {
-          next_ref_map[REF_IDX(LAST_FRAME)] =
-              this_ref_map[REF_IDX(BWDREF_FRAME)];
-          next_ref_map[REF_IDX(LAST2_FRAME)] =
-              this_ref_map[REF_IDX(LAST_FRAME)];
-          next_ref_map[REF_IDX(LAST3_FRAME)] =
-              this_ref_map[REF_IDX(LAST2_FRAME)];
-          next_ref_map[REF_IDX(BWDREF_FRAME)] =
-              this_ref_map[REF_IDX(ALTREF2_FRAME)];
-          next_ref_map[REF_IDX(ALTREF2_FRAME)] =
-              this_ref_map[REF_IDX(EXTREF_FRAME)];
-          next_ref_map[REF_IDX(EXTREF_FRAME)] =
-              this_ref_map[REF_IDX(LAST3_FRAME)];
-        }
-        break;
-      case INTNL_ARF_UPDATE:
-        if (!show_existing_frame) {
-          next_ref_map[REF_IDX(BWDREF_FRAME)] =
-              this_ref_map[REF_IDX(EXTREF_FRAME)];
-          next_ref_map[REF_IDX(ALTREF2_FRAME)] =
-              this_ref_map[REF_IDX(BWDREF_FRAME)];
-          next_ref_map[REF_IDX(EXTREF_FRAME)] =
-              this_ref_map[REF_IDX(ALTREF2_FRAME)];
-        }
-        break;
-      case OVERLAY_UPDATE:
-        next_ref_map[REF_IDX(ALTREF_FRAME)] =
-            this_ref_map[REF_IDX(GOLDEN_FRAME)];
-        next_ref_map[REF_IDX(GOLDEN_FRAME)] =
-            this_ref_map[REF_IDX(ALTREF_FRAME)];
-        break;
-      default: break;
-    }
-  }
-
-  // Set the map in display order index by converting from gop indices in the
-  // above map
-  set_ref_frame_disp_idx(gf_group);
-}
-
-void av1_gop_setup_structure(AV1_COMP *cpi,
-                             const EncodeFrameParams *const frame_params) {
+void av1_gop_setup_structure(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   GF_GROUP *const gf_group = &cpi->gf_group;
   TWO_PASS *const twopass = &cpi->twopass;
   FRAME_INFO *const frame_info = &cpi->frame_info;
-  const int key_frame = (frame_params->frame_type == KEY_FRAME);
+  const int key_frame = rc->frames_since_key == 0;
   const FRAME_UPDATE_TYPE first_frame_update_type =
-      key_frame ? KF_UPDATE
-                : rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE;
+      key_frame
+          ? KF_UPDATE
+          : cpi->gf_state.arf_gf_boost_lst || (rc->baseline_gf_interval == 1)
+                ? OVERLAY_UPDATE
+                : GF_UPDATE;
   gf_group->size = construct_multi_layer_gf_structure(
-      cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval,
+      cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval - 1,
       first_frame_update_type);
-
-  set_gop_ref_frame_map(gf_group);
-
-#if CHECK_GF_PARAMETER
-  check_frame_params(gf_group, rc->baseline_gf_interval);
-#endif
 }
diff --git a/av1/encoder/gop_structure.h b/av1/encoder/gop_structure.h
index 0c775c7..6cfca22 100644
--- a/av1/encoder/gop_structure.h
+++ b/av1/encoder/gop_structure.h
@@ -18,24 +18,60 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-
+/*!\cond */
 struct AV1_COMP;
 struct EncodeFrameParams;
 
 #define MIN_ARF_GF_BOOST 240
 #define NORMAL_BOOST 100
 
-// Set up the Group-Of-Pictures structure for this GF_GROUP.  This involves
-// deciding where to place the various FRAME_UPDATE_TYPEs in the group.  It does
-// this primarily by setting the contents of
-// cpi->twopass.gf_group.update_type[].
-void av1_gop_setup_structure(
-    struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
+/*!\endcond */
 
+/*!\brief Set up the Group-Of-Pictures structure for this GF_GROUP.
+ *
+ *\ingroup rate_control
+ *
+ * This function defines the Group-Of-Pictures structure for this GF_GROUP.
+ * This involves deciding where to place the various FRAME_UPDATE_TYPEs in
+ * the group. It does this primarily by updateing entries in
+ * cpi->twopass.gf_group.update_type[].
+ *
+ * \param[in]    cpi          Top - level encoder instance structure
+ *
+ * \return No return value but this function updates group data structures.
+ */
+void av1_gop_setup_structure(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]   cpi           Top - level encoder instance structure
+ * \param[in]   rc            Rate control data
+ * \param[in]   gf_group      GF/ARF group data structure
+ * \param[in]   is_key_frame  Indicates if the first frame in the group is
+ *                            also a key frame.
+ * \param[in]   use_arf       Are ARF frames enabled or is this a GF only
+ *                            uni-directional group.
+ * \param[in]   gf_group_bits Bits available to be allocated.
+ *
+ * \return No return but updates the rate control and group data structures
+ *         to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits);
+
+/*!\cond */
 int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
                        FRAME_INFO *frame_info, int offset, int f_frames,
                        int b_frames, int *num_fpstats_used,
                        int *num_fpstats_required);
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/interp_search.c b/av1/encoder/interp_search.c
index 6b7317b..0066c35 100644
--- a/av1/encoder/interp_search.c
+++ b/av1/encoder/interp_search.c
@@ -116,12 +116,14 @@
 
 static INLINE int get_switchable_rate(MACROBLOCK *const x,
                                       const int_interpfilters filters,
-                                      const int ctx[2]) {
-  int inter_filter_cost;
+                                      const int ctx[2], int dual_filter) {
   const InterpFilter filter0 = filters.as_filters.y_filter;
-  const InterpFilter filter1 = filters.as_filters.x_filter;
-  inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0];
-  inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1];
+  int inter_filter_cost =
+      x->mode_costs.switchable_interp_costs[ctx[0]][filter0];
+  if (dual_filter) {
+    const InterpFilter filter1 = filters.as_filters.x_filter;
+    inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx[1]][filter1];
+  }
   return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
 }
 
@@ -136,7 +138,7 @@
   RD_STATS tmp_rd_stats;
   av1_init_rd_stats(&tmp_rd_stats);
 
-  // Skip inter predictor if the predictor is already avilable.
+  // Skip inter predictor if the predictor is already available.
   if (!is_skip_build_pred) {
     const int mi_row = xd->mi_row;
     const int mi_col = xd->mi_col;
@@ -148,8 +150,8 @@
                      ? MODELRD_LEGACY
                      : MODELRD_TYPE_INTERP_FILTER](
       cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate,
-      &tmp_rd_stats.dist, &tmp_rd_stats.skip, &tmp_rd_stats.sse, NULL, NULL,
-      NULL);
+      &tmp_rd_stats.dist, &tmp_rd_stats.skip_txfm, &tmp_rd_stats.sse, NULL,
+      NULL, NULL);
 
   av1_merge_rd_stats(rd_stats, &tmp_rd_stats);
 }
@@ -175,7 +177,8 @@
   const int_interpfilters last_best = mbmi->interp_filters;
   mbmi->interp_filters = filter_sets[filter_idx];
   const int tmp_rs =
-      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+                          cm->seq_params.enable_dual_filter);
 
   int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0);
   if (min_rd > *rd) {
@@ -189,12 +192,12 @@
   assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0));
   assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0));
   assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0));
-  assert((rd_stats_luma->skip == 0) || (rd_stats_luma->skip == 1));
-  assert((rd_stats->skip == 0) || (rd_stats->skip == 1));
+  assert((rd_stats_luma->skip_txfm == 0) || (rd_stats_luma->skip_txfm == 1));
+  assert((rd_stats->skip_txfm == 0) || (rd_stats->skip_txfm == 1));
   assert((skip_pred >= 0) &&
          (skip_pred <= interp_search_flags->default_interp_skip_flags));
 
-  // When skip pred is equal to default_interp_skip_flags,
+  // When skip_txfm pred is equal to default_interp_skip_flags,
   // skip both luma and chroma MC.
   // For mono-chrome images:
   // num_planes = 1 and cpi->default_interp_skip_flags = 1,
@@ -604,6 +607,43 @@
   }
 }
 
+/*!\brief AV1 interpolation filter search
+ *
+ * \ingroup inter_mode_search
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     tile_data         Pointer to struct holding adaptive
+ *                                  data/contexts/models for the tile during
+ *                                  encoding.
+ * \param[in]     x                 Pointer to struc holding all the data for
+ *                                  the current macroblock.
+ * \param[in]     bsize             Current block size.
+ * \param[in]     tmp_dst           A temporary prediction buffer to hold a
+ *                                  computed prediction.
+ * \param[in,out] orig_dst          A prediction buffer to hold a computed
+ *                                  prediction. This will eventually hold the
+ *                                  final prediction, and the tmp_dst info will
+ *                                  be copied here.
+ * \param[in,out] rd                The RD cost associated with the selected
+ *                                  interpolation filter parameters.
+ * \param[in,out] switchable_rate   The rate associated with using a SWITCHABLE
+ *                                  filter mode.
+ * \param[in,out] skip_build_pred   Indicates whether or not to build the inter
+ *                                  predictor. If this is 0, the inter predictor
+ *                                  has already been built and thus we can avoid
+ *                                  repeating computation.
+ * \param[in]     args              HandleInterModeArgs struct holding
+ *                                  miscellaneous arguments for inter mode
+ *                                  search. See the documentation for this
+ *                                  struct for a description of each member.
+ * \param[in]     ref_best_rd       Best RD found so far for this block.
+ *                                  It is used for early termination of this
+ *                                  search if the RD exceeds this value.
+ *
+ * \return Returns INT64_MAX if the filter parameters are invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * parameter search is a success.
+ */
 int64_t av1_interpolation_filter_search(
     MACROBLOCK *const x, const AV1_COMP *const cpi,
     const TileDataEnc *tile_data, BLOCK_SIZE bsize,
@@ -642,7 +682,8 @@
   switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
   switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
   *switchable_rate =
-      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+                          cm->seq_params.enable_dual_filter);
 
   // Do MC evaluation for default filter_type.
   // Luma MC
diff --git a/av1/encoder/interp_search.h b/av1/encoder/interp_search.h
index 401e14f..1ee26d1 100644
--- a/av1/encoder/interp_search.h
+++ b/av1/encoder/interp_search.h
@@ -20,6 +20,7 @@
 extern "C" {
 #endif
 
+/*!\cond */
 #define MAX_INTERP_FILTER_STATS 128
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
 
@@ -32,33 +33,115 @@
   int64_t rd;
   unsigned int pred_sse;
 } INTERPOLATION_FILTER_STATS;
+/*!\endcond */
 
+/*!\brief Miscellaneous arguments for inter mode search.
+ */
 typedef struct {
-  // OBMC secondary prediction buffers and respective strides
+  /*!
+   * Buffer for the above predictor in OBMC
+   */
   uint8_t *above_pred_buf[MAX_MB_PLANE];
+  /*!
+   * Stride for the above predictor in OBMC
+   */
   int above_pred_stride[MAX_MB_PLANE];
+  /*!
+   * Buffer for the left predictor in OBMC
+   */
   uint8_t *left_pred_buf[MAX_MB_PLANE];
+  /*!
+   * Stride for the left predictor in OBMC
+   */
   int left_pred_stride[MAX_MB_PLANE];
+  /*!
+   * Pointer to the first member in a 2D array which holds
+   * single reference mode motion vectors to be used as a starting
+   * point in the mv search for compound modes. Each array is length REF_FRAMES,
+   * meaning there is a slot for a single reference motion vector for
+   * each possible reference frame. The 2D array consists of N of these arrays,
+   * where N is the length of the reference mv stack computed for the single
+   * reference case for that particular reference frame.
+   */
   int_mv (*single_newmv)[REF_FRAMES];
-  // Pointer to array of motion vectors to use for each ref and their rates
-  // Should point to first of 2 arrays in 2D array
+  /*!
+   * Pointer to the first array of a 2D array with the same setup as
+   * single_newmv array above. This is a 2D array to hold the rate
+   * corresponding to each of the single reference mode motion vectors
+   * held in single_newmv.
+   */
   int (*single_newmv_rate)[REF_FRAMES];
+  /*!
+   * Pointer to the first array of a 2D array with the same setup as
+   * single_newmv array above. This is a 2D array to hold a 0 or 1
+   * validity value corresponding to each of the single reference mode motion
+   * vectors held in single_newmv.
+   */
   int (*single_newmv_valid)[REF_FRAMES];
-  // Pointer to array of predicted rate-distortion
-  // Should point to first of 2 arrays in 2D array
+  /*!
+   * Pointer to the first array in a 3D array of predicted rate-distortion.
+   * The dimensions of this structure are:
+   * (number of possible inter modes) X
+   * (number of reference MVs) X
+   * (number of reference frames).
+   */
   int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+  /*!
+   * Holds an estimated entropy cost for picking the current reference frame.
+   * This is used to compute an rd estimate.
+   */
   int ref_frame_cost;
+  /*!
+   * Holds an estimated entropy cost for picking single or compound
+   * reference. This is used to compute an rd estimate.
+   */
   int single_comp_cost;
+  /*!
+   * Pointer to the first element in a 3D array holding rd's of
+   * SIMPLE_TRANSLATION used to prune out the motion mode search in single ref
+   * modes used to determine compound ref modes. The full structure is:
+   * (number of inter modes) X (length of refmv list) X (number of ref frames)
+   */
   int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+  /*!
+   * An integer value 0 or 1 which indicates whether or not to skip the motion
+   * mode search and default to SIMPLE_TRANSLATION as a speed feature.
+   */
   int skip_motion_mode;
+  /*!
+   * A pointer to the first element in an array of INTERINTRA_MODE types. This
+   * contains the best inter_intra mode for each reference frame.
+   */
   INTERINTRA_MODE *inter_intra_mode;
-  int single_ref_first_pass;
-  SimpleRDState *simple_rd_state;
-  // [comp_idx][saved stat_idx]
+  /*!
+   * Array of saved interpolation filter stats collected to avoid repeating
+   * an interpolation filter search when the mv and ref_frame are the same
+   * as a previous search.
+   */
   INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS];
+  /*!
+   * Index of the last set of saved stats in the interp_filter_stats array.
+   */
   int interp_filter_stats_idx;
+  /*!
+   * Estimated wedge index.
+   */
+  int wedge_index;
+  /*!
+   * Estimated wedge sign.
+   */
+  int wedge_sign;
+  /*!
+   * Estimated diff wtd index.
+   */
+  int diffwtd_index;
+  /*!
+   * Estimated cmp mode.
+   */
+  int cmp_mode[MODE_CTX_REF_FRAMES];
 } HandleInterModeArgs;
 
+/*!\cond */
 static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = {
   { 0x00000000 }, { 0x00010000 }, { 0x00020000 },  // y = 0
   { 0x00000001 }, { 0x00010001 }, { 0x00020001 },  // y = 1
@@ -78,6 +161,7 @@
     int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
     HandleInterModeArgs *args, int64_t ref_best_rd);
 
+/*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 43192a9..b9f311e 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -9,13 +9,16 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/encoder/intra_mode_search.h"
-#include "av1/encoder/model_rd.h"
-#include "av1/encoder/palette.h"
-#include "av1/common/pred_common.h"
+#include "av1/common/av1_common_int.h"
 #include "av1/common/reconintra.h"
+
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/speed_features.h"
 #include "av1/encoder/tx_search.h"
 
+/*!\cond */
 static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
   DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, PAETH_PRED,
   SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED,   D157_PRED,
@@ -28,234 +31,18 @@
   UV_D135_PRED,   UV_D203_PRED,  UV_D157_PRED,     UV_D67_PRED,
   UV_D113_PRED,   UV_D45_PRED,
 };
+/*!\endcond */
 
-#define BINS 32
-static const float intra_hog_model_bias[DIRECTIONAL_MODES] = {
-  0.450578f,  0.695518f,  -0.717944f, -0.639894f,
-  -0.602019f, -0.453454f, 0.055857f,  -0.465480f,
-};
-
-static const float intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
-  -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
-  -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
-  -0.434156f, 0.322868f,  2.260546f,  3.368715f,  3.989290f,  3.308487f,
-  2.277893f,  0.923793f,  0.026412f,  -0.385174f, -0.718622f, -1.408867f,
-  -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
-  -2.985709f, -3.447155f, 3.758139f,  3.204353f,  2.170998f,  0.826587f,
-  -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
-  -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
-  -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
-  -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
-  -0.088058f, 0.753494f,  2.092413f,  3.215266f,  -3.300277f, -2.748658f,
-  -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
-  -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
-  -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
-  0.813112f,  1.702213f,  2.653045f,  3.351749f,  3.243554f,  3.199409f,
-  2.437856f,  1.468854f,  0.533039f,  -0.099065f, -0.622643f, -2.200732f,
-  -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f,  1.975043f,
-  3.179528f,  3.939064f,  3.454379f,  3.689386f,  3.116411f,  1.970991f,
-  0.798406f,  -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
-  -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
-  -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
-  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
-  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
-  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
-  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
-  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
-  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
-  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
-  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
-  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
-  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
-  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
-  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
-  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
-  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
-  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
-  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
-  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
-  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
-  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
-  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
-  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
-  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
-};
-
-static void generate_hog(const uint8_t *src, int stride, int rows, int cols,
-                         float *hist) {
-  const float step = (float)PI / BINS;
-  float total = 0.1f;
-  src += stride;
-  for (int r = 1; r < rows - 1; ++r) {
-    for (int c = 1; c < cols - 1; ++c) {
-      const uint8_t *above = &src[c - stride];
-      const uint8_t *below = &src[c + stride];
-      const uint8_t *left = &src[c - 1];
-      const uint8_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
-      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
-                     (left[-stride] + 2 * left[0] + left[stride]);
-      const int dy = (below[-1] + 2 * below[0] + below[1]) -
-                     (above[-1] + 2 * above[0] + above[1]);
-      if (dx == 0 && dy == 0) continue;
-      const int temp = abs(dx) + abs(dy);
-      if (!temp) continue;
-      total += temp;
-      if (dx == 0) {
-        hist[0] += temp / 2;
-        hist[BINS - 1] += temp / 2;
-      } else {
-        const float angle = atanf(dy * 1.0f / dx);
-        int idx = (int)roundf(angle / step) + BINS / 2;
-        idx = AOMMIN(idx, BINS - 1);
-        idx = AOMMAX(idx, 0);
-        hist[idx] += temp;
-      }
-    }
-    src += stride;
-  }
-
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
-}
-
-static void generate_hog_hbd(const uint8_t *src8, int stride, int rows,
-                             int cols, float *hist) {
-  const float step = (float)PI / BINS;
-  float total = 0.1f;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  src += stride;
-  for (int r = 1; r < rows - 1; ++r) {
-    for (int c = 1; c < cols - 1; ++c) {
-      const uint16_t *above = &src[c - stride];
-      const uint16_t *below = &src[c + stride];
-      const uint16_t *left = &src[c - 1];
-      const uint16_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
-      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
-                     (left[-stride] + 2 * left[0] + left[stride]);
-      const int dy = (below[-1] + 2 * below[0] + below[1]) -
-                     (above[-1] + 2 * above[0] + above[1]);
-      if (dx == 0 && dy == 0) continue;
-      const int temp = abs(dx) + abs(dy);
-      if (!temp) continue;
-      total += temp;
-      if (dx == 0) {
-        hist[0] += temp / 2;
-        hist[BINS - 1] += temp / 2;
-      } else {
-        const float angle = atanf(dy * 1.0f / dx);
-        int idx = (int)roundf(angle / step) + BINS / 2;
-        idx = AOMMIN(idx, BINS - 1);
-        idx = AOMMAX(idx, 0);
-        hist[idx] += temp;
-      }
-    }
-    src += stride;
-  }
-
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
-}
-
-static void prune_intra_mode_with_hog(const MACROBLOCK *x, BLOCK_SIZE bsize,
-                                      float th,
-                                      uint8_t *directional_mode_skip_mask) {
-  aom_clear_system_state();
-
-  const int bh = block_size_high[bsize];
-  const int bw = block_size_wide[bsize];
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const int rows =
-      (xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh;
-  const int cols =
-      (xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *src = x->plane[0].src.buf;
-  float hist[BINS] = { 0.0f };
-  if (is_cur_buf_hbd(xd)) {
-    generate_hog_hbd(src, src_stride, rows, cols, hist);
-  } else {
-    generate_hog(src, src_stride, rows, cols, hist);
-  }
-
-  for (int i = 0; i < DIRECTIONAL_MODES; ++i) {
-    float this_score = intra_hog_model_bias[i];
-    const float *weights = &intra_hog_model_weights[i * BINS];
-    for (int j = 0; j < BINS; ++j) {
-      this_score += weights[j] * hist[j];
-    }
-    if (this_score < th) directional_mode_skip_mask[i + 1] = 1;
-  }
-
-  aom_clear_system_state();
-}
-
-#undef BINS
-
-// Model based RD estimation for luma intra blocks.
-static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                               BLOCK_SIZE bsize, int mode_cost) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  RD_STATS this_rd_stats;
-  int row, col;
-  int64_t temp_sse, this_rd;
-  TX_SIZE tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
-  const int stepr = tx_size_high_unit[tx_size];
-  const int stepc = tx_size_wide_unit[tx_size];
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  mbmi->tx_size = tx_size;
-  // Prediction.
-  for (row = 0; row < max_blocks_high; row += stepr) {
-    for (col = 0; col < max_blocks_wide; col += stepc) {
-      av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
-    }
-  }
-  // RD estimation.
-  model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? MODELRD_LEGACY
-                                                   : MODELRD_TYPE_INTRA](
-      cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist,
-      &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
-  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
-    mode_cost +=
-        x->angle_delta_cost[mbmi->mode - V_PRED]
-                           [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]];
-  }
-  if (mbmi->mode == DC_PRED &&
-      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
-    if (mbmi->filter_intra_mode_info.use_filter_intra) {
-      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
-      mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
-                   x->filter_intra_mode_cost[mode];
-    } else {
-      mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
-    }
-  }
-  this_rd =
-      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
-  return this_rd;
-}
-
-// Update the intra model yrd and prune the current mode if the new estimate
-// y_rd > 1.5 * best_model_rd.
-static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
-                                                MACROBLOCK *x, BLOCK_SIZE bsize,
-                                                int mode_info_cost,
-                                                int64_t *best_model_rd) {
-  const int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_info_cost);
-  if (*best_model_rd != INT64_MAX &&
-      this_model_rd > *best_model_rd + (*best_model_rd >> 1)) {
-    return 1;
-  } else if (this_model_rd < *best_model_rd) {
-    *best_model_rd = this_model_rd;
-  }
-  return 0;
-}
-
-// Run RD calculation with given luma intra prediction angle., and return
-// the RD cost. Update the best mode info. if the RD cost is the best so far.
+/*!\brief Calculate the rdcost of a given luma intra angle
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function runs rd calculation for a given luma intra prediction angle.
+ * This is used to select the best angle delta.
+ *
+ * \return Returns the rdcost of the angle and updates the mbmi if the
+ * new rdcost is better.
+ */
 static int64_t calc_rd_given_intra_angle(
     const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
     int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
@@ -280,11 +67,13 @@
 
   int this_rate =
       mode_cost + tokenonly_rd_stats.rate +
-      x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
+      x->mode_costs
+          .angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
   this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
   if (this_rd < *best_rd) {
-    memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
+    memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+           sizeof(best_blk_skip[0]) * n4);
     av1_copy_array(best_tx_type_map, xd->tx_type_map, n4);
     *best_rd = this_rd;
     *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
@@ -292,121 +81,19 @@
     *rate = this_rate;
     rd_stats->rate = tokenonly_rd_stats.rate;
     rd_stats->dist = tokenonly_rd_stats.dist;
-    rd_stats->skip = tokenonly_rd_stats.skip;
+    rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
   }
   return this_rd;
 }
 
-static INLINE int write_uniform_cost(int n, int v) {
-  const int l = get_unsigned_bits(n);
-  const int m = (1 << l) - n;
-  if (l == 0) return 0;
-  if (v < m)
-    return av1_cost_literal(l - 1);
-  else
-    return av1_cost_literal(l);
-}
-
-// Return the rate cost for luma prediction mode info. of intra blocks.
-static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                  const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                                  int mode_cost) {
-  int total_rate = mode_cost;
-  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
-  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
-  const int use_intrabc = mbmi->use_intrabc;
-  // Can only activate one mode.
-  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
-          use_filter_intra) <= 1);
-  const int try_palette = av1_allow_palette(
-      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette && mbmi->mode == DC_PRED) {
-    const MACROBLOCKD *xd = &x->e_mbd;
-    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-    const int mode_ctx = av1_get_palette_mode_ctx(xd);
-    total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
-    if (use_palette) {
-      const uint8_t *const color_map = xd->plane[0].color_index_map;
-      int block_width, block_height, rows, cols;
-      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                               &cols);
-      const int plt_size = mbmi->palette_mode_info.palette_size[0];
-      int palette_mode_cost =
-          x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
-          write_uniform_cost(plt_size, color_map[0]);
-      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-      palette_mode_cost +=
-          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
-                                   n_cache, cpi->common.seq_params.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
-      total_rate += palette_mode_cost;
-    }
-  }
-  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
-    total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra];
-    if (use_filter_intra) {
-      total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info
-                                                  .filter_intra_mode];
-    }
-  }
-  if (av1_is_directional_mode(mbmi->mode)) {
-    if (av1_use_angle_delta(bsize)) {
-      total_rate += x->angle_delta_cost[mbmi->mode - V_PRED]
-                                       [MAX_ANGLE_DELTA +
-                                        mbmi->angle_delta[PLANE_TYPE_Y]];
-    }
-  }
-  if (av1_allow_intrabc(&cpi->common))
-    total_rate += x->intrabc_cost[use_intrabc];
-  return total_rate;
-}
-
-// Return the rate cost for chroma prediction mode info. of intra blocks.
-static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                   const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                                   int mode_cost) {
-  int total_rate = mode_cost;
-  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
-  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
-  // Can only activate one mode.
-  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
-
-  const int try_palette = av1_allow_palette(
-      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette && mode == UV_DC_PRED) {
-    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-    total_rate +=
-        x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
-    if (use_palette) {
-      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-      const int plt_size = pmi->palette_size[1];
-      const MACROBLOCKD *xd = &x->e_mbd;
-      const uint8_t *const color_map = xd->plane[1].color_index_map;
-      int palette_mode_cost =
-          x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
-          write_uniform_cost(plt_size, color_map[0]);
-      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-      palette_mode_cost += av1_palette_color_cost_uv(
-          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
-      total_rate += palette_mode_cost;
-    }
-  }
-  if (av1_is_directional_mode(get_uv_mode(mode))) {
-    if (av1_use_angle_delta(bsize)) {
-      total_rate +=
-          x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] +
-                                             MAX_ANGLE_DELTA];
-    }
-  }
-  return total_rate;
-}
-
-// Return 1 if an filter intra mode is selected; return 0 otherwise.
+/*!\brief Search for the best filter_intra mode when coding intra frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \return Returns 1 if a new filter_intra mode is selected; 0 otherwise.
+ */
 static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     int *rate, int *rate_tokenonly,
                                     int64_t *distortion, int *skippable,
@@ -446,19 +133,18 @@
     const int txfm_search_done = 1;
     store_winner_mode_stats(
         &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
-        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-        txfm_search_done);
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
     if (this_rd < *best_rd) {
       *best_rd = this_rd;
       best_tx_size = mbmi->tx_size;
       filter_intra_mode_info = mbmi->filter_intra_mode_info;
       av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+             sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
       *rate = this_rate;
       *rate_tokenonly = tokenonly_rd_stats.rate;
       *distortion = tokenonly_rd_stats.dist;
-      *skippable = tokenonly_rd_stats.skip;
+      *skippable = tokenonly_rd_stats.skip_txfm;
       filter_intra_selected_flag = 1;
     }
   }
@@ -474,8 +160,8 @@
   }
 }
 
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
-                     int *val_count) {
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                      int *val_count, int *num_colors) {
   const int max_pix_val = 1 << 8;
   memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
   for (int r = 0; r < rows; ++r) {
@@ -489,745 +175,52 @@
   for (int i = 0; i < max_pix_val; ++i) {
     if (val_count[i]) ++n;
   }
-  return n;
+  *num_colors = n;
 }
 
-int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth, int *val_count) {
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth, int *val_count,
+                             int *bin_val_count, int *num_color_bins,
+                             int *num_colors) {
   assert(bit_depth <= 12);
+  const int max_bin_val = 1 << 8;
   const int max_pix_val = 1 << bit_depth;
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+  memset(bin_val_count, 0, max_bin_val * sizeof(val_count[0]));
+  if (val_count != NULL)
+    memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
   for (int r = 0; r < rows; ++r) {
     for (int c = 0; c < cols; ++c) {
-      const int this_val = src[r * stride + c];
-      assert(this_val < max_pix_val);
-      if (this_val >= max_pix_val) return 0;
-      ++val_count[this_val];
+      /*
+       * Down-convert the pixels to 8-bit domain before counting.
+       * This provides consistency of behavior for palette search
+       * between lbd and hbd encodes. This down-converted pixels
+       * are only used for calculating the threshold (n).
+       */
+      const int this_val = ((src[r * stride + c]) >> (bit_depth - 8));
+      assert(this_val < max_bin_val);
+      if (this_val >= max_bin_val) continue;
+      ++bin_val_count[this_val];
+      if (val_count != NULL) ++val_count[(src[r * stride + c])];
     }
   }
   int n = 0;
-  for (int i = 0; i < max_pix_val; ++i) {
-    if (val_count[i]) ++n;
+  // Count the colors based on 8-bit domain used to gate the palette path
+  for (int i = 0; i < max_bin_val; ++i) {
+    if (bin_val_count[i]) ++n;
   }
-  return n;
-}
+  *num_color_bins = n;
 
-// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
-// new_height'. Extra rows and columns are filled in by copying last valid
-// row/column.
-static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
-                                                int orig_width, int orig_height,
-                                                int new_width, int new_height) {
-  int j;
-  assert(new_width >= orig_width);
-  assert(new_height >= orig_height);
-  if (new_width == orig_width && new_height == orig_height) return;
-
-  for (j = orig_height - 1; j >= 0; --j) {
-    memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
-    // Copy last column to extra columns.
-    memset(color_map + j * new_width + orig_width,
-           color_map[j * new_width + orig_width - 1], new_width - orig_width);
-  }
-  // Copy last row to extra rows.
-  for (j = orig_height; j < new_height; ++j) {
-    memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
-           new_width);
-  }
-}
-
-// Bias toward using colors in the cache.
-// TODO(huisu): Try other schemes to improve compression.
-static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
-                                               int n_cache, int n_colors,
-                                               int stride, int *centroids) {
-  if (n_cache <= 0) return;
-  for (int i = 0; i < n_colors * stride; i += stride) {
-    int min_diff = abs(centroids[i] - (int)color_cache[0]);
-    int idx = 0;
-    for (int j = 1; j < n_cache; ++j) {
-      const int this_diff = abs(centroids[i] - color_cache[j]);
-      if (this_diff < min_diff) {
-        min_diff = this_diff;
-        idx = j;
-      }
+  // Count the actual hbd colors used to create top_colors
+  n = 0;
+  if (val_count != NULL) {
+    for (int i = 0; i < max_pix_val; ++i) {
+      if (val_count[i]) ++n;
     }
-    if (min_diff <= 1) centroids[i] = color_cache[idx];
+    *num_colors = n;
   }
 }
 
-// Given the base colors as specified in centroids[], calculate the RD cost
-// of palette mode.
-static AOM_INLINE void palette_rd_y(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
-    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
-    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
-    uint8_t *tx_type_map, int *beat_best_pallette_rd) {
-  optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
-  const int num_unique_colors = av1_remove_duplicates(centroids, n);
-  if (num_unique_colors < PALETTE_MIN_SIZE) {
-    // Too few unique colors to create a palette. And DC_PRED will work
-    // well for that case anyway. So skip.
-    return;
-  }
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  if (cpi->common.seq_params.use_highbitdepth) {
-    for (int i = 0; i < num_unique_colors; ++i) {
-      pmi->palette_colors[i] = clip_pixel_highbd(
-          (int)centroids[i], cpi->common.seq_params.bit_depth);
-    }
-  } else {
-    for (int i = 0; i < num_unique_colors; ++i) {
-      pmi->palette_colors[i] = clip_pixel(centroids[i]);
-    }
-  }
-  pmi->palette_size[0] = num_unique_colors;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  int block_width, block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                           &cols);
-  av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
-                   1);
-  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
-
-  const int palette_mode_cost =
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
-  if (model_intra_yrd_and_prune(cpi, x, bsize, palette_mode_cost,
-                                best_model_rd)) {
-    return;
-  }
-
-  RD_STATS tokenonly_rd_stats;
-  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
-                                    *best_rd);
-  if (tokenonly_rd_stats.rate == INT_MAX) return;
-  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
-  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
-    tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
-  }
-  // Collect mode stats for multiwinner mode processing
-  const int txfm_search_done = 1;
-  store_winner_mode_stats(
-      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
-      this_rd, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-      txfm_search_done);
-  if (this_rd < *best_rd) {
-    *best_rd = this_rd;
-    // Setting beat_best_rd flag because current mode rd is better than best_rd.
-    // This flag need to be updated only for palette evaluation in key frames
-    if (beat_best_rd) *beat_best_rd = 1;
-    memcpy(best_palette_color_map, color_map,
-           block_width * block_height * sizeof(color_map[0]));
-    *best_mbmi = *mbmi;
-    memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-    if (rate) *rate = this_rate;
-    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
-    if (distortion) *distortion = tokenonly_rd_stats.dist;
-    if (skippable) *skippable = tokenonly_rd_stats.skip;
-    if (beat_best_pallette_rd) *beat_best_pallette_rd = 1;
-  }
-}
-
-static AOM_INLINE int perform_top_color_coarse_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data,
-    const int *const top_colors, int start_n, int end_n, int step_size,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
-    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
-    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
-    uint8_t *tx_type_map) {
-  int centroids[PALETTE_MAX_SIZE];
-  int n = start_n;
-  int top_color_winner = end_n + 1;
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if (beat_best_pallette_rd) top_color_winner = n;
-    n += step_size;
-    if (n > end_n) break;
-  }
-  return top_color_winner;
-}
-
-static AOM_INLINE int perform_k_means_coarse_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
-    int data_points) {
-  int centroids[PALETTE_MAX_SIZE];
-  const int max_itr = 50;
-  int n = start_n;
-  int k_means_winner = end_n + 1;
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) {
-      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
-    }
-    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if (beat_best_pallette_rd) k_means_winner = n;
-    n += step_size;
-    if (n > end_n) break;
-  }
-  return k_means_winner;
-}
-
-// Perform palette search for top colors from minimum palette colors (/maximum)
-// with a step-size of 1 (/-1)
-static AOM_INLINE int perform_top_color_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
-  int centroids[PALETTE_MAX_SIZE];
-  int n = start_n;
-  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
-         (step_size == 2));
-  assert(IMPLIES(step_size == -1, start_n > end_n));
-  assert(IMPLIES(step_size == 1, start_n < end_n));
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
-        !beat_best_pallette_rd)
-      return n;
-    n += step_size;
-    if (n == end_n) break;
-  }
-  return n;
-}
-// Perform k-means based palette search from minimum palette colors (/maximum)
-// with a step-size of 1 (/-1)
-static AOM_INLINE int perform_k_means_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
-    int data_points) {
-  int centroids[PALETTE_MAX_SIZE];
-  const int max_itr = 50;
-  int n = start_n;
-  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
-         (step_size == 2));
-  assert(IMPLIES(step_size == -1, start_n > end_n));
-  assert(IMPLIES(step_size == 1, start_n < end_n));
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) {
-      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
-    }
-    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
-        !beat_best_pallette_rd)
-      return n;
-    n += step_size;
-    if (n == end_n) break;
-  }
-  return n;
-}
-
-#define START_N_STAGE2(x)                         \
-  ((x == PALETTE_MIN_SIZE) ? PALETTE_MIN_SIZE + 1 \
-                           : AOMMAX(x - 1, PALETTE_MIN_SIZE));
-#define END_N_STAGE2(x, end_n) \
-  ((x == end_n) ? x - 1 : AOMMIN(x + 1, PALETTE_MAX_SIZE));
-
-static AOM_INLINE void update_start_end_stage_2(int *start_n_stage2,
-                                                int *end_n_stage2,
-                                                int *step_size_stage2,
-                                                int winner, int end_n) {
-  *start_n_stage2 = START_N_STAGE2(winner);
-  *end_n_stage2 = END_N_STAGE2(winner, end_n);
-  *step_size_stage2 = *end_n_stage2 - *start_n_stage2;
-}
-
-// Start index and step size below are chosen to evaluate unique
-// candidates in neighbor search, in case a winner candidate is found in
-// coarse search. Example,
-// 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
-// size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
-// If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
-// (3) and 8 (7).
-// 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
-// as for 8 colors) then step size should also be 2, to cover all
-// candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
-// 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
-// coarse search will evaluate 3 and 6. For the winner, unique neighbors
-// (3: 2,4 or 6: 5,7) would be evaluated.
-
-// start index for coarse palette search for dominant colors and k-means
-static const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
-                                                                    3, 3, 2,
-                                                                    3, 3, 2 };
-// step size for coarse palette search for dominant colors and k-means
-static const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
-                                                                      3, 3, 3,
-                                                                      3, 3, 3 };
-
-static void rd_pick_palette_intra_sby(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
-    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
-    int64_t *distortion, int *skippable, int *beat_best_rd,
-    PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                           bsize));
-
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *const src = x->plane[0].src.buf;
-  int block_width, block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                           &cols);
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  const int is_hbd = seq_params->use_highbitdepth;
-  const int bit_depth = seq_params->bit_depth;
-  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  int colors;
-  if (is_hbd) {
-    colors = av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth,
-                                     count_buf);
-  } else {
-    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
-  }
-
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  if (colors > 1 && colors <= 64) {
-    int *const data = x->palette_buffer->kmeans_data_buf;
-    int centroids[PALETTE_MAX_SIZE];
-    int lb, ub;
-    if (is_hbd) {
-      int *data_pt = data;
-      const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
-      lb = ub = src_pt[0];
-      for (int r = 0; r < rows; ++r) {
-        for (int c = 0; c < cols; ++c) {
-          const int val = src_pt[c];
-          data_pt[c] = val;
-          lb = AOMMIN(lb, val);
-          ub = AOMMAX(ub, val);
-        }
-        src_pt += src_stride;
-        data_pt += cols;
-      }
-    } else {
-      int *data_pt = data;
-      const uint8_t *src_pt = src;
-      lb = ub = src[0];
-      for (int r = 0; r < rows; ++r) {
-        for (int c = 0; c < cols; ++c) {
-          const int val = src_pt[c];
-          data_pt[c] = val;
-          lb = AOMMIN(lb, val);
-          ub = AOMMAX(ub, val);
-        }
-        src_pt += src_stride;
-        data_pt += cols;
-      }
-    }
-
-    mbmi->mode = DC_PRED;
-    mbmi->filter_intra_mode_info.use_filter_intra = 0;
-
-    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-
-    // Find the dominant colors, stored in top_colors[].
-    int top_colors[PALETTE_MAX_SIZE] = { 0 };
-    for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
-      int max_count = 0;
-      for (int j = 0; j < (1 << bit_depth); ++j) {
-        if (count_buf[j] > max_count) {
-          max_count = count_buf[j];
-          top_colors[i] = j;
-        }
-      }
-      assert(max_count > 0);
-      count_buf[top_colors[i]] = 0;
-    }
-
-    // Try the dominant colors directly.
-    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
-    // where the dominant colors and the k-means results are similar.
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
-        (colors > PALETTE_MIN_SIZE)) {
-      const int end_n = AOMMIN(colors, PALETTE_MAX_SIZE);
-      assert(PALETTE_MAX_SIZE == 8);
-      assert(PALETTE_MIN_SIZE == 2);
-      // Choose the start index and step size for coarse search based on number
-      // of colors
-      const int start_n = start_n_lookup_table[end_n];
-      const int step_size = step_size_lookup_table[end_n];
-      // Perform top color coarse palette search to find the winner candidate
-      const int top_color_winner = perform_top_color_coarse_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, end_n,
-          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
-          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-          beat_best_rd, ctx, best_blk_skip, tx_type_map);
-      // Evaluate neighbors for the winner color (if winner is found) in the
-      // above coarse search for dominant colors
-      if (top_color_winner <= end_n) {
-        int start_n_stage2, end_n_stage2, step_size_stage2;
-        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
-                                 &step_size_stage2, top_color_winner, end_n);
-        // perform finer search for the winner candidate
-        perform_top_color_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n_stage2,
-            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
-            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
-            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map);
-      }
-      // K-means clustering.
-      // Perform k-means coarse palette search to find the winner candidate
-      const int k_means_winner = perform_k_means_coarse_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n,
-          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
-          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-          beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-          rows * cols);
-      // Evaluate neighbors for the winner color (if winner is found) in the
-      // above coarse search for k-means
-      if (k_means_winner <= end_n) {
-        int start_n_stage2, end_n_stage2, step_size_stage2;
-        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
-                                 &step_size_stage2, k_means_winner, end_n);
-        // perform finer search for the winner candidate
-        perform_k_means_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n_stage2,
-            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
-            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
-            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map, color_map, rows * cols);
-      }
-    } else {
-      const int start_n = AOMMIN(colors, PALETTE_MAX_SIZE),
-                end_n = PALETTE_MIN_SIZE;
-      // Perform top color palette search from start_n
-      const int top_color_winner = perform_top_color_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n,
-          end_n - 1, -1, color_cache, n_cache, best_mbmi,
-          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
-          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
-
-      if (top_color_winner > end_n) {
-        // Perform top color palette search in reverse order for the remaining
-        // colors
-        perform_top_color_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, end_n,
-            top_color_winner, 1, color_cache, n_cache, best_mbmi,
-            best_palette_color_map, best_rd, best_model_rd, rate,
-            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map);
-      }
-      // K-means clustering.
-      if (colors == PALETTE_MIN_SIZE) {
-        // Special case: These colors automatically become the centroids.
-        assert(colors == 2);
-        centroids[0] = lb;
-        centroids[1] = ub;
-        palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
-                     color_cache, n_cache, best_mbmi, best_palette_color_map,
-                     best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                     skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                     NULL);
-      } else {
-        // Perform k-means palette search from start_n
-        const int k_means_winner = perform_k_means_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n - 1,
-            -1, color_cache, n_cache, best_mbmi, best_palette_color_map,
-            best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-            beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-            rows * cols);
-        if (k_means_winner > end_n) {
-          // Perform k-means palette search in reverse order for the remaining
-          // colors
-          perform_k_means_palette_search(
-              cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, end_n,
-              k_means_winner, 1, color_cache, n_cache, best_mbmi,
-              best_palette_color_map, best_rd, best_model_rd, rate,
-              rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-              best_blk_skip, tx_type_map, color_map, rows * cols);
-        }
-      }
-    }
-  }
-
-  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
-    memcpy(color_map, best_palette_color_map,
-           block_width * block_height * sizeof(best_palette_color_map[0]));
-  }
-  *mbmi = *best_mbmi;
-}
-
-static AOM_INLINE void rd_pick_palette_intra_sbuv(
-    const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost,
-    uint8_t *best_palette_color_map, MB_MODE_INFO *const best_mbmi,
-    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                           mbmi->sb_type));
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  int this_rate;
-  int64_t this_rd;
-  int colors_u, colors_v, colors;
-  const int src_stride = x->plane[1].src.stride;
-  const uint8_t *const src_u = x->plane[1].src.buf;
-  const uint8_t *const src_v = x->plane[2].src.buf;
-  uint8_t *const color_map = xd->plane[1].color_index_map;
-  RD_STATS tokenonly_rd_stats;
-  int plane_block_width, plane_block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
-                           &plane_block_height, &rows, &cols);
-
-  mbmi->uv_mode = UV_DC_PRED;
-
-  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  if (seq_params->use_highbitdepth) {
-    colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
-                                       seq_params->bit_depth, count_buf);
-    colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
-                                       seq_params->bit_depth, count_buf);
-  } else {
-    colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
-    colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
-  }
-
-  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-
-  colors = colors_u > colors_v ? colors_u : colors_v;
-  if (colors > 1 && colors <= 64) {
-    int r, c, n, i, j;
-    const int max_itr = 50;
-    int lb_u, ub_u, val_u;
-    int lb_v, ub_v, val_v;
-    int *const data = x->palette_buffer->kmeans_data_buf;
-    int centroids[2 * PALETTE_MAX_SIZE];
-
-    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
-    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
-    if (seq_params->use_highbitdepth) {
-      lb_u = src_u16[0];
-      ub_u = src_u16[0];
-      lb_v = src_v16[0];
-      ub_v = src_v16[0];
-    } else {
-      lb_u = src_u[0];
-      ub_u = src_u[0];
-      lb_v = src_v[0];
-      ub_v = src_v[0];
-    }
-
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; ++c) {
-        if (seq_params->use_highbitdepth) {
-          val_u = src_u16[r * src_stride + c];
-          val_v = src_v16[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        } else {
-          val_u = src_u[r * src_stride + c];
-          val_v = src_v[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        }
-        if (val_u < lb_u)
-          lb_u = val_u;
-        else if (val_u > ub_u)
-          ub_u = val_u;
-        if (val_v < lb_v)
-          lb_v = val_v;
-        else if (val_v > ub_v)
-          ub_v = val_v;
-      }
-    }
-
-    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
-         --n) {
-      for (i = 0; i < n; ++i) {
-        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
-        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
-      }
-      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
-      optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
-      // Sort the U channel colors in ascending order.
-      for (i = 0; i < 2 * (n - 1); i += 2) {
-        int min_idx = i;
-        int min_val = centroids[i];
-        for (j = i + 2; j < 2 * n; j += 2)
-          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
-        if (min_idx != i) {
-          int temp_u = centroids[i], temp_v = centroids[i + 1];
-          centroids[i] = centroids[min_idx];
-          centroids[i + 1] = centroids[min_idx + 1];
-          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
-        }
-      }
-      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
-      extend_palette_color_map(color_map, cols, rows, plane_block_width,
-                               plane_block_height);
-      pmi->palette_size[1] = n;
-      for (i = 1; i < 3; ++i) {
-        for (j = 0; j < n; ++j) {
-          if (seq_params->use_highbitdepth)
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
-                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
-          else
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel((int)centroids[j * 2 + i - 1]);
-        }
-      }
-
-      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
-      if (tokenonly_rd_stats.rate == INT_MAX) continue;
-      this_rate = tokenonly_rd_stats.rate +
-                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
-      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-      if (this_rd < *best_rd) {
-        *best_rd = this_rd;
-        *best_mbmi = *mbmi;
-        memcpy(best_palette_color_map, color_map,
-               plane_block_width * plane_block_height *
-                   sizeof(best_palette_color_map[0]));
-        *rate = this_rate;
-        *distortion = tokenonly_rd_stats.dist;
-        *rate_tokenonly = tokenonly_rd_stats.rate;
-        *skippable = tokenonly_rd_stats.skip;
-      }
-    }
-  }
-  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
-    memcpy(color_map, best_palette_color_map,
-           plane_block_width * plane_block_height *
-               sizeof(best_palette_color_map[0]));
-  }
-}
-
-void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int src_stride = x->plane[1].src.stride;
-  const uint8_t *const src_u = x->plane[1].src.buf;
-  const uint8_t *const src_v = x->plane[2].src.buf;
-  int *const data = x->palette_buffer->kmeans_data_buf;
-  int centroids[2 * PALETTE_MAX_SIZE];
-  uint8_t *const color_map = xd->plane[1].color_index_map;
-  int r, c;
-  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
-  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
-  int plane_block_width, plane_block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
-                           &plane_block_height, &rows, &cols);
-
-  for (r = 0; r < rows; ++r) {
-    for (c = 0; c < cols; ++c) {
-      if (cpi->common.seq_params.use_highbitdepth) {
-        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
-      } else {
-        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
-      }
-    }
-  }
-
-  for (r = 1; r < 3; ++r) {
-    for (c = 0; c < pmi->palette_size[1]; ++c) {
-      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
-    }
-  }
-
-  av1_calc_indices(data, centroids, color_map, rows * cols,
-                   pmi->palette_size[1], 2);
-  extend_palette_color_map(color_map, cols, rows, plane_block_width,
-                           plane_block_height);
-}
-
-static AOM_INLINE void choose_intra_uv_mode(
-    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
-    TX_SIZE max_tx_size, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv,
-    int *skip_uv, UV_PREDICTION_MODE *mode_uv) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  // Use an estimated rd for uv_intra based on DC_PRED if the
-  // appropriate speed flag is set.
-  init_sbuv_mode(mbmi);
-  if (!xd->is_chroma_ref) {
-    *rate_uv = 0;
-    *rate_uv_tokenonly = 0;
-    *dist_uv = 0;
-    *skip_uv = 1;
-    *mode_uv = UV_DC_PRED;
-    return;
-  }
-
-  // Only store reconstructed luma when there's chroma RDO. When there's no
-  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
-  if (xd->cfl.store_y) {
-    // Restore reconstructed luma values.
-    av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
-                                 DRY_RUN_NORMAL,
-                                 cpi->optimize_seg_arr[mbmi->segment_id]);
-    xd->cfl.store_y = 0;
-  }
-  av1_rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
-                              skip_uv, bsize, max_tx_size);
-  *mode_uv = mbmi->uv_mode;
-}
-
 // Run RD calculation with given chroma intra prediction angle., and return
 // the RD cost. Update the best mode info. if the RD cost is the best so far.
 static int64_t pick_intra_angle_routine_sbuv(
@@ -1251,13 +244,20 @@
     *rate = this_rate;
     rd_stats->rate = tokenonly_rd_stats.rate;
     rd_stats->dist = tokenonly_rd_stats.dist;
-    rd_stats->skip = tokenonly_rd_stats.skip;
+    rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
   }
   return this_rd;
 }
 
-// With given chroma directional intra prediction mode, pick the best angle
-// delta. Return true if a RD cost that is smaller than the input one is found.
+/*!\brief Search for the best angle delta for chroma prediction
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Given a chroma directional intra prediction mode, this function will try to
+ * estimate the best delta_angle.
+ *
+ * \returns Return if there is a new mode with smaller rdcost than best_rd.
+ */
 static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     BLOCK_SIZE bsize, int rate_overhead,
                                     int64_t best_rd, int *rate,
@@ -1269,7 +269,7 @@
   int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
 
   rd_stats->rate = INT_MAX;
-  rd_stats->skip = 0;
+  rd_stats->skip_txfm = 0;
   rd_stats->dist = INT64_MAX;
   for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
 
@@ -1320,10 +320,11 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const MACROBLOCKD_PLANE *pd = &xd->plane[AOM_PLANE_U];
+  const ModeCosts *mode_costs = &x->mode_costs;
   const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+      get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
 
-  assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
+  assert(is_cfl_allowed(xd) && cpi->oxcf.intra_mode_cfg.enable_cfl_intra);
   assert(plane_bsize < BLOCK_SIZES_ALL);
   if (!xd->lossless[mbmi->segment_id]) {
     assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
@@ -1331,9 +332,9 @@
   }
 
   xd->cfl.use_dc_pred_cache = 1;
-  const int64_t mode_rd =
-      RDCOST(x->rdmult,
-             x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
+  const int64_t mode_rd = RDCOST(
+      x->rdmult,
+      mode_costs->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
   int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
   int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
 #if CONFIG_DEBUG
@@ -1356,12 +357,11 @@
       if (i == CFL_SIGN_NEG) {
         mbmi->cfl_alpha_idx = 0;
         mbmi->cfl_alpha_signs = joint_sign;
-        av1_txfm_rd_in_plane(
-            x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size,
-            cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis);
+        av1_txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1,
+                             plane_bsize, tx_size, FTXS_NONE, skip_trellis);
         if (rd_stats.rate == INT_MAX) break;
       }
-      const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
+      const int alpha_rate = mode_costs->cfl_cost[joint_sign][plane][0];
       best_rd_uv[joint_sign][plane] =
           RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
 #if CONFIG_DEBUG
@@ -1385,12 +385,11 @@
           if (i == 0) {
             mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
             mbmi->cfl_alpha_signs = joint_sign;
-            av1_txfm_rd_in_plane(
-                x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size,
-                cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis);
+            av1_txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1,
+                                 plane_bsize, tx_size, FTXS_NONE, skip_trellis);
             if (rd_stats.rate == INT_MAX) break;
           }
-          const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
+          const int alpha_rate = mode_costs->cfl_cost[joint_sign][plane][c];
           int64_t this_rd =
               RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
           if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
@@ -1417,13 +416,13 @@
     const int u = best_c[best_joint_sign][CFL_PRED_U];
     const int v = best_c[best_joint_sign][CFL_PRED_V];
     ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
-    best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
-                         x->cfl_cost[best_joint_sign][CFL_PRED_V][v];
+    best_rate_overhead = mode_costs->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
+                         mode_costs->cfl_cost[best_joint_sign][CFL_PRED_V][v];
 #if CONFIG_DEBUG
-    xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
-                   best_rate_overhead +
-                   best_rate_uv[best_joint_sign][CFL_PRED_U] +
-                   best_rate_uv[best_joint_sign][CFL_PRED_V];
+    xd->cfl.rate =
+        mode_costs->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
+        best_rate_overhead + best_rate_uv[best_joint_sign][CFL_PRED_U] +
+        best_rate_uv[best_joint_sign][CFL_PRED_V];
 #endif  // CONFIG_DEBUG
   } else {
     best_joint_sign = 0;
@@ -1441,12 +440,44 @@
                                     int *rate, int *rate_tokenonly,
                                     int64_t *distortion, int *skippable,
                                     BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   MB_MODE_INFO best_mbmi = *mbmi;
   int64_t best_rd = INT64_MAX, this_rd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
 
+  init_sbuv_mode(mbmi);
+
+  // Return if the current block does not correspond to a chroma block.
+  if (!xd->is_chroma_ref) {
+    *rate = 0;
+    *rate_tokenonly = 0;
+    *distortion = 0;
+    *skippable = 1;
+    return INT64_MAX;
+  }
+
+  // Only store reconstructed luma when there's chroma RDO. When there's no
+  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+  if (xd->cfl.store_y) {
+    // Restore reconstructed luma values.
+    // TODO(chiyotsai@google.com): right now we are re-computing the txfm in
+    // this function everytime we search through uv modes. There is some
+    // potential speed up here if we cache the result to avoid redundant
+    // computation.
+    av1_encode_intra_block_plane(cpi, x, mbmi->bsize, AOM_PLANE_Y,
+                                 DRY_RUN_NORMAL,
+                                 cpi->optimize_seg_arr[mbmi->segment_id]);
+    xd->cfl.store_y = 0;
+  }
+  IntraModeSearchState intra_search_state;
+  init_intra_mode_search_state(&intra_search_state);
+
+  // Search through all non-palette modes.
   for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
     int this_rate;
     RD_STATS tokenonly_rd_stats;
@@ -1455,41 +486,66 @@
     if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
           (1 << mode)))
       continue;
-    if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
+    if (!intra_mode_cfg->enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
         mode <= UV_SMOOTH_H_PRED)
       continue;
 
-    if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue;
+    if (!intra_mode_cfg->enable_paeth_intra && mode == UV_PAETH_PRED) continue;
 
     mbmi->uv_mode = mode;
+
+    // Init variables for cfl and angle delta
     int cfl_alpha_rate = 0;
     if (mode == UV_CFL_PRED) {
-      if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue;
+      if (!is_cfl_allowed(xd) || !intra_mode_cfg->enable_cfl_intra) continue;
       assert(!is_directional_mode);
       const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
       cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
       if (cfl_alpha_rate == INT_MAX) continue;
     }
     mbmi->angle_delta[PLANE_TYPE_UV] = 0;
-    if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) &&
-        cpi->oxcf.enable_angle_delta) {
+
+    if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) &&
+        intra_mode_cfg->enable_angle_delta) {
+      const SPEED_FEATURES *sf = &cpi->sf;
+      if (sf->intra_sf.chroma_intra_pruning_with_hog &&
+          !intra_search_state.dir_mode_skip_mask_ready) {
+        static const float thresh[2][4] = {
+          { -1.2f, 0.0f, 0.0f, 1.2f },    // Interframe
+          { -1.2f, -1.2f, -0.6f, 0.4f },  // Intraframe
+        };
+        const int is_chroma = 1;
+        const int is_intra_frame = frame_is_intra_only(cm);
+        prune_intra_mode_with_hog(
+            x, bsize,
+            thresh[is_intra_frame]
+                  [sf->intra_sf.chroma_intra_pruning_with_hog - 1],
+            intra_search_state.directional_mode_skip_mask, is_chroma);
+        intra_search_state.dir_mode_skip_mask_ready = 1;
+      }
+      if (intra_search_state.directional_mode_skip_mask[mode]) {
+        continue;
+      }
+
+      // Search through angle delta
       const int rate_overhead =
-          x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
+          mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
       if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
                                     &this_rate, &tokenonly_rd_stats))
         continue;
     } else {
+      // Predict directly if we don't need to search for angle delta.
       if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
         continue;
       }
     }
     const int mode_cost =
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
+        mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
         cfl_alpha_rate;
     this_rate = tokenonly_rd_stats.rate +
                 intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
     if (mode == UV_CFL_PRED) {
-      assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
+      assert(is_cfl_allowed(xd) && intra_mode_cfg->enable_cfl_intra);
 #if CONFIG_DEBUG
       if (!xd->lossless[mbmi->segment_id])
         assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
@@ -1503,19 +559,21 @@
       *rate = this_rate;
       *rate_tokenonly = tokenonly_rd_stats.rate;
       *distortion = tokenonly_rd_stats.dist;
-      *skippable = tokenonly_rd_stats.skip;
+      *skippable = tokenonly_rd_stats.skip_txfm;
     }
   }
 
+  // Search palette mode
   const int try_palette =
-      cpi->oxcf.enable_palette &&
+      cpi->oxcf.tool_cfg.enable_palette &&
       av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                        mbmi->sb_type);
+                        mbmi->bsize);
   if (try_palette) {
     uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
-    rd_pick_palette_intra_sbuv(
+    av1_rd_pick_palette_intra_sbuv(
         cpi, x,
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
+        mode_costs
+            ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
         best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
         distortion, skippable);
   }
@@ -1526,28 +584,30 @@
   return best_rd;
 }
 
-int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
-                            RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx,
-                            BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
-                            PALETTE_MODE_INFO *const pmi,
-                            unsigned int *ref_costs_single,
-                            IntraModeSearchState *intra_search_state,
+// Searches palette mode for luma channel in inter frame.
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
                             int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
+  MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   int rate2 = 0;
   int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
           best_model_rd_palette = INT64_MAX;
   int skippable = 0;
-  TX_SIZE uv_tx = TX_4X4;
   uint8_t *const best_palette_color_map =
       x->palette_buffer->best_palette_color_map;
   uint8_t *const color_map = xd->plane[0].color_index_map;
   MB_MODE_INFO best_mbmi_palette = *mbmi;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *const intra_mode_cost =
+      mode_costs->mbmode_cost[size_group_lookup[bsize]];
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
 
@@ -1555,37 +615,44 @@
   mbmi->uv_mode = UV_DC_PRED;
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
+  av1_zero(pmi->palette_size);
+
   RD_STATS rd_stats_y;
   av1_invalid_rd_stats(&rd_stats_y);
-  rd_pick_palette_intra_sby(
+  av1_rd_pick_palette_intra_sby(
       cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
       best_palette_color_map, &best_rd_palette, &best_model_rd_palette,
-      &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip, NULL, ctx,
-      best_blk_skip, best_tx_type_map);
+      &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+      ctx, best_blk_skip, best_tx_type_map);
   if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
     this_rd_cost->rdcost = INT64_MAX;
     return skippable;
   }
 
-  memcpy(x->blk_skip, best_blk_skip,
+  memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
          sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
   av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
   memcpy(color_map, best_palette_color_map,
          rows * cols * sizeof(best_palette_color_map[0]));
 
-  skippable = rd_stats_y.skip;
+  skippable = rd_stats_y.skip_txfm;
   distortion2 = rd_stats_y.dist;
-  rate2 = rd_stats_y.rate + ref_costs_single[INTRA_FRAME];
+  rate2 = rd_stats_y.rate + ref_frame_cost;
   if (num_planes > 1) {
-    uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
     if (intra_search_state->rate_uv_intra == INT_MAX) {
-      choose_intra_uv_mode(
-          cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra,
-          &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs,
-          &intra_search_state->skip_uvs, &intra_search_state->mode_uv);
+      // We have not found any good uv mode yet, so we need to search for it.
+      TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+      av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+                                  &intra_search_state->rate_uv_tokenonly,
+                                  &intra_search_state->dist_uvs,
+                                  &intra_search_state->skip_uvs, bsize, uv_tx);
+      intra_search_state->mode_uv = mbmi->uv_mode;
       intra_search_state->pmi_uv = *pmi;
       intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
     }
+
+    // We have found at least one good uv mode before, so copy and paste it
+    // over.
     mbmi->uv_mode = intra_search_state->mode_uv;
     pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
     if (pmi->palette_size[1] > 0) {
@@ -1602,9 +669,9 @@
   if (skippable) {
     rate2 -= rd_stats_y.rate;
     if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly;
-    rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
+    rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1];
   } else {
-    rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
+    rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
   }
   this_rd = RDCOST(x->rdmult, rate2, distortion2);
   this_rd_cost->rate = rate2;
@@ -1613,7 +680,15 @@
   return skippable;
 }
 
-// Given selected prediction mode, search for the best tx type and size.
+/*!\brief Get the intra prediction by searching through tx_type and tx_size.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Currently this function is only used in the intra frame code path for
+ * winner-mode processing.
+ *
+ * \return Returns whether the current mode is an improvement over best_rd.
+ */
 static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                                       BLOCK_SIZE bsize, const int *bmode_costs,
                                       int64_t *best_rd, int *rate,
@@ -1628,7 +703,7 @@
   av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
   if (rd_stats.rate == INT_MAX) return 0;
   int this_rate_tokenonly = rd_stats.rate;
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
     // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
     // in the tokenonly rate, but for intra blocks, tx_size is always coded
     // (prediction granularity), so we account for it in the full rate,
@@ -1645,16 +720,24 @@
     *rate = this_rate;
     *rate_tokenonly = this_rate_tokenonly;
     *distortion = rd_stats.dist;
-    *skippable = rd_stats.skip;
-    av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk);
+    *skippable = rd_stats.skip_txfm;
+    av1_copy_array(ctx->blk_skip, x->txfm_search_info.blk_skip,
+                   ctx->num_4x4_blk);
     av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
     return 1;
   }
   return 0;
 }
 
-// With given luma directional intra prediction mode, pick the best angle delta
-// Return the RD cost corresponding to the best angle delta.
+/*!\brief Search for the best angle delta for luma prediction
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Given a luma directional intra prediction mode, this function will try to
+ * estimate the best delta_angle.
+ *
+ * \return Returns the new rdcost of the best intra angle.
+ */
 static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int *rate, RD_STATS *rd_stats,
                                        BLOCK_SIZE bsize, int mode_cost,
@@ -1714,53 +797,124 @@
     mbmi->tx_size = best_tx_size;
     mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
     const int n4 = bsize_to_num_blk(bsize);
-    memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+    memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+           sizeof(best_blk_skip[0]) * n4);
     av1_copy_array(xd->tx_type_map, best_tx_type_map, n4);
   }
   return best_rd;
 }
 
-int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state,
-                              const AV1_COMP *cpi, MACROBLOCK *x,
-                              BLOCK_SIZE bsize, int ref_frame_cost,
-                              const PICK_MODE_CONTEXT *ctx, int disable_skip,
-                              RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-                              RD_STATS *rd_stats_uv, int64_t best_rd,
-                              int64_t *best_intra_rd, int8_t best_mbmode_skip) {
+/*!\brief Search for the best filter_intra mode when coding inter frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \return Returns nothing, but updates the mbmi and rd_stats.
+ */
+static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+                                            BLOCK_SIZE bsize,
+                                            const PICK_MODE_CONTEXT *ctx,
+                                            RD_STATS *rd_stats_y, int mode_cost,
+                                            int64_t best_rd,
+                                            int64_t best_rd_so_far) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->mode == DC_PRED &&
+         av1_filter_intra_allowed_bsize(&cpi->common, bsize));
+
+  RD_STATS rd_stats_y_fi;
+  int filter_intra_selected_flag = 0;
+  TX_SIZE best_tx_size = mbmi->tx_size;
+  FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+         sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+  mbmi->filter_intra_mode_info.use_filter_intra = 1;
+  for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES;
+       ++fi_mode) {
+    mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd);
+    if (rd_stats_y_fi.rate == INT_MAX) continue;
+    const int this_rate_tmp =
+        rd_stats_y_fi.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+    const int64_t this_rd_tmp =
+        RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
+    if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) {
+      break;
+    }
+    if (this_rd_tmp < best_rd_so_far) {
+      best_tx_size = mbmi->tx_size;
+      av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+      memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+             sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+      best_fi_mode = fi_mode;
+      *rd_stats_y = rd_stats_y_fi;
+      filter_intra_selected_flag = 1;
+      best_rd_so_far = this_rd_tmp;
+    }
+  }
+
+  mbmi->tx_size = best_tx_size;
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+  memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+         sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+
+  if (filter_intra_selected_flag) {
+    mbmi->filter_intra_mode_info.use_filter_intra = 1;
+    mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+  } else {
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  }
+}
+
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y) {
   const AV1_COMMON *cm = &cpi->common;
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(mbmi->ref_frame[0] == INTRA_FRAME);
   const PREDICTION_MODE mode = mbmi->mode;
+  const ModeCosts *mode_costs = &x->mode_costs;
   const int mode_cost =
-      x->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost;
+      mode_costs->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+
+  int known_rate = mode_cost;
   const int intra_cost_penalty = av1_get_intra_cost_penalty(
       cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
       cm->seq_params.bit_depth);
-  const int skip_ctx = av1_get_skip_context(xd);
 
-  int known_rate = mode_cost;
-  known_rate += ref_frame_cost;
   if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty;
-  known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+  known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0],
+                       mode_costs->skip_txfm_cost[skip_ctx][1]);
   const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
   if (known_rd > best_rd) {
     intra_search_state->skip_intra_modes = 1;
-    return INT64_MAX;
+    return 0;
   }
 
   const int is_directional_mode = av1_is_directional_mode(mode);
   if (is_directional_mode && av1_use_angle_delta(bsize) &&
-      cpi->oxcf.enable_angle_delta) {
+      cpi->oxcf.intra_mode_cfg.enable_angle_delta) {
     if (sf->intra_sf.intra_pruning_with_hog &&
-        !intra_search_state->angle_stats_ready) {
-      prune_intra_mode_with_hog(x, bsize,
-                                cpi->sf.intra_sf.intra_pruning_with_hog_thresh,
-                                intra_search_state->directional_mode_skip_mask);
-      intra_search_state->angle_stats_ready = 1;
+        !intra_search_state->dir_mode_skip_mask_ready) {
+      const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f };
+      const int is_chroma = 0;
+      prune_intra_mode_with_hog(
+          x, bsize, thresh[sf->intra_sf.intra_pruning_with_hog - 1],
+          intra_search_state->directional_mode_skip_mask, is_chroma);
+      intra_search_state->dir_mode_skip_mask_ready = 1;
     }
-    if (intra_search_state->directional_mode_skip_mask[mode]) return INT64_MAX;
+    if (intra_search_state->directional_mode_skip_mask[mode]) return 0;
     av1_init_rd_stats(rd_stats_y);
     rd_stats_y->rate = INT_MAX;
     int64_t model_rd = INT64_MAX;
@@ -1776,169 +930,96 @@
 
   // Pick filter intra modes.
   if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
-    int try_filter_intra = 0;
+    int try_filter_intra = 1;
     int64_t best_rd_so_far = INT64_MAX;
     if (rd_stats_y->rate != INT_MAX) {
-      const int tmp_rate =
-          rd_stats_y->rate + x->filter_intra_cost[bsize][0] + mode_cost;
+      const int tmp_rate = rd_stats_y->rate +
+                           mode_costs->filter_intra_cost[bsize][0] + mode_cost;
       best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
       try_filter_intra = (best_rd_so_far / 2) <= best_rd;
-    } else {
-      try_filter_intra = !best_mbmode_skip;
     }
 
     if (try_filter_intra) {
-      RD_STATS rd_stats_y_fi;
-      int filter_intra_selected_flag = 0;
-      TX_SIZE best_tx_size = mbmi->tx_size;
-      FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
-      uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-      memcpy(best_blk_skip, x->blk_skip,
-             sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-      uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-      av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-      mbmi->filter_intra_mode_info.use_filter_intra = 1;
-      for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
-           fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
-        mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
-        av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize,
-                                          best_rd);
-        if (rd_stats_y_fi.rate == INT_MAX) continue;
-        const int this_rate_tmp =
-            rd_stats_y_fi.rate +
-            intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
-        const int64_t this_rd_tmp =
-            RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
-
-        if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) {
-          break;
-        }
-        if (this_rd_tmp < best_rd_so_far) {
-          best_tx_size = mbmi->tx_size;
-          av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-          memcpy(best_blk_skip, x->blk_skip,
-                 sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-          best_fi_mode = fi_mode;
-          *rd_stats_y = rd_stats_y_fi;
-          filter_intra_selected_flag = 1;
-          best_rd_so_far = this_rd_tmp;
-        }
-      }
-
-      mbmi->tx_size = best_tx_size;
-      av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
-      memcpy(x->blk_skip, best_blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-
-      if (filter_intra_selected_flag) {
-        mbmi->filter_intra_mode_info.use_filter_intra = 1;
-        mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
-      } else {
-        mbmi->filter_intra_mode_info.use_filter_intra = 0;
-      }
+      handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost,
+                               best_rd, best_rd_so_far);
     }
   }
 
-  if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
+  if (rd_stats_y->rate == INT_MAX) return 0;
 
-  const int mode_cost_y =
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
-  av1_init_rd_stats(rd_stats);
-  av1_init_rd_stats(rd_stats_uv);
-  const int num_planes = av1_num_planes(cm);
-  if (num_planes > 1) {
-    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-    const int try_palette =
-        cpi->oxcf.enable_palette &&
-        av1_allow_palette(cm->features.allow_screen_content_tools,
-                          mbmi->sb_type);
-    const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
-    if (intra_search_state->rate_uv_intra == INT_MAX) {
-      const int rate_y =
-          rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
-      const int64_t rdy =
-          RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
-      if (best_rd < (INT64_MAX / 2) && rdy > (best_rd + (best_rd >> 2))) {
-        intra_search_state->skip_intra_modes = 1;
-        return INT64_MAX;
-      }
-      choose_intra_uv_mode(
-          cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra,
-          &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs,
-          &intra_search_state->skip_uvs, &intra_search_state->mode_uv);
-      if (try_palette) intra_search_state->pmi_uv = *pmi;
-      intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
-
-      const int uv_rate = intra_search_state->rate_uv_tokenonly;
-      const int64_t uv_dist = intra_search_state->dist_uvs;
-      const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
-      if (uv_rd > best_rd) {
-        intra_search_state->skip_intra_modes = 1;
-        return INT64_MAX;
-      }
-    }
-
-    rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly;
-    rd_stats_uv->dist = intra_search_state->dist_uvs;
-    rd_stats_uv->skip = intra_search_state->skip_uvs;
-    rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip;
-    mbmi->uv_mode = intra_search_state->mode_uv;
-    if (try_palette) {
-      pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
-      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
-             intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
-             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
-    }
-    mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+  *mode_cost_y = intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+  const int rate_y = rd_stats_y->skip_txfm
+                         ? mode_costs->skip_txfm_cost[skip_ctx][1]
+                         : rd_stats_y->rate;
+  *rd_y = RDCOST(x->rdmult, rate_y + *mode_cost_y, rd_stats_y->dist);
+  if (best_rd < (INT64_MAX / 2) && *rd_y > (best_rd + (best_rd >> 2))) {
+    intra_search_state->skip_intra_modes = 1;
+    return 0;
   }
 
-  rd_stats->rate = rd_stats_y->rate + mode_cost_y;
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
-    // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
-    // in the tokenonly rate, but for intra blocks, tx_size is always coded
-    // (prediction granularity), so we account for it in the full rate,
-    // not the tokenonly rate.
-    rd_stats_y->rate -= tx_size_cost(x, bsize, mbmi->tx_size);
-  }
-  if (num_planes > 1 && xd->is_chroma_ref) {
-    const int uv_mode_cost =
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode];
-    rd_stats->rate +=
-        rd_stats_uv->rate +
-        intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
-  }
-  if (mode != DC_PRED && mode != PAETH_PRED) {
-    rd_stats->rate += intra_cost_penalty;
-  }
-
-  // Intra block is always coded as non-skip
-  rd_stats->skip = 0;
-  rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist;
-  // Add in the cost of the no skip flag.
-  rd_stats->rate += x->skip_cost[skip_ctx][0];
-  // Calculate the final RD estimate for this mode.
-  const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-  // Keep record of best intra rd
-  if (this_rd < *best_intra_rd) {
-    *best_intra_rd = this_rd;
-    intra_search_state->best_intra_mode = mode;
-  }
-
-  if (sf->intra_sf.skip_intra_in_interframe) {
-    if (best_rd < (INT64_MAX / 2) && this_rd > (best_rd + (best_rd >> 1)))
-      intra_search_state->skip_intra_modes = 1;
-  }
-
-  if (!disable_skip) {
-    for (int i = 0; i < REFERENCE_MODES; ++i) {
-      intra_search_state->best_pred_rd[i] =
-          AOMMIN(intra_search_state->best_pred_rd[i], this_rd);
-    }
-  }
-  return this_rd;
+  return 1;
 }
 
-// This function is used only for intra_only frames
+int av1_search_intra_uv_modes_in_interframe(
+    IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+    MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+    const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->ref_frame[0] == INTRA_FRAME);
+
+  // TODO(chiyotsai@google.com): Consolidate the chroma search code here with
+  // the one in av1_search_palette_mode.
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int try_palette =
+      cpi->oxcf.tool_cfg.enable_palette &&
+      av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize);
+
+  assert(intra_search_state->rate_uv_intra == INT_MAX);
+  if (intra_search_state->rate_uv_intra == INT_MAX) {
+    // If no good uv-predictor had been found, search for it.
+    const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+    av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+                                &intra_search_state->rate_uv_tokenonly,
+                                &intra_search_state->dist_uvs,
+                                &intra_search_state->skip_uvs, bsize, uv_tx);
+    intra_search_state->mode_uv = mbmi->uv_mode;
+    if (try_palette) intra_search_state->pmi_uv = *pmi;
+    intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+
+    const int uv_rate = intra_search_state->rate_uv_tokenonly;
+    const int64_t uv_dist = intra_search_state->dist_uvs;
+    const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+    if (uv_rd > best_rd) {
+      // If there is no good intra uv-mode available, we can skip all intra
+      // modes.
+      intra_search_state->skip_intra_modes = 1;
+      return 0;
+    }
+  }
+
+  // If we are here, then the encoder has found at least one good intra uv
+  // predictor, so we can directly copy its statistics over.
+  // TODO(any): the stats here is not right if the best uv mode is CFL but the
+  // best y mode is palette.
+  rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly;
+  rd_stats_uv->dist = intra_search_state->dist_uvs;
+  rd_stats_uv->skip_txfm = intra_search_state->skip_uvs;
+  rd_stats->skip_txfm = rd_stats_y->skip_txfm && rd_stats_uv->skip_txfm;
+  mbmi->uv_mode = intra_search_state->mode_uv;
+  if (try_palette) {
+    pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+           intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+           2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+  }
+  mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+
+  return 1;
+}
+
+// Finds the best non-intrabc mode on an intra frame.
 int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                    int *rate, int *rate_tokenonly,
                                    int64_t *distortion, int *skippable,
@@ -1956,9 +1037,9 @@
   const int *bmode_costs;
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int try_palette =
-      cpi->oxcf.enable_palette &&
+      cpi->oxcf.tool_cfg.enable_palette &&
       av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                        mbmi->sb_type);
+                        mbmi->bsize);
   uint8_t *best_palette_color_map =
       try_palette ? x->palette_buffer->best_palette_color_map : NULL;
   const MB_MODE_INFO *above_mi = xd->above_mbmi;
@@ -1967,13 +1048,17 @@
   const PREDICTION_MODE L = av1_left_block_mode(left_mi);
   const int above_ctx = intra_mode_context[A];
   const int left_ctx = intra_mode_context[L];
-  bmode_costs = x->y_mode_costs[above_ctx][left_ctx];
+  bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
 
   mbmi->angle_delta[PLANE_TYPE_Y] = 0;
   if (cpi->sf.intra_sf.intra_pruning_with_hog) {
-    prune_intra_mode_with_hog(x, bsize,
-                              cpi->sf.intra_sf.intra_pruning_with_hog_thresh,
-                              directional_mode_skip_mask);
+    // Less aggressive thresholds are used here than those used in inter frame
+    // encoding.
+    const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f };
+    const int is_chroma = 0;
+    prune_intra_mode_with_hog(
+        x, bsize, thresh[cpi->sf.intra_sf.intra_pruning_with_hog - 1],
+        directional_mode_skip_mask, is_chroma);
   }
   mbmi->filter_intra_mode_info.use_filter_intra = 0;
   pmi->palette_size[0] = 0;
@@ -1985,18 +1070,20 @@
   av1_zero(x->winner_mode_stats);
   x->winner_mode_count = 0;
 
-  /* Y Search for intra prediction mode */
+  // Searches the intra-modes except for intrabc, palette, and filter_intra.
   for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
     RD_STATS this_rd_stats;
     int this_rate, this_rate_tokenonly, s;
     int64_t this_distortion, this_rd;
     mbmi->mode = intra_rd_search_mode_order[mode_idx];
-    if ((!cpi->oxcf.enable_smooth_intra ||
+    if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
          cpi->sf.intra_sf.disable_smooth_intra) &&
         (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
          mbmi->mode == SMOOTH_V_PRED))
       continue;
-    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
+    if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra &&
+        mbmi->mode == PAETH_PRED)
+      continue;
     mbmi->angle_delta[PLANE_TYPE_Y] = 0;
 
     if (model_intra_yrd_and_prune(cpi, x, bsize, bmode_costs[mbmi->mode],
@@ -2007,22 +1094,26 @@
     is_directional_mode = av1_is_directional_mode(mbmi->mode);
     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
     if (is_directional_mode && av1_use_angle_delta(bsize) &&
-        cpi->oxcf.enable_angle_delta) {
+        cpi->oxcf.intra_mode_cfg.enable_angle_delta) {
+      // Searches through the best angle_delta if this option is available.
       this_rd_stats.rate = INT_MAX;
       rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
                               bmode_costs[mbmi->mode], best_rd, &best_model_rd,
                               1);
     } else {
+      // Builds the actual prediction. The prediction from
+      // model_intra_yrd_and_prune was just an estimation that did not take into
+      // account the effect of txfm pipeline, so we need to redo it for real
+      // here.
       av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
     }
     this_rate_tokenonly = this_rd_stats.rate;
     this_distortion = this_rd_stats.dist;
-    s = this_rd_stats.skip;
+    s = this_rd_stats.skip_txfm;
 
     if (this_rate_tokenonly == INT_MAX) continue;
 
-    if (!xd->lossless[mbmi->segment_id] &&
-        block_signals_txsize(mbmi->sb_type)) {
+    if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
       // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
       // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
       // coded (prediction granularity), so we account for it in the full rate,
@@ -2037,8 +1128,7 @@
     const int txfm_search_done = 1;
     store_winner_mode_stats(
         &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
-        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-        txfm_search_done);
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
     if (this_rd < best_rd) {
       best_mbmi = *mbmi;
       best_rd = this_rd;
@@ -2049,19 +1139,21 @@
       *rate_tokenonly = this_rate_tokenonly;
       *distortion = this_distortion;
       *skippable = s;
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+             sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
       av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
     }
   }
 
+  // Searches palette
   if (try_palette) {
-    rd_pick_palette_intra_sby(
+    av1_rd_pick_palette_intra_sby(
         cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
         &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable,
         &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map);
   }
 
+  // Searches filter_intra
   if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
     if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
                                  skippable, bsize, bmode_costs[DC_PRED],
@@ -2069,6 +1161,7 @@
       best_mbmi = *mbmi;
     }
   }
+
   // No mode is identified with less rd value than best_rd passed to this
   // function. In such cases winner mode processing is not necessary and return
   // best_rd as INT64_MAX to indicate best mode is not identified
@@ -2077,7 +1170,7 @@
   // In multi-winner mode processing, perform tx search for few best modes
   // identified during mode evaluation. Winner mode processing uses best tx
   // configuration for tx search.
-  if (cpi->sf.winner_mode_sf.enable_multiwinner_mode_process) {
+  if (cpi->sf.winner_mode_sf.multi_winner_mode_type) {
     int best_mode_idx = 0;
     int block_width, block_height;
     uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map;
diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h
index 4b5d31c..cc2a87b0 100644
--- a/av1/encoder/intra_mode_search.h
+++ b/av1/encoder/intra_mode_search.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares high level functions to search through intra modes.
+ */
 #ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
 #define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
 
@@ -18,46 +21,249 @@
 extern "C" {
 #endif
 
+/*! \brief Variables related to intra-mode search during inter frame coding.
+ *
+ * \ingroup intra_mode_search
+ * This is a set of variables used during intra-mode search for inter frames.
+ * This includes an histogram of gradient speed features and a cache of uv
+ * prediction to avoid repeated search of chroma prediction.
+ */
 typedef struct IntraModeSearchState {
-  int skip_intra_modes;
+  /*!
+   * \brief The best luma intra-mode found so far
+   */
   PREDICTION_MODE best_intra_mode;
-  int angle_stats_ready;
+
+  /** \name Speed feature variables
+   * Variables to help with pruning some luma intra-modes during inter frame
+   * coding process.
+   */
+  /**@{*/
+  /*!
+   * \brief Whether to terminate all intra mode search.
+   */
+  int skip_intra_modes;
+  /*!
+   * \brief Whether a directional mode is pruned.
+   */
   uint8_t directional_mode_skip_mask[INTRA_MODES];
-  int rate_uv_intra;
-  int rate_uv_tokenonly;
-  int64_t dist_uvs;
-  int skip_uvs;
-  UV_PREDICTION_MODE mode_uv;
-  PALETTE_MODE_INFO pmi_uv;
-  int8_t uv_angle_delta;
-  int64_t best_pred_rd[REFERENCE_MODES];
+  /*!
+   * \brief Whether \ref directional_mode_skip_mask is valid for pruning.
+   */
+  int dir_mode_skip_mask_ready;
+  /**@}*/
+
+  /** \name Chroma mode search cache
+   * A cache of the best chroma prediction mode to avoid having to search for
+   * chroma predictions repeatedly in \ref
+   * av1_search_intra_uv_modes_in_interframe()
+   */
+  /**@{*/
+  int rate_uv_intra;          /*!< \brief Total rate to transmit uv_mode */
+  int rate_uv_tokenonly;      /*!< \brief Rate transmit txfm tokens */
+  int64_t dist_uvs;           /*!< \brief Distortion of the uv_mode's recon */
+  int skip_uvs;               /*!< \brief Whether the uv txfm is skippable */
+  UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */
+  PALETTE_MODE_INFO pmi_uv;   /*!< \brief Color map if mode_uv is palette */
+  int8_t uv_angle_delta;      /*!< \brief Angle delta if mode_uv directional */
+  /**@}*/
 } IntraModeSearchState;
 
-void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x);
-int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
-                            RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx,
-                            BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
-                            PALETTE_MODE_INFO *const pmi,
-                            unsigned int *ref_costs_single,
-                            IntraModeSearchState *intra_search_state,
+/*!\brief Evaluate a given luma intra-mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles an intra-mode luma prediction when the current frame
+ * is an inter frame. This is the intra-mode counterpart of handle_inter_mode.
+ * This function performs an intra luma prediction using the mode specified by
+ * x->e_mbd.mi[0]->mode. This function does *not* support palette mode
+ * prediction in the luma channel.
+ *
+ * \param[in,out]    intra_search_state Structure to intra search state.
+ * \param[in]        cpi                Top-level encoder structure.
+ * \param[in,out]    x                  Pointer to structure holding all the
+ *                                      data for the current macroblock.
+ * \param[in]        bsize              Current partition block size.
+ * \param[in]        ref_frame_cost     The entropy cost for signaling that the
+ *                                      current ref frame is an intra frame.
+ * \param[in]        ctx                Structure to hold the number of 4x4 blks
+ *                                      to copy tx_type and txfm_skip arrays.
+ * \param[out]       rd_stats_y         Struct to keep track of the current
+ *                                      intra-mode's rd_stats (luma only).
+ * \param[in]        best_rd            Best RD seen for this block so far.
+ * \param[out]       mode_cost_y        The cost needed to signal the current
+ *                                      intra mode.
+ * \param[out]       rd_y               The rdcost of the chosen mode.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and
+ * rd_y are also updated. Moreover, in the first evaluation with directional
+ * mode, a prune_mask computed with histogram of gradient is also stored in
+ * intra_search_state.
+ */
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y);
+
+/*!\brief Search through all chroma intra-modes for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles intra-mode chroma prediction when the current frame
+ * is an inter frame. This is done by calling \ref av1_rd_pick_intra_sbuv_mode
+ * with some additional book-keeping.
+ *
+ * \param[in,out]    intra_search_state Structure to intra search state.
+ * \param[in]        cpi                Top-level encoder structure.
+ * \param[in,out]    x                  Pointer to structure holding all the
+ *                                      data for the current macroblock.
+ * \param[in]        bsize              Current partition block size.
+ * \param[out]       rd_stats           Struct to keep track of the current
+ *                                      intra-mode's rd_stats (all planes).
+ * \param[out]       rd_stats_y         Struct to keep track of the current
+ *                                      intra-mode's rd_stats (luma only).
+ * \param[out]       rd_stats_uv        Struct to keep track of the current
+ *                                      intra-mode's rd_stats (chroma only).
+ * \param[in]        best_rd            Best RD seen for this block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats(_y|_uv)  are also
+ * updated. Moreover, in the first evocation of the function, the chroma intra
+ * mode result is cached in intra_search_state to be used in subsequent calls.
+ */
+int av1_search_intra_uv_modes_in_interframe(
+    IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+    MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+    const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in]    intra_search_state Structure to hold the best luma intra mode
+ *                                  and cache chroma prediction for speed up.
+ * \param[in]    cpi                Top-level encoder structure.
+ * \param[in]    x                  Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]    bsize              Current partition block size.
+ * \param[in]    ref_frame_cost     The entropy cost for signaling that the
+ *                                  current ref frame is an intra frame.
+ * \param[in]    ctx                Structure to hold the number of 4x4 blks to
+ *                                  copy the tx_type and txfm_skip arrays.
+ * \param[in]    this_rd_cost       Struct to keep track of palette mode's
+ *                                  rd_stats.
+ * \param[in]    best_rd            Best RD seen for this block so far.
+ *
+ * \return Returns whether luma palette mode can skip the txfm. The
+ * corresponding mbmi, this_rd_costs, intra_search_state, and tx_type arrays in
+ * ctx are also updated.
+ */
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
                             int64_t best_rd);
 
-int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    int *rate, int *rate_tokenonly,
-                                    int64_t *distortion, int *skippable,
-                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
-
-int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state,
-                              const AV1_COMP *cpi, MACROBLOCK *x,
-                              BLOCK_SIZE bsize, int ref_frame_cost,
-                              const PICK_MODE_CONTEXT *ctx, int disable_skip,
-                              RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-                              RD_STATS *rd_stats_uv, int64_t best_rd,
-                              int64_t *best_intra_rd, int8_t best_mbmode_skip);
-
+/*!\brief Perform intra-mode search on luma channels for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function performs intra-mode search on the luma channel when the
+ * current frame is intra-only. This function does not search intrabc mode,
+ * but it does search palette and filter_intra.
+ *
+ * \param[in]    cpi                Top-level encoder structure.
+ * \param[in]    x                  Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]    rate               The total rate needed to predict the current
+ *                                  chroma block.
+ * \param[in]    rate_tokenonly     The rate without the cost of sending the
+ *                                  prediction modes.
+ *                                  chroma block.
+ *                                  after the reconstruction.
+ * \param[in]    distortion         The chroma distortion of the best prediction
+ *                                  after the reconstruction.
+ * \param[in]    skippable          Whether we can skip txfm process.
+ * \param[in]    bsize              Current partition block size.
+ * \param[in]    best_rd            Best RD seen for this block so far.
+ * \param[in]    ctx                Structure to hold the number of 4x4 blks to
+ *                                  copy the tx_type and txfm_skip arrays.
+ *
+ * \return Returns the rd_cost if this function finds a mode better than
+ * best_rd, otherwise returns INT64_MAX. This also updates the mbmi, the rate
+ * and distortion, and the tx_type arrays in ctx.
+ */
 int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                    int *rate, int *rate_tokenonly,
                                    int64_t *distortion, int *skippable,
                                    BLOCK_SIZE bsize, int64_t best_rd,
                                    PICK_MODE_CONTEXT *ctx);
+
+/*!\brief Perform intra-mode search on chroma channels.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function performs intra-mode search on the chroma channels. Just like
+ * \ref av1_rd_pick_intra_sby_mode(), this function searches over palette mode
+ * (filter_intra is not available on chroma planes). Unlike \ref
+ * av1_rd_pick_intra_sby_mode() this function is used by both inter and intra
+ * frames.
+ *
+ * \param[in]    cpi                Top-level encoder structure.
+ * \param[in]    x                  Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]    rate               The total rate needed to predict the current
+ *                                  chroma block.
+ * \param[in]    rate_tokenonly     The rate without the cost of sending the
+ *                                  prediction modes.
+ *                                  chroma block.
+ *                                  after the reconstruction.
+ * \param[in]    distortion         The chroma distortion of the best prediction
+ *                                  after the reconstruction.
+ * \param[in]    skippable          Whether we can skip txfm process.
+ * \param[in]    bsize              Current partition block size.
+ * \param[in]    max_tx_size        The maximum tx_size available
+ *
+ * \return Returns the rd_cost of the best uv mode found. This also updates the
+ * mbmi, the rate and distortion, distortion.
+ */
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+/*! \brief Return the number of colors in src. Used by palette mode.
+ */
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                      int *val_count, int *num_colors);
+
+/*! \brief See \ref av1_count_colors(), but for highbd.
+ */
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth, int *val_count,
+                             int *val_count_8bit, int *num_color_bins,
+                             int *num_colors);
+
+/*! \brief Initializes the \ref IntraModeSearchState struct.
+ */
+static AOM_INLINE void init_intra_mode_search_state(
+    IntraModeSearchState *intra_search_state) {
+  memset(intra_search_state, 0, sizeof(*intra_search_state));
+  intra_search_state->rate_uv_intra = INT_MAX;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
new file mode 100644
index 0000000..18ab41b
--- /dev/null
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Defines utility functions used in intra mode search.
+ *
+ * This includes rdcost estimations, histogram based pruning, etc.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+
+#include "av1/common/enums.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/palette.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define BINS 32
+static const float av1_intra_hog_model_bias[DIRECTIONAL_MODES] = {
+  0.450578f,  0.695518f,  -0.717944f, -0.639894f,
+  -0.602019f, -0.453454f, 0.055857f,  -0.465480f,
+};
+
+static const float av1_intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
+  -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
+  -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
+  -0.434156f, 0.322868f,  2.260546f,  3.368715f,  3.989290f,  3.308487f,
+  2.277893f,  0.923793f,  0.026412f,  -0.385174f, -0.718622f, -1.408867f,
+  -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
+  -2.985709f, -3.447155f, 3.758139f,  3.204353f,  2.170998f,  0.826587f,
+  -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
+  -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
+  -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
+  -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
+  -0.088058f, 0.753494f,  2.092413f,  3.215266f,  -3.300277f, -2.748658f,
+  -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
+  -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
+  -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
+  0.813112f,  1.702213f,  2.653045f,  3.351749f,  3.243554f,  3.199409f,
+  2.437856f,  1.468854f,  0.533039f,  -0.099065f, -0.622643f, -2.200732f,
+  -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f,  1.975043f,
+  3.179528f,  3.939064f,  3.454379f,  3.689386f,  3.116411f,  1.970991f,
+  0.798406f,  -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
+  -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
+  -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
+  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
+  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
+  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
+  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
+  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
+  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+static const NN_CONFIG av1_intra_hog_model_nnconfig = {
+  BINS,               // num_inputs
+  DIRECTIONAL_MODES,  // num_outputs
+  0,                  // num_hidden_layers
+  { 0 },
+  {
+      av1_intra_hog_model_weights,
+  },
+  {
+      av1_intra_hog_model_bias,
+  },
+};
+
+#define FIX_PREC_BITS (16)
+static AOM_INLINE int get_hist_bin_idx(int dx, int dy) {
+  const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx;
+
+  // Find index by bisection
+  static const int thresholds[BINS] = {
+    -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303,
+    -59392,   -48579,  -39272,  -30982,  -23445,  -16400,  -9715,  -3194,
+    3227,     9748,    16433,   23478,   31015,   39305,   48611,  59425,
+    72336,    88392,   109364,  138593,  183191,  261638,  441831, INT32_MAX
+  };
+
+  int lo_idx = 0, hi_idx = BINS - 1;
+  // Divide into segments of size 8 gives better performance than binary search
+  // here.
+  if (ratio <= thresholds[7]) {
+    lo_idx = 0;
+    hi_idx = 7;
+  } else if (ratio <= thresholds[15]) {
+    lo_idx = 8;
+    hi_idx = 15;
+  } else if (ratio <= thresholds[23]) {
+    lo_idx = 16;
+    hi_idx = 23;
+  } else {
+    lo_idx = 24;
+    hi_idx = 31;
+  }
+
+  for (int idx = lo_idx; idx <= hi_idx; idx++) {
+    if (ratio <= thresholds[idx]) {
+      return idx;
+    }
+  }
+  assert(0 && "No valid histogram bin found!");
+  return BINS - 1;
+}
+#undef FIX_PREC_BITS
+
+static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows,
+                                    int cols, float *hist) {
+  float total = 0.1f;
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint8_t *above = &src[c - stride];
+      const uint8_t *below = &src[c + stride];
+      const uint8_t *left = &src[c - 1];
+      const uint8_t *right = &src[c + 1];
+      // Calculate gradient using Sobel fitlers.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const int idx = get_hist_bin_idx(dx, dy);
+        assert(idx >= 0 && idx < BINS);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride,
+                                        int rows, int cols, float *hist) {
+  float total = 0.1f;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint16_t *above = &src[c - stride];
+      const uint16_t *below = &src[c + stride];
+      const uint16_t *left = &src[c - 1];
+      const uint16_t *right = &src[c + 1];
+      // Calculate gradient using Sobel fitlers.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const int idx = get_hist_bin_idx(dx, dy);
+        assert(idx >= 0 && idx < BINS);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                    int plane, float *hog) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const int bh = block_size_high[bsize];
+  const int bw = block_size_wide[bsize];
+  const int rows =
+      ((xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh) >>
+      ss_y;
+  const int cols =
+      ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >>
+      ss_x;
+  const int src_stride = x->plane[plane].src.stride;
+  const uint8_t *src = x->plane[plane].src.buf;
+  if (is_cur_buf_hbd(xd)) {
+    generate_hog_hbd(src, src_stride, rows, cols, hog);
+  } else {
+    generate_hog(src, src_stride, rows, cols, hog);
+  }
+
+  // Scale the hog so the luma and chroma are on the same scale
+  for (int b = 0; b < BINS; ++b) {
+    hog[b] *= (1 + ss_x) * (1 + ss_y);
+  }
+}
+
+static AOM_INLINE void prune_intra_mode_with_hog(
+    const MACROBLOCK *x, BLOCK_SIZE bsize, float th,
+    uint8_t *directional_mode_skip_mask, int is_chroma) {
+  aom_clear_system_state();
+
+  const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y;
+  float hist[BINS] = { 0.0f };
+  collect_hog_data(x, bsize, plane, hist);
+
+  // Make prediction for each of the mode
+  float scores[DIRECTIONAL_MODES] = { 0.0f };
+  aom_clear_system_state();
+  av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores);
+  for (UV_PREDICTION_MODE uv_mode = UV_V_PRED; uv_mode <= UV_D67_PRED;
+       uv_mode++) {
+    if (scores[uv_mode - UV_V_PRED] <= th) {
+      directional_mode_skip_mask[uv_mode] = 1;
+    }
+  }
+
+  aom_clear_system_state();
+}
+#undef BINS
+
+// Returns the cost needed to send a uniformly distributed r.v.
+static AOM_INLINE int write_uniform_cost(int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return 0;
+  if (v < m)
+    return av1_cost_literal(l - 1);
+  else
+    return av1_cost_literal(l);
+}
+/*!\endcond */
+
+/*!\brief Returns the rate cost for luma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi,
+                                             const MACROBLOCK *x,
+                                             const MB_MODE_INFO *mbmi,
+                                             BLOCK_SIZE bsize, int mode_cost) {
+  int total_rate = mode_cost;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
+  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+  const int use_intrabc = mbmi->use_intrabc;
+  // Can only activate one mode.
+  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+          use_filter_intra) <= 1);
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+  if (try_palette && mbmi->mode == DC_PRED) {
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+    const int mode_ctx = av1_get_palette_mode_ctx(xd);
+    total_rate +=
+        mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+    if (use_palette) {
+      const uint8_t *const color_map = xd->plane[0].color_index_map;
+      int block_width, block_height, rows, cols;
+      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                               &cols);
+      const int plt_size = mbmi->palette_mode_info.palette_size[0];
+      int palette_mode_cost =
+          mode_costs
+              ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+      palette_mode_cost +=
+          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
+                                   n_cache, cpi->common.seq_params.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+    total_rate += mode_costs->filter_intra_cost[mbmi->bsize][use_filter_intra];
+    if (use_filter_intra) {
+      total_rate +=
+          mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+                                                 .filter_intra_mode];
+    }
+  }
+  if (av1_is_directional_mode(mbmi->mode)) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
+                                      [MAX_ANGLE_DELTA +
+                                       mbmi->angle_delta[PLANE_TYPE_Y]];
+    }
+  }
+  if (av1_allow_intrabc(&cpi->common))
+    total_rate += mode_costs->intrabc_cost[use_intrabc];
+  return total_rate;
+}
+
+/*!\brief Return the rate cost for chroma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi,
+                                              const MACROBLOCK *x,
+                                              const MB_MODE_INFO *mbmi,
+                                              BLOCK_SIZE bsize, int mode_cost) {
+  int total_rate = mode_cost;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
+  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+  // Can only activate one mode.
+  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+  if (try_palette && mode == UV_DC_PRED) {
+    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+    total_rate +=
+        mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+    if (use_palette) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int plt_size = pmi->palette_size[1];
+      const MACROBLOCKD *xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      int palette_mode_cost =
+          mode_costs
+              ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_mode_cost += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(mode))) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          mode_costs->angle_delta_cost[mode - V_PRED]
+                                      [mbmi->angle_delta[PLANE_TYPE_UV] +
+                                       MAX_ANGLE_DELTA];
+    }
+  }
+  return total_rate;
+}
+
+/*!\cond */
+// Makes a quick luma prediction and estimate the rdcost with a model without
+// going through the whole txfm/quantize/itxfm process.
+static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               BLOCK_SIZE bsize, int mode_cost) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  RD_STATS this_rd_stats;
+  int row, col;
+  int64_t temp_sse, this_rd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  TX_SIZE tx_size =
+      tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+  const int stepr = tx_size_high_unit[tx_size];
+  const int stepc = tx_size_wide_unit[tx_size];
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  mbmi->tx_size = tx_size;
+  // Prediction.
+  for (row = 0; row < max_blocks_high; row += stepr) {
+    for (col = 0; col < max_blocks_wide; col += stepc) {
+      av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
+    }
+  }
+  // RD estimation.
+  model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? MODELRD_LEGACY
+                                                   : MODELRD_TYPE_INTRA](
+      cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist,
+      &this_rd_stats.skip_txfm, &temp_sse, NULL, NULL, NULL);
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    mode_cost += mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
+                                             [MAX_ANGLE_DELTA +
+                                              mbmi->angle_delta[PLANE_TYPE_Y]];
+  }
+  if (mbmi->mode == DC_PRED &&
+      av1_filter_intra_allowed_bsize(cm, mbmi->bsize)) {
+    if (mbmi->filter_intra_mode_info.use_filter_intra) {
+      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+      mode_cost += mode_costs->filter_intra_cost[mbmi->bsize][1] +
+                   mode_costs->filter_intra_mode_cost[mode];
+    } else {
+      mode_cost += mode_costs->filter_intra_cost[mbmi->bsize][0];
+    }
+  }
+  this_rd =
+      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
+  return this_rd;
+}
+/*!\endcond */
+
+/*!\brief Estimate the luma rdcost of a given intra mode and try to prune it.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function first makes a quick luma prediction and estimates the rdcost
+ * with a model without going through the txfm, then try to prune the current
+ * mode if the new estimate y_rd > 1.5 * best_model_rd.
+ *
+ * \return Returns 1 if the given mode is prune; 0 otherwise.
+ */
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int mode_info_cost,
+                                                int64_t *best_model_rd) {
+  const int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_info_cost);
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 1)) {
+    return 1;
+  } else if (this_model_rd < *best_model_rd) {
+    *best_model_rd = this_model_rd;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
diff --git a/av1/encoder/k_means_template.h b/av1/encoder/k_means_template.h
index 9e526b8..1998a8a 100644
--- a/av1/encoder/k_means_template.h
+++ b/av1/encoder/k_means_template.h
@@ -95,7 +95,11 @@
   int pre_centroids[2 * PALETTE_MAX_SIZE];
   uint8_t pre_indices[MAX_SB_SQUARE];
 
+#if AV1_K_MEANS_DIM - 2
+  av1_calc_indices_dim1(data, centroids, indices, n, k);
+#else
   RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+#endif
   int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
 
   for (int i = 0; i < max_itr; ++i) {
@@ -105,7 +109,11 @@
     memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
 
     RENAME(calc_centroids)(data, centroids, indices, n, k);
+#if AV1_K_MEANS_DIM - 2
+    av1_calc_indices_dim1(data, centroids, indices, n, k);
+#else
     RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+#endif
     this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
 
     if (this_dist > pre_dist) {
diff --git a/av1/encoder/level.c b/av1/encoder/level.c
index 3403a3a..7a74c46 100644
--- a/av1/encoder/level.c
+++ b/av1/encoder/level.c
@@ -1072,9 +1072,6 @@
 
   aom_clear_system_state();
   const double compression_ratio = av1_get_compression_ratio(cm, size);
-  const double total_time_encoded =
-      (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
-      (double)TICKS_PER_SEC;
 
   const int temporal_layer_id = cm->temporal_layer_id;
   const int spatial_layer_id = cm->spatial_layer_id;
@@ -1131,7 +1128,9 @@
           show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0;
       scan_past_frames(buffer, encoded_frames_in_last_second, level_spec,
                        level_stats);
-      level_stats->total_time_encoded = total_time_encoded;
+      level_stats->total_time_encoded +=
+          (cpi->time_stamps.prev_ts_end - cpi->time_stamps.prev_ts_start) /
+          (double)TICKS_PER_SEC;
     }
 
     DECODER_MODEL *const decoder_models = level_info->decoder_models;
diff --git a/av1/encoder/lookahead.c b/av1/encoder/lookahead.c
index 0f7c819..f2bcb27 100644
--- a/av1/encoder/lookahead.c
+++ b/av1/encoder/lookahead.c
@@ -84,7 +84,7 @@
   return NULL;
 }
 
-int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        aom_enc_frame_flags_t flags) {
   struct lookahead_entry *buf;
diff --git a/av1/encoder/lookahead.h b/av1/encoder/lookahead.h
index 03693d3..8861fd1 100644
--- a/av1/encoder/lookahead.h
+++ b/av1/encoder/lookahead.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Describes look ahead buffer operations.
+ */
 #ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
 #define AOM_AV1_ENCODER_LOOKAHEAD_H_
 
@@ -19,6 +22,7 @@
 extern "C" {
 #endif
 
+/*!\cond */
 #define MAX_LAG_BUFFERS 35
 #define MAX_LAP_BUFFERS 35
 #define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
@@ -49,6 +53,7 @@
   struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */
   struct lookahead_entry *buf;           /* Buffer list */
 };
+/*!\endcond */
 
 /**\brief Initializes the lookahead stage
  *
@@ -76,22 +81,22 @@
  * \param[in] src         Pointer to the image to enqueue
  * \param[in] ts_start    Timestamp for the start of this frame
  * \param[in] ts_end      Timestamp for the end of this frame
+ * \param[in] use_highbitdepth Tell if HBD is used
  * \param[in] flags       Flags set on this frame
- * \param[in] active_map  Map that specifies which macroblock is active
  */
-int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        aom_enc_frame_flags_t flags);
 
 /**\brief Get the next source buffer to encode
  *
- *
  * \param[in] ctx       Pointer to the lookahead context
  * \param[in] drain     Flag indicating the buffer should be drained
  *                      (return a buffer regardless of the current queue depth)
+ * \param[in] stage     Encoder stage
  *
- * \retval NULL, if drain set and queue is empty
- * \retval NULL, if drain not set and queue not of the configured depth
+ * \retval Return NULL, if drain set and queue is empty, or if drain not set and
+ * queue not of the configured depth.
  */
 struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
                                           COMPRESSOR_STAGE stage);
@@ -100,19 +105,20 @@
  *
  * \param[in] ctx       Pointer to the lookahead context
  * \param[in] index     Index of the frame to be returned, 0 == next frame
+ * \param[in] stage     Encoder stage
  *
- * \retval NULL, if no buffer exists at the specified index
+ * \retval Return NULL, if no buffer exists at the specified index
  */
 struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
                                            COMPRESSOR_STAGE stage);
 
 /**\brief Get the number of frames currently in the lookahead queue
- *
- * \param[in] ctx       Pointer to the lookahead context
  */
 unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
                                  COMPRESSOR_STAGE stage);
 
+/**\brief Get pop_sz value
+ */
 int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage);
 
 #ifdef __cplusplus
diff --git a/av1/encoder/mathutils.h b/av1/encoder/mathutils.h
index 64f9361..576de07 100644
--- a/av1/encoder/mathutils.h
+++ b/av1/encoder/mathutils.h
@@ -114,7 +114,7 @@
 // svdcmp
 // Adopted from Numerical Recipes in C
 
-static INLINE double sign(double a, double b) {
+static INLINE double apply_sign(double a, double b) {
   return ((b) >= 0 ? fabs(a) : -fabs(a));
 }
 
@@ -150,7 +150,7 @@
           s += u[k][i] * u[k][i];
         }
         f = u[i][i];
-        g = -sign(sqrt(s), f);
+        g = -apply_sign(sqrt(s), f);
         h = f * g - s;
         u[i][i] = f - g;
         for (j = l; j < n; j++) {
@@ -171,7 +171,7 @@
           s += u[i][k] * u[i][k];
         }
         f = u[i][l];
-        g = -sign(sqrt(s), f);
+        g = -apply_sign(sqrt(s), f);
         h = f * g - s;
         u[i][l] = f - g;
         for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
@@ -269,7 +269,7 @@
       h = rv1[k];
       f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
       g = pythag(f, 1.0);
-      f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x;
+      f = ((x - z) * (x + z) + h * ((y / (f + apply_sign(g, f))) - h)) / x;
       c = s = 1.0;
       for (j = l; j <= nm; j++) {
         i = j + 1;
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 43f7f5c..745f75b 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -33,15 +33,16 @@
 #include "av1/encoder/reconinter_enc.h"
 
 static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
-                                       const MACROBLOCK *x, const MV *ref_mv) {
+                                       const MvCosts *mv_costs,
+                                       const MV *ref_mv) {
   mv_cost_params->ref_mv = ref_mv;
   mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
-  mv_cost_params->error_per_bit = x->errorperbit;
-  mv_cost_params->sad_per_bit = x->sadperbit;
-  mv_cost_params->mvjcost = x->nmv_vec_cost;
-  mv_cost_params->mvcost[0] = x->mv_cost_stack[0];
-  mv_cost_params->mvcost[1] = x->mv_cost_stack[1];
-  mv_cost_params->mv_cost_type = x->mv_cost_type;
+  mv_cost_params->mv_cost_type = MV_COST_ENTROPY;
+  mv_cost_params->error_per_bit = mv_costs->errorperbit;
+  mv_cost_params->sad_per_bit = mv_costs->sadperbit;
+  mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
+  mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0];
+  mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1];
 }
 
 static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
@@ -50,38 +51,82 @@
 
   av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
 
-  ms_buffers->wsrc = x->wsrc_buf;
-  ms_buffers->obmc_mask = x->mask_buf;
+  ms_buffers->wsrc = x->obmc_buffer.wsrc;
+  ms_buffers->obmc_mask = x->obmc_buffer.mask;
+}
+
+static AOM_INLINE SEARCH_METHODS
+get_faster_search_method(SEARCH_METHODS search_method) {
+  // Note on search method's accuracy:
+  //  1. NSTEP
+  //  2. DIAMOND
+  //  3. BIGDIA \approx SQUARE
+  //  4. HEX.
+  //  5. FAST_HEX \approx FAST_DIAMOND
+  switch (search_method) {
+    case NSTEP: return DIAMOND;
+    case NSTEP_8PT: return DIAMOND;
+    case DIAMOND: return BIGDIA;
+    case CLAMPED_DIAMOND: return BIGDIA;
+    case BIGDIA: return HEX;
+    case SQUARE: return HEX;
+    case HEX: return FAST_HEX;
+    case FAST_HEX: return FAST_HEX;
+    case FAST_DIAMOND: return FAST_DIAMOND;
+    case FAST_BIGDIA: return FAST_BIGDIA;
+    default: assert(0 && "Invalid search method!"); return DIAMOND;
+  }
 }
 
 void av1_make_default_fullpel_ms_params(
     FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
     const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
-    const search_site_config *search_sites) {
+    const search_site_config search_sites[NUM_SEARCH_METHODS],
+    int fine_search_interval) {
+  const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+
   // High level params
   ms_params->bsize = bsize;
   ms_params->vfp = &cpi->fn_ptr[bsize];
 
   init_ms_buffers(&ms_params->ms_buffers, x);
 
-  ms_params->search_method = cpi->sf.mv_sf.search_method;
-  ms_params->search_sites = search_sites;
+  SEARCH_METHODS search_method = mv_sf->search_method;
+  if (mv_sf->use_bsize_dependent_search_method) {
+    const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
+    if (min_dim >= 32) {
+      search_method = get_faster_search_method(search_method);
+    }
+  }
 
-  ms_params->mesh_patterns[0] = cpi->sf.mv_sf.mesh_patterns;
-  ms_params->mesh_patterns[1] = cpi->sf.mv_sf.intrabc_mesh_patterns;
-  ms_params->force_mesh_thresh = cpi->sf.mv_sf.exhaustive_searches_thresh;
-  ms_params->prune_mesh_search = cpi->sf.mv_sf.prune_mesh_search;
+  av1_set_mv_search_method(ms_params, search_sites, search_method);
+
+  const int use_downsampled_sad =
+      mv_sf->use_downsampled_sad && block_size_high[bsize] >= 16;
+  if (use_downsampled_sad) {
+    ms_params->sdf = ms_params->vfp->sdsf;
+    ms_params->sdx4df = ms_params->vfp->sdsx4df;
+  } else {
+    ms_params->sdf = ms_params->vfp->sdf;
+    ms_params->sdx4df = ms_params->vfp->sdx4df;
+  }
+
+  ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
+  ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
+  ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
+  ms_params->prune_mesh_search = mv_sf->prune_mesh_search;
   ms_params->run_mesh_search = 0;
+  ms_params->fine_search_interval = fine_search_interval;
 
   ms_params->is_intra_mode = 0;
 
-  ms_params->fast_obmc_search = cpi->sf.mv_sf.obmc_full_pixel_search_level;
+  ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level;
 
   ms_params->mv_limits = x->mv_limits;
   av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
 
   // Mvcost params
-  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+  init_mv_cost_params(&ms_params->mv_cost_params, &x->mv_costs, ref_mv);
 }
 
 void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
@@ -98,7 +143,7 @@
   av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
 
   // Mvcost params
-  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+  init_mv_cost_params(&ms_params->mv_cost_params, &x->mv_costs, ref_mv);
 
   // Subpel variance params
   ms_params->var_params.vfp = &cpi->fn_ptr[bsize];
@@ -270,18 +315,24 @@
 #define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
 #define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
 
-void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
-  int ss_count = 0;
+// Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods.
+// level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+                                    int level) {
+  int num_search_steps = 0;
   int stage_index = MAX_MVSEARCH_STEPS - 1;
 
-  cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0;
-  cfg->ss[stage_index][0].offset = 0;
+  cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+  cfg->site[stage_index][0].offset = 0;
   cfg->stride = stride;
 
-  for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
+  // Choose the initial step size depending on level.
+  const int first_step = (level > 0) ? (MAX_FIRST_STEP / 4) : MAX_FIRST_STEP;
+
+  for (int radius = first_step; radius > 0;) {
     int num_search_pts = 8;
 
-    const FULLPEL_MV ss_mvs[13] = {
+    const FULLPEL_MV search_site_mvs[13] = {
       { 0, 0 },           { -radius, 0 },      { radius, 0 },
       { 0, -radius },     { 0, radius },       { -radius, -radius },
       { radius, radius }, { -radius, radius }, { radius, -radius },
@@ -289,24 +340,26 @@
 
     int i;
     for (i = 0; i <= num_search_pts; ++i) {
-      search_site *const ss = &cfg->ss[stage_index][i];
-      ss->mv = ss_mvs[i];
-      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
+      search_site *const site = &cfg->site[stage_index][i];
+      site->mv = search_site_mvs[i];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
     }
     cfg->searches_per_step[stage_index] = num_search_pts;
     cfg->radius[stage_index] = radius;
+    // Update the search radius based on level.
+    if (!level || ((stage_index < 9) && level)) radius /= 2;
     --stage_index;
-    ++ss_count;
+    ++num_search_steps;
   }
-  cfg->ss_count = ss_count;
+  cfg->num_search_steps = num_search_steps;
 }
 
 void av1_init_motion_fpf(search_site_config *cfg, int stride) {
-  int ss_count = 0;
+  int num_search_steps = 0;
   int stage_index = MAX_MVSEARCH_STEPS - 1;
 
-  cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0;
-  cfg->ss[stage_index][0].offset = 0;
+  cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+  cfg->site[stage_index][0].offset = 0;
   cfg->stride = stride;
 
   for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
@@ -315,7 +368,7 @@
     int num_search_pts = 12;
     if (radius == 1) num_search_pts = 8;
 
-    const FULLPEL_MV ss_mvs[13] = {
+    const FULLPEL_MV search_site_mvs[13] = {
       { 0, 0 },
       { -radius, 0 },
       { radius, 0 },
@@ -333,31 +386,35 @@
 
     int i;
     for (i = 0; i <= num_search_pts; ++i) {
-      search_site *const ss = &cfg->ss[stage_index][i];
-      ss->mv = ss_mvs[i];
-      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
+      search_site *const site = &cfg->site[stage_index][i];
+      site->mv = search_site_mvs[i];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
     }
     cfg->searches_per_step[stage_index] = num_search_pts;
     cfg->radius[stage_index] = radius;
     --stage_index;
-    ++ss_count;
+    ++num_search_steps;
   }
-  cfg->ss_count = ss_count;
+  cfg->num_search_steps = num_search_steps;
 }
 
-void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
-  int ss_count = 0;
+// Search site initialization for NSTEP / NSTEP_8PT search methods.
+// level = 0: NSTEP, level = 1: NSTEP_8PT.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+                                        int level) {
+  int num_search_steps = 0;
   int stage_index = 0;
   cfg->stride = stride;
   int radius = 1;
-  for (stage_index = 0; stage_index < 15; ++stage_index) {
+  const int num_stages = (level > 0) ? 16 : 15;
+  for (stage_index = 0; stage_index < num_stages; ++stage_index) {
     int tan_radius = AOMMAX((int)(0.41 * radius), 1);
     int num_search_pts = 12;
-    if (radius <= 5) {
+    if ((radius <= 5) || (level > 0)) {
       tan_radius = radius;
       num_search_pts = 8;
     }
-    const FULLPEL_MV ss_mvs[13] = {
+    const FULLPEL_MV search_site_mvs[13] = {
       { 0, 0 },
       { -radius, 0 },
       { radius, 0 },
@@ -374,17 +431,176 @@
     };
 
     for (int i = 0; i <= num_search_pts; ++i) {
-      search_site *const ss = &cfg->ss[stage_index][i];
-      ss->mv = ss_mvs[i];
-      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
+      search_site *const site = &cfg->site[stage_index][i];
+      site->mv = search_site_mvs[i];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
     }
     cfg->searches_per_step[stage_index] = num_search_pts;
     cfg->radius[stage_index] = radius;
-    ++ss_count;
+    ++num_search_steps;
     if (stage_index < 12)
       radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1);
   }
-  cfg->ss_count = ss_count;
+  cfg->num_search_steps = num_search_steps;
+}
+
+// Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND
+// search methods.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+                                         int level) {
+  (void)level;
+  cfg->stride = stride;
+  // First scale has 4-closest points, the rest have 8 points in diamond
+  // shape at increasing scales
+  static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+    4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  };
+
+  // BIGDIA search method candidates.
+  // Note that the largest candidate step at each scale is 2^scale
+  /* clang-format off */
+  static const FULLPEL_MV
+      site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+          { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 },
+            { 0, 0 }, { 0, 0 } },
+          { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+            { -1, 1 }, { -2, 0 } },
+          { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+            { -2, 2 }, { -4, 0 } },
+          { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+            { -4, 4 }, { -8, 0 } },
+          { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+            { -8, 8 }, { -16, 0 } },
+          { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+            { 0, 32 }, { -16, 16 }, { -32, 0 } },
+          { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+            { 0, 64 }, { -32, 32 }, { -64, 0 } },
+          { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+            { 0, 128 }, { -64, 64 }, { -128, 0 } },
+          { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 },
+            { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } },
+          { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 },
+            { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } },
+          { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+            { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+        };
+
+  /* clang-format on */
+  int radius = 1;
+  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+    cfg->searches_per_step[i] = bigdia_num_candidates[i];
+    cfg->radius[i] = radius;
+    for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+      search_site *const site = &cfg->site[i][j];
+      site->mv = site_candidates[i][j];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
+    }
+    radius *= 2;
+  }
+  cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for SQUARE search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+                                         int level) {
+  (void)level;
+  cfg->stride = stride;
+  // All scales have 8 closest points in square shape.
+  static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  };
+
+  // Square search method candidates.
+  // Note that the largest candidate step at each scale is 2^scale.
+  /* clang-format off */
+    static const FULLPEL_MV
+        square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+             { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+               { -1, 1 }, { -1, 0 } },
+             { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+               { -2, 2 }, { -2, 0 } },
+             { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+               { -4, 4 }, { -4, 0 } },
+             { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+               { -8, 8 }, { -8, 0 } },
+             { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+               { 0, 16 }, { -16, 16 }, { -16, 0 } },
+             { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+               { 0, 32 }, { -32, 32 }, { -32, 0 } },
+             { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+               { 0, 64 }, { -64, 64 }, { -64, 0 } },
+             { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 },
+               { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } },
+             { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 },
+               { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } },
+             { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 },
+               { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } },
+             { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+               { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+    };
+
+  /* clang-format on */
+  int radius = 1;
+  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+    cfg->searches_per_step[i] = square_num_candidates[i];
+    cfg->radius[i] = radius;
+    for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+      search_site *const site = &cfg->site[i][j];
+      site->mv = square_candidates[i][j];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
+    }
+    radius *= 2;
+  }
+  cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for HEX / FAST_HEX search methods.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+                                      int level) {
+  (void)level;
+  cfg->stride = stride;
+  // First scale has 8-closest points, the rest have 6 points in hex shape
+  // at increasing scales.
+  static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+                                                              6, 6, 6, 6, 6 };
+  // Note that the largest candidate step at each scale is 2^scale.
+  /* clang-format off */
+    static const FULLPEL_MV
+        hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+        { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+          { -1, 1 }, { -1, 0 } },
+        { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+        { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+        { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+        { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 },
+          { -8, 16 }, { -16, 0 } },
+        { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+          { -32, 0 } },
+        { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+          { -64, 0 } },
+        { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 },
+          { -64, 128 }, { -128, 0 } },
+        { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 },
+          { -128, 256 }, { -256, 0 } },
+        { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 },
+          { -256, 512 }, { -512, 0 } },
+        { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+          { -512, 1024 }, { -1024, 0 } },
+    };
+
+  /* clang-format on */
+  int radius = 1;
+  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+    cfg->searches_per_step[i] = hex_num_candidates[i];
+    cfg->radius[i] = radius;
+    for (int j = 0; j < hex_num_candidates[i]; ++j) {
+      search_site *const site = &cfg->site[i][j];
+      site->mv = hex_candidates[i][j];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
+    }
+    radius *= 2;
+  }
+  cfg->num_search_steps = MAX_PATTERN_SCALES;
 }
 
 // Checks whether the mv is within range of the mv_limits
@@ -421,11 +637,10 @@
                                  const struct buf_2d *const src,
                                  const uint8_t *const ref_address,
                                  const int ref_stride) {
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const uint8_t *src_buf = src->buf;
   const int src_stride = src->stride;
 
-  return vfp->sdf(src_buf, src_stride, ref_address, ref_stride);
+  return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
 }
 
 static INLINE int get_mvpred_compound_var_cost(
@@ -445,9 +660,9 @@
   int bestsme;
 
   if (mask) {
-    bestsme = vfp->msvf(src_buf, src_stride, 0, 0,
-                        get_buf_from_fullmv(ref, this_mv), ref_stride,
-                        second_pred, mask, mask_stride, invert_mask, &unused);
+    bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+                        src_buf, src_stride, second_pred, mask, mask_stride,
+                        invert_mask, &unused);
   } else if (second_pred) {
     bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
                         src_buf, src_stride, &unused, second_pred);
@@ -481,7 +696,7 @@
   } else if (second_pred) {
     return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred);
   } else {
-    return vfp->sdf(src_buf, src_stride, ref_address, ref_stride);
+    return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
   }
 }
 
@@ -576,44 +791,127 @@
   }
 }
 
-#define CHECK_BETTER                                                      \
-  if (thissad < bestsad) {                                                \
-    int tmp_thissad = thissad;                                            \
-    if (use_mvcost) thissad += mvsad_err_cost_(&this_mv, mv_cost_params); \
-    if (thissad < bestsad) {                                              \
-      raw_bestsad = tmp_thissad;                                          \
-      bestsad = thissad;                                                  \
-      best_site = i;                                                      \
-    }                                                                     \
+// Computes motion vector cost and adds to the sad cost.
+// Then updates the best sad and motion vectors.
+// Inputs:
+//   this_sad: the sad to be evaluated.
+//   mv: the current motion vector.
+//   mv_cost_params: a structure containing information to compute mv cost.
+//   best_sad: the current best sad.
+//   raw_best_sad (optional): the current best sad without calculating mv cost.
+//   best_mv: the current best motion vector.
+//   second_best_mv (optional): the second best motion vector up to now.
+// Modifies:
+//   best_sad, raw_best_sad, best_mv, second_best_mv
+//   If the current sad is lower than the current best sad.
+// Returns:
+//   Whether the input sad (mv) is better than the current best.
+static int update_mvs_and_sad(const unsigned int this_sad, const FULLPEL_MV *mv,
+                              const MV_COST_PARAMS *mv_cost_params,
+                              unsigned int *best_sad,
+                              unsigned int *raw_best_sad, FULLPEL_MV *best_mv,
+                              FULLPEL_MV *second_best_mv) {
+  if (this_sad >= *best_sad) return 0;
+
+  // Add the motion vector cost.
+  const unsigned int sad = this_sad + mvsad_err_cost_(mv, mv_cost_params);
+  if (sad < *best_sad) {
+    if (raw_best_sad) *raw_best_sad = this_sad;
+    *best_sad = sad;
+    if (second_best_mv) *second_best_mv = *best_mv;
+    *best_mv = *mv;
+    return 1;
   }
+  return 0;
+}
+
+// Calculate sad4 and update the bestmv information
+// in FAST_DIAMOND search method.
+static void calc_sad4_update_bestmv(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+    FULLPEL_MV *temp_best_mv, unsigned int *bestsad, unsigned int *raw_bestsad,
+    int search_step, int *best_site, int cand_start) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site *site = ms_params->search_sites->site[search_step];
+
+  unsigned char const *block_offset[4];
+  unsigned int sads[4];
+  const uint8_t *best_address;
+  const uint8_t *src_buf = src->buf;
+  const int src_stride = src->stride;
+  best_address = get_buf_from_fullmv(ref, temp_best_mv);
+  // Loop over number of candidates.
+  for (int j = 0; j < 4; j++)
+    block_offset[j] = site[cand_start + j].offset + best_address;
+
+  // 4-point sad calculation.
+  ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads);
+
+  for (int j = 0; j < 4; j++) {
+    const FULLPEL_MV this_mv = {
+      temp_best_mv->row + site[cand_start + j].mv.row,
+      temp_best_mv->col + site[cand_start + j].mv.col
+    };
+    const int found_better_mv = update_mvs_and_sad(
+        sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+        /*second_best_mv=*/NULL);
+    if (found_better_mv) *best_site = cand_start + j;
+  }
+}
+
+// Calculate sad and update the bestmv information
+// in FAST_DIAMOND search method.
+static void calc_sad_update_bestmv(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+    FULLPEL_MV *temp_best_mv, unsigned int *bestsad, unsigned int *raw_bestsad,
+    int search_step, int *best_site, const int num_candidates, int cand_start) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site *site = ms_params->search_sites->site[search_step];
+  // Loop over number of candidates.
+  for (int i = cand_start; i < num_candidates; i++) {
+    const FULLPEL_MV this_mv = { temp_best_mv->row + site[i].mv.row,
+                                 temp_best_mv->col + site[i].mv.col };
+    if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
+    int thissad = get_mvpred_sad(
+        ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref->stride);
+    const int found_better_mv = update_mvs_and_sad(
+        thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+        /*second_best_mv=*/NULL);
+    if (found_better_mv) *best_site = i;
+  }
+}
 
 // Generic pattern search function that searches over multiple scales.
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
-static int pattern_search(
-    FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-    const int search_param, const int do_init_search,
-    const int num_candidates[MAX_PATTERN_SCALES],
-    const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES],
-    int *cost_list, FULLPEL_MV *best_mv) {
-  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+static int pattern_search(FULLPEL_MV start_mv,
+                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                          int search_step, const int do_init_search,
+                          int *cost_list, FULLPEL_MV *best_mv) {
+  static const int search_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
   int i, s, t;
 
   const struct buf_2d *const src = ms_params->ms_buffers.src;
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site_config *search_sites = ms_params->search_sites;
+  const int *num_candidates = search_sites->searches_per_step;
   const int ref_stride = ref->stride;
   const int last_is_4 = num_candidates[0] == 4;
   int br, bc;
-  int bestsad = INT_MAX, raw_bestsad = INT_MAX;
+  unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX;
   int thissad;
   int k = -1;
-  const int use_mvcost = ms_params->mv_cost_params.mv_cost_type != MV_COST_NONE;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
-  assert(search_param < MAX_MVSEARCH_STEPS);
-  int best_init_s = search_param_to_steps[search_param];
+  search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1);
+  assert(search_step >= 0);
+  int best_init_s = search_steps[search_step];
   // adjust ref_mv to make sure it is within MV range
   clamp_fullmv(&start_mv, &ms_params->mv_limits);
   br = start_mv.row;
@@ -637,23 +935,27 @@
     best_init_s = -1;
     for (t = 0; t <= s; ++t) {
       int best_site = -1;
+      FULLPEL_MV temp_best_mv;
+      temp_best_mv.row = br;
+      temp_best_mv.col = bc;
       if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) {
-        for (i = 0; i < num_candidates[t]; i++) {
-          const FULLPEL_MV this_mv = { br + candidates[t][i].row,
-                                       bc + candidates[t][i].col };
-          thissad = get_mvpred_sad(
-              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-          CHECK_BETTER
+        // Call 4-point sad for multiples of 4 candidates.
+        const int no_of_4_cand_loops = num_candidates[t] >> 2;
+        for (i = 0; i < no_of_4_cand_loops; i++) {
+          calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                  &temp_best_mv, &bestsad, &raw_bestsad, t,
+                                  &best_site, i * 4);
         }
+        // Rest of the candidates
+        const int remaining_cand = num_candidates[t] % 4;
+        calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
+                               &temp_best_mv, &bestsad, &raw_bestsad, t,
+                               &best_site, remaining_cand,
+                               no_of_4_cand_loops * 4);
       } else {
-        for (i = 0; i < num_candidates[t]; i++) {
-          const FULLPEL_MV this_mv = { br + candidates[t][i].row,
-                                       bc + candidates[t][i].col };
-          if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
-          thissad = get_mvpred_sad(
-              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-          CHECK_BETTER
-        }
+        calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
+                               &temp_best_mv, &bestsad, &raw_bestsad, t,
+                               &best_site, num_candidates[t], 0);
       }
       if (best_site == -1) {
         continue;
@@ -663,8 +965,8 @@
       }
     }
     if (best_init_s != -1) {
-      br += candidates[best_init_s][k].row;
-      bc += candidates[best_init_s][k].col;
+      br += search_sites->site[best_init_s][k].mv.row;
+      bc += search_sites->site[best_init_s][k].mv.col;
     }
   }
 
@@ -678,31 +980,34 @@
     for (; s >= last_s; s--) {
       // No need to search all points the 1st time if initial search was used
       if (!do_init_search || s != best_init_s) {
+        FULLPEL_MV temp_best_mv;
+        temp_best_mv.row = br;
+        temp_best_mv.col = bc;
         if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
-          for (i = 0; i < num_candidates[s]; i++) {
-            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
-                                         bc + candidates[s][i].col };
-            thissad = get_mvpred_sad(
-                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+          // Call 4-point sad for multiples of 4 candidates.
+          const int no_of_4_cand_loops = num_candidates[s] >> 2;
+          for (i = 0; i < no_of_4_cand_loops; i++) {
+            calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                    &temp_best_mv, &bestsad, &raw_bestsad, s,
+                                    &best_site, i * 4);
           }
+          // Rest of the candidates
+          const int remaining_cand = num_candidates[s] % 4;
+          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                 &temp_best_mv, &bestsad, &raw_bestsad, s,
+                                 &best_site, remaining_cand,
+                                 no_of_4_cand_loops * 4);
         } else {
-          for (i = 0; i < num_candidates[s]; i++) {
-            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
-                                         bc + candidates[s][i].col };
-            if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
-              continue;
-            thissad = get_mvpred_sad(
-                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
-          }
+          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                 &temp_best_mv, &bestsad, &raw_bestsad, s,
+                                 &best_site, num_candidates[s], 0);
         }
 
         if (best_site == -1) {
           continue;
         } else {
-          br += candidates[s][best_site].row;
-          bc += candidates[s][best_site].col;
+          br += search_sites->site[s][best_site].mv.row;
+          bc += search_sites->site[s][best_site].mv.col;
           k = best_site;
         }
       }
@@ -717,31 +1022,39 @@
         if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const FULLPEL_MV this_mv = {
-              br + candidates[s][next_chkpts_indices[i]].row,
-              bc + candidates[s][next_chkpts_indices[i]].col
+              br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
+              bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
             };
             thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const FULLPEL_MV this_mv = {
-              br + candidates[s][next_chkpts_indices[i]].row,
-              bc + candidates[s][next_chkpts_indices[i]].col
+              br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
+              bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
             };
             if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
               continue;
             thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         }
 
         if (best_site != -1) {
           k = next_chkpts_indices[best_site];
-          br += candidates[s][k].row;
-          bc += candidates[s][k].col;
+          br += search_sites->site[s][k].mv.row;
+          bc += search_sites->site[s][k].mv.col;
         }
       } while (best_site != -1);
     }
@@ -753,27 +1066,35 @@
       if (!do_init_search || s != best_init_s) {
         if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < num_candidates[s]; i++) {
-            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
-                                         bc + candidates[s][i].col };
+            const FULLPEL_MV this_mv = { br + search_sites->site[s][i].mv.row,
+                                         bc + search_sites->site[s][i].mv.col };
             cost_list[i + 1] = thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         } else {
           for (i = 0; i < num_candidates[s]; i++) {
-            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
-                                         bc + candidates[s][i].col };
+            const FULLPEL_MV this_mv = { br + search_sites->site[s][i].mv.row,
+                                         bc + search_sites->site[s][i].mv.col };
             if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
               continue;
             cost_list[i + 1] = thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         }
 
         if (best_site != -1) {
-          br += candidates[s][best_site].row;
-          bc += candidates[s][best_site].col;
+          br += search_sites->site[s][best_site].mv.row;
+          bc += search_sites->site[s][best_site].mv.col;
           k = best_site;
         }
       }
@@ -790,18 +1111,22 @@
         if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const FULLPEL_MV this_mv = {
-              br + candidates[s][next_chkpts_indices[i]].row,
-              bc + candidates[s][next_chkpts_indices[i]].col
+              br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
+              bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
             };
             cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const FULLPEL_MV this_mv = {
-              br + candidates[s][next_chkpts_indices[i]].row,
-              bc + candidates[s][next_chkpts_indices[i]].col
+              br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
+              bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
             };
             if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
               cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
@@ -809,14 +1134,18 @@
             }
             cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         }
 
         if (best_site != -1) {
           k = next_chkpts_indices[best_site];
-          br += candidates[s][k].row;
-          bc += candidates[s][k].col;
+          br += search_sites->site[s][k].mv.row;
+          bc += search_sites->site[s][k].mv.col;
         }
       }
     }
@@ -844,160 +1173,72 @@
   const int var_cost = get_mvpred_var_cost(ms_params, best_mv);
   return var_cost;
 }
-#undef CHECK_BETTER
 
 // For the following foo_search, the input arguments are:
-// x: The struct used to hold a bunch of random configs.
 // start_mv: where we are starting our motion search
-// search_param: how many steps to skip in our motion search. For example,
+// ms_params: a collection of motion search parameters
+// search_step: how many steps to skip in our motion search. For example,
 //   a value 3 suggests that 3 search steps have already taken place prior to
 //   this function call, so we jump directly to step 4 of the search process
-// sad_per_bit: a multiplier used to convert rate to sad cost
 // do_init_search: if on, do an initial search of all possible scales around the
 //   start_mv, and then pick the best scale.
 // cond_list: used to hold the cost around the best full mv so we can use it to
 //   speed up subpel search later.
-// vfp: a function pointer to the simd function so we can compute the cost
-//   efficiently
-// ref_mv: the reference mv used to compute the mv cost
+// best_mv: the best mv found in the motion search
 static int hex_search(const FULLPEL_MV start_mv,
                       const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                      const int search_param, const int do_init_search,
+                      const int search_step, const int do_init_search,
                       int *cost_list, FULLPEL_MV *best_mv) {
-  // First scale has 8-closest points, the rest have 6 points in hex shape
-  // at increasing scales
-  static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
-                                                              6, 6, 6, 6, 6 };
-  // Note that the largest candidate step at each scale is 2^scale
-  /* clang-format off */
-  static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
-    { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 },
-      { -1, 0 } },
-    { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
-    { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
-    { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
-    { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } },
-    { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
-      { -32, 0 } },
-    { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
-      { -64, 0 } },
-    { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 },
-      { -128, 0 } },
-    { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 },
-      { -256, 0 } },
-    { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 },
-      { -512, 0 } },
-    { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
-      { -512, 1024 }, { -1024, 0 } },
-  };
-  /* clang-format on */
-  return pattern_search(start_mv, ms_params, search_param, do_init_search,
-                        hex_num_candidates, hex_candidates, cost_list, best_mv);
+  return pattern_search(start_mv, ms_params, search_step, do_init_search,
+                        cost_list, best_mv);
 }
 
 static int bigdia_search(const FULLPEL_MV start_mv,
                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                         const int search_param, const int do_init_search,
+                         const int search_step, const int do_init_search,
                          int *cost_list, FULLPEL_MV *best_mv) {
-  // First scale has 4-closest points, the rest have 8 points in diamond
-  // shape at increasing scales
-  static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
-    4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  };
-  // Note that the largest candidate step at each scale is 2^scale
-  /* clang-format off */
-  static const MV
-      bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
-        { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
-        { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
-          { -1, 1 }, { -2, 0 } },
-        { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
-          { -2, 2 }, { -4, 0 } },
-        { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
-          { -4, 4 }, { -8, 0 } },
-        { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
-          { -8, 8 }, { -16, 0 } },
-        { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
-          { 0, 32 }, { -16, 16 }, { -32, 0 } },
-        { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
-          { 0, 64 }, { -32, 32 }, { -64, 0 } },
-        { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
-          { 0, 128 }, { -64, 64 }, { -128, 0 } },
-        { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 },
-          { 0, 256 }, { -128, 128 }, { -256, 0 } },
-        { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 },
-          { 0, 512 }, { -256, 256 }, { -512, 0 } },
-        { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
-          { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
-      };
-  /* clang-format on */
-  return pattern_search(start_mv, ms_params, search_param, do_init_search,
-                        bigdia_num_candidates, bigdia_candidates, cost_list,
-                        best_mv);
+  return pattern_search(start_mv, ms_params, search_step, do_init_search,
+                        cost_list, best_mv);
 }
 
 static int square_search(const FULLPEL_MV start_mv,
                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                         const int search_param, const int do_init_search,
+                         const int search_step, const int do_init_search,
                          int *cost_list, FULLPEL_MV *best_mv) {
-  // All scales have 8 closest points in square shape
-  static const int square_num_candidates[MAX_PATTERN_SCALES] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  };
-  // Note that the largest candidate step at each scale is 2^scale
-  /* clang-format off */
-  static const MV
-      square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
-        { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
-          { -1, 1 }, { -1, 0 } },
-        { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
-          { -2, 2 }, { -2, 0 } },
-        { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
-          { -4, 4 }, { -4, 0 } },
-        { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
-          { -8, 8 }, { -8, 0 } },
-        { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
-          { 0, 16 }, { -16, 16 }, { -16, 0 } },
-        { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
-          { 0, 32 }, { -32, 32 }, { -32, 0 } },
-        { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
-          { 0, 64 }, { -64, 64 }, { -64, 0 } },
-        { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 },
-          { 0, 128 }, { -128, 128 }, { -128, 0 } },
-        { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 },
-          { 0, 256 }, { -256, 256 }, { -256, 0 } },
-        { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 },
-          { 0, 512 }, { -512, 512 }, { -512, 0 } },
-        { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
-          { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
-      };
-  /* clang-format on */
-  return pattern_search(start_mv, ms_params, search_param, do_init_search,
-                        square_num_candidates, square_candidates, cost_list,
-                        best_mv);
+  return pattern_search(start_mv, ms_params, search_step, do_init_search,
+                        cost_list, best_mv);
 }
 
 static int fast_hex_search(const FULLPEL_MV start_mv,
                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                           const int search_param, const int do_init_search,
+                           const int search_step, const int do_init_search,
                            int *cost_list, FULLPEL_MV *best_mv) {
   return hex_search(start_mv, ms_params,
-                    AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                    do_init_search, cost_list, best_mv);
+                    AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search,
+                    cost_list, best_mv);
 }
 
 static int fast_dia_search(const FULLPEL_MV start_mv,
                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                           const int search_param, const int do_init_search,
+                           const int search_step, const int do_init_search,
                            int *cost_list, FULLPEL_MV *best_mv) {
   return bigdia_search(start_mv, ms_params,
-                       AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                       AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step),
+                       do_init_search, cost_list, best_mv);
+}
+
+static int fast_bigdia_search(const FULLPEL_MV start_mv,
+                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const int search_step, const int do_init_search,
+                              int *cost_list, FULLPEL_MV *best_mv) {
+  return bigdia_search(start_mv, ms_params,
+                       AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step),
                        do_init_search, cost_list, best_mv);
 }
 
 static int diamond_search_sad(FULLPEL_MV start_mv,
                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                              const int search_param, int *num00,
+                              const int search_step, int *num00,
                               FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
   const struct buf_2d *const src = ms_params->ms_buffers.src;
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
@@ -1005,7 +1246,6 @@
   const int ref_stride = ref->stride;
   const uint8_t *best_address;
 
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const uint8_t *mask = ms_params->ms_buffers.mask;
   const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
@@ -1018,9 +1258,9 @@
 
   clamp_fullmv(&start_mv, &ms_params->mv_limits);
 
-  // search_param determines the length of the initial step and hence the number
+  // search_step determines the length of the initial step and hence the number
   // of iterations.
-  const int tot_steps = cfg->ss_count - search_param;
+  const int tot_steps = cfg->num_search_steps - search_step;
 
   *num00 = 0;
   *best_mv = start_mv;
@@ -1032,16 +1272,16 @@
 
   int next_step_size = tot_steps > 2 ? cfg->radius[tot_steps - 2] : 1;
   for (int step = tot_steps - 1; step >= 0; --step) {
-    const search_site *ss = cfg->ss[step];
+    const search_site *site = cfg->site[step];
     best_site = 0;
     if (step > 0) next_step_size = cfg->radius[step - 1];
 
     int all_in = 1, j;
     // Trap illegal vectors
-    all_in &= best_mv->row + ss[1].mv.row >= ms_params->mv_limits.row_min;
-    all_in &= best_mv->row + ss[2].mv.row <= ms_params->mv_limits.row_max;
-    all_in &= best_mv->col + ss[3].mv.col >= ms_params->mv_limits.col_min;
-    all_in &= best_mv->col + ss[4].mv.col <= ms_params->mv_limits.col_max;
+    all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min;
+    all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max;
+    all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min;
+    all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max;
 
     // TODO(anyone): Implement 4 points search for msdf&sdaf
     if (all_in && !mask && !second_pred) {
@@ -1052,13 +1292,13 @@
         unsigned int sads[4];
 
         for (j = 0; j < 4; j++)
-          block_offset[j] = ss[idx + j].offset + best_address;
+          block_offset[j] = site[idx + j].offset + best_address;
 
-        vfp->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads);
+        ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads);
         for (j = 0; j < 4; j++) {
           if (sads[j] < bestsad) {
-            const FULLPEL_MV this_mv = { best_mv->row + ss[idx + j].mv.row,
-                                         best_mv->col + ss[idx + j].mv.col };
+            const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row,
+                                         best_mv->col + site[idx + j].mv.col };
             unsigned int thissad =
                 sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params);
             if (thissad < bestsad) {
@@ -1070,11 +1310,11 @@
       }
     } else {
       for (int idx = 1; idx <= cfg->searches_per_step[step]; idx++) {
-        const FULLPEL_MV this_mv = { best_mv->row + ss[idx].mv.row,
-                                     best_mv->col + ss[idx].mv.col };
+        const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
+                                     best_mv->col + site[idx].mv.col };
 
         if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
-          const uint8_t *const check_here = ss[idx].offset + best_address;
+          const uint8_t *const check_here = site[idx].offset + best_address;
           unsigned int thissad;
 
           thissad =
@@ -1095,9 +1335,9 @@
       if (second_best_mv) {
         *second_best_mv = *best_mv;
       }
-      best_mv->row += ss[best_site].mv.row;
-      best_mv->col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
+      best_mv->row += site[best_site].mv.row;
+      best_mv->col += site[best_site].mv.col;
+      best_address += site[best_site].offset;
       is_off_center = 1;
     }
 
@@ -1133,7 +1373,7 @@
 
   // If there won't be more n-step search, check to see if refining search is
   // needed.
-  const int further_steps = cfg->ss_count - 1 - step_param;
+  const int further_steps = cfg->num_search_steps - 1 - step_param;
   while (n < further_steps) {
     ++n;
 
@@ -1176,7 +1416,6 @@
                                   const int range, const int step,
                                   FULLPEL_MV *best_mv,
                                   FULLPEL_MV *second_best_mv) {
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   const struct buf_2d *const src = ms_params->ms_buffers.src;
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
@@ -1184,7 +1423,7 @@
   unsigned int best_sad = INT_MAX;
   int r, c, i;
   int start_col, end_col, start_row, end_row;
-  int col_step = (step > 1) ? step : 4;
+  const int col_step = (step > 1) ? step : 4;
 
   assert(step >= 1);
 
@@ -1205,16 +1444,8 @@
         const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
         unsigned int sad = get_mvpred_sad(
             ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
-        if (sad < best_sad) {
-          sad += mvsad_err_cost_(&mv, mv_cost_params);
-          if (sad < best_sad) {
-            best_sad = sad;
-            if (second_best_mv) {
-              *second_best_mv = *best_mv;
-            }
-            *best_mv = mv;
-          }
-        }
+        update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+                           /*raw_best_sad=*/NULL, best_mv, second_best_mv);
       } else {
         // 4 sads in a single call if we are checking every location
         if (c + 3 <= end_col) {
@@ -1224,20 +1455,15 @@
             const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
             addrs[i] = get_buf_from_fullmv(ref, &mv);
           }
-          vfp->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+          ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
 
           for (i = 0; i < 4; ++i) {
             if (sads[i] < best_sad) {
               const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
-              const unsigned int sad =
-                  sads[i] + mvsad_err_cost_(&mv, mv_cost_params);
-              if (sad < best_sad) {
-                best_sad = sad;
-                if (second_best_mv) {
-                  *second_best_mv = *best_mv;
-                }
-                *best_mv = mv;
-              }
+              update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad,
+                                 /*raw_best_sad=*/NULL, best_mv,
+                                 second_best_mv);
             }
           }
         } else {
@@ -1245,16 +1471,8 @@
             const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
             unsigned int sad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
-            if (sad < best_sad) {
-              sad += mvsad_err_cost_(&mv, mv_cost_params);
-              if (sad < best_sad) {
-                best_sad = sad;
-                if (second_best_mv) {
-                  *second_best_mv = *best_mv;
-                }
-                *best_mv = mv;
-              }
-            }
+            update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+                               /*raw_best_sad=*/NULL, best_mv, second_best_mv);
           }
         }
       }
@@ -1295,6 +1513,15 @@
   range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
   range = AOMMIN(range, kMaxRange);
   interval = AOMMAX(interval, range / baseline_interval_divisor);
+  // Use a small search step/interval for certain kind of clips.
+  // For example, screen content clips with a lot of texts.
+  // Large interval could lead to a false matching position, and it can't find
+  // the best global candidate in following iterations due to reduced search
+  // range. The solution here is to use a small search iterval in the beginning
+  // and thus reduces the chance of missing the best candidate.
+  if (ms_params->fine_search_interval) {
+    interval = AOMMIN(interval, 4);
+  }
 
   // initial search
   bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
@@ -1419,10 +1646,6 @@
     MARK_MV_INVALID(second_best_mv);
   }
 
-  assert(ms_params->ms_buffers.second_pred == NULL &&
-         ms_params->ms_buffers.mask == NULL &&
-         "av1_full_pixel_search does not support compound pred");
-
   if (cost_list) {
     cost_list[0] = INT_MAX;
     cost_list[1] = INT_MAX;
@@ -1432,6 +1655,10 @@
   }
 
   switch (search_method) {
+    case FAST_BIGDIA:
+      var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
+                               best_mv);
+      break;
     case FAST_DIAMOND:
       var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
                             best_mv);
@@ -1452,7 +1679,9 @@
           bigdia_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
       break;
     case NSTEP:
+    case NSTEP_8PT:
     case DIAMOND:
+    case CLAMPED_DIAMOND:
       var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
                                best_mv, second_best_mv);
       break;
@@ -1460,12 +1689,13 @@
   }
 
   // Should we allow a follow on exhaustive search?
-  if (!run_mesh_search && search_method == NSTEP) {
-    int exhuastive_thr = ms_params->force_mesh_thresh;
-    exhuastive_thr >>=
+  if (!run_mesh_search &&
+      ((search_method == NSTEP) || (search_method == NSTEP_8PT))) {
+    int exhaustive_thr = ms_params->force_mesh_thresh;
+    exhaustive_thr >>=
         10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
     // Threshold variance for an exhaustive full search.
-    if (var > exhuastive_thr) run_mesh_search = 1;
+    if (var > exhaustive_thr) run_mesh_search = 1;
   }
 
   // TODO(yunqing): the following is used to reduce mesh search in temporal
@@ -1478,6 +1708,40 @@
     }
   }
 
+  if (ms_params->sdf != ms_params->vfp->sdf) {
+    // If we are skipping rows when we perform the motion search, we need to
+    // check the quality of skipping. If it's bad, then we run mesh search with
+    // skip row features off.
+    // TODO(chiyotsai@google.com): Handle the case where we have a vertical
+    // offset of 1 before we hit this statement to avoid having to redo
+    // motion search.
+    const struct buf_2d *src = ms_params->ms_buffers.src;
+    const struct buf_2d *ref = ms_params->ms_buffers.ref;
+    const int src_stride = src->stride;
+    const int ref_stride = ref->stride;
+
+    const uint8_t *src_address = src->buf;
+    const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv);
+    const int sad =
+        ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride);
+    const int skip_sad =
+        ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride);
+    // We will keep the result of skipping rows if it's good enough. Here, good
+    // enough means the error is less than 1 per pixel.
+    const int kSADThresh =
+        1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+    if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) {
+      // There is a large discrepancy between skipping and not skipping, so we
+      // need to redo the motion search.
+      FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
+      new_ms_params.sdf = new_ms_params.vfp->sdf;
+      new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+
+      return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
+                                   cost_list, best_mv, second_best_mv);
+    }
+  }
+
   if (run_mesh_search) {
     int var_ex;
     FULLPEL_MV tmp_mv_ex;
@@ -1836,7 +2100,7 @@
 
 static int obmc_diamond_search_sad(
     const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv,
-    FULLPEL_MV *best_mv, int search_param, int *num00) {
+    FULLPEL_MV *best_mv, int search_step, int *num00) {
   const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
   const search_site_config *cfg = ms_params->search_sites;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
@@ -1844,12 +2108,12 @@
   const int32_t *wsrc = ms_buffers->wsrc;
   const int32_t *mask = ms_buffers->obmc_mask;
   const struct buf_2d *const ref_buf = ms_buffers->ref;
-  // search_param determines the length of the initial step and hence the number
+  // search_step determines the length of the initial step and hence the number
   // of iterations
   // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
   // (MAX_FIRST_STEP/4) pel... etc.
 
-  const int tot_steps = MAX_MVSEARCH_STEPS - 1 - search_param;
+  const int tot_steps = MAX_MVSEARCH_STEPS - 1 - search_step;
   const uint8_t *best_address, *init_ref;
   int best_sad = INT_MAX;
   int best_site = 0;
@@ -1865,13 +2129,13 @@
              mvsad_err_cost_(best_mv, mv_cost_params);
 
   for (step = tot_steps; step >= 0; --step) {
-    const search_site *const ss = cfg->ss[step];
+    const search_site *const site = cfg->site[step];
     best_site = 0;
     for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) {
-      const FULLPEL_MV mv = { best_mv->row + ss[idx].mv.row,
-                              best_mv->col + ss[idx].mv.col };
+      const FULLPEL_MV mv = { best_mv->row + site[idx].mv.row,
+                              best_mv->col + site[idx].mv.col };
       if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
-        int sad = fn_ptr->osdf(best_address + ss[idx].offset, ref_buf->stride,
+        int sad = fn_ptr->osdf(best_address + site[idx].offset, ref_buf->stride,
                                wsrc, mask);
         if (sad < best_sad) {
           sad += mvsad_err_cost_(&mv, mv_cost_params);
@@ -1885,9 +2149,9 @@
     }
 
     if (best_site != 0) {
-      best_mv->row += ss[best_site].mv.row;
-      best_mv->col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
+      best_mv->row += site[best_site].mv.row;
+      best_mv->col += site[best_site].mv.col;
+      best_address += site[best_site].offset;
     } else if (best_address == init_ref) {
       (*num00)++;
     }
@@ -1908,7 +2172,7 @@
 
   // If there won't be more n-step search, check to see if refining search is
   // needed.
-  const int further_steps = cfg->ss_count - 1 - step_param;
+  const int further_steps = cfg->num_search_steps - 1 - step_param;
   if (n > further_steps) do_refine = 0;
 
   while (n < further_steps) {
@@ -2115,14 +2379,19 @@
 // both prediction error and residue into account. It is suffixed "fast" because
 // it uses bilinear filter to estimate the prediction.
 static INLINE unsigned int check_better_fast(
-    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
-    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion, int *has_better_mv) {
+    unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) {
   unsigned int cost;
   if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
     unsigned int sse;
-    int thismse = estimated_pref_error(this_mv, var_params, &sse);
+    int thismse;
+    if (is_scaled) {
+      thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+    } else {
+      thismse = estimated_pref_error(this_mv, var_params, &sse);
+    }
     cost = mv_err_cost_(this_mv, mv_cost_params);
     cost += thismse;
 
@@ -2180,39 +2449,41 @@
 // search in the best quadrant. This uses bilinear filter to speed up the
 // calculation.
 static AOM_FORCE_INLINE MV first_level_check_fast(
-    const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+    int hstep, const SubpelMvLimits *mv_limits,
     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
+    unsigned int *sse1, int *distortion, int is_scaled) {
   // Check the four cardinal directions
   const MV left_mv = { this_mv.row, this_mv.col - hstep };
   int dummy = 0;
-  const unsigned int left =
-      check_better_fast(&left_mv, best_mv, mv_limits, var_params,
-                        mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int left = check_better_fast(
+      xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+      sse1, distortion, &dummy, is_scaled);
 
   const MV right_mv = { this_mv.row, this_mv.col + hstep };
-  const unsigned int right =
-      check_better_fast(&right_mv, best_mv, mv_limits, var_params,
-                        mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int right = check_better_fast(
+      xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, &dummy, is_scaled);
 
   const MV top_mv = { this_mv.row - hstep, this_mv.col };
-  const unsigned int up =
-      check_better_fast(&top_mv, best_mv, mv_limits, var_params, mv_cost_params,
-                        besterr, sse1, distortion, &dummy);
+  const unsigned int up = check_better_fast(
+      xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+      sse1, distortion, &dummy, is_scaled);
 
   const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
-  const unsigned int down =
-      check_better_fast(&bottom_mv, best_mv, mv_limits, var_params,
-                        mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int down = check_better_fast(
+      xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, &dummy, is_scaled);
 
   const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
   const MV diag_mv = { this_mv.row + diag_step.row,
                        this_mv.col + diag_step.col };
 
   // Check the diagonal direction with the best mv
-  check_better_fast(&diag_mv, best_mv, mv_limits, var_params, mv_cost_params,
-                    besterr, sse1, distortion, &dummy);
+  check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+                    mv_cost_params, besterr, sse1, distortion, &dummy,
+                    is_scaled);
 
   return diag_step;
 }
@@ -2220,10 +2491,11 @@
 // Performs a following up search after first_level_check_fast is called. This
 // performs two extra chess pattern searches in the best quadrant.
 static AOM_FORCE_INLINE void second_level_check_fast(
-    const MV this_mv, const MV diag_step, MV *best_mv, int hstep,
-    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step,
+    MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
+    unsigned int *sse1, int *distortion, int is_scaled) {
   assert(diag_step.row == hstep || diag_step.row == -hstep);
   assert(diag_step.col == hstep || diag_step.col == -hstep);
   const int tr = this_mv.row;
@@ -2236,39 +2508,47 @@
     assert(diag_step.row == br - tr);
     const MV chess_mv_1 = { br, bc + diag_step.col };
     const MV chess_mv_2 = { br + diag_step.row, bc };
-    check_better_fast(&chess_mv_1, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
 
-    check_better_fast(&chess_mv_2, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
   } else if (tr == br && tc != bc) {
     assert(diag_step.col == bc - tc);
     // Continue searching in the best direction
     const MV bottom_long_mv = { br + hstep, bc + diag_step.col };
     const MV top_long_mv = { br - hstep, bc + diag_step.col };
-    check_better_fast(&bottom_long_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
-    check_better_fast(&top_long_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+    check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
 
     // Search in the direction opposite of the best quadrant
     const MV rev_mv = { br - diag_step.row, bc };
-    check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params,
-                      besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
   } else if (tr != br && tc == bc) {
     assert(diag_step.row == br - tr);
     // Continue searching in the best direction
     const MV right_long_mv = { br + diag_step.row, bc + hstep };
     const MV left_long_mv = { br + diag_step.row, bc - hstep };
-    check_better_fast(&right_long_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
-    check_better_fast(&left_long_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+    check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
 
     // Search in the direction opposite of the best quadrant
     const MV rev_mv = { br, bc - diag_step.col };
-    check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params,
-                      besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
   }
 }
 
@@ -2276,17 +2556,18 @@
 // searches the four cardinal directions, and perform several
 // diagonal/chess-pattern searches in the best quadrant.
 static AOM_FORCE_INLINE void two_level_checks_fast(
-    const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+    int hstep, const SubpelMvLimits *mv_limits,
     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion, int iters) {
-  const MV diag_step =
-      first_level_check_fast(this_mv, best_mv, hstep, mv_limits, var_params,
-                             mv_cost_params, besterr, sse1, distortion);
+    unsigned int *sse1, int *distortion, int iters, int is_scaled) {
+  const MV diag_step = first_level_check_fast(
+      xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, is_scaled);
   if (iters > 1) {
-    second_level_check_fast(this_mv, diag_step, best_mv, hstep, mv_limits,
-                            var_params, mv_cost_params, besterr, sse1,
-                            distortion);
+    second_level_check_fast(xd, cm, this_mv, diag_step, best_mv, hstep,
+                            mv_limits, var_params, mv_cost_params, besterr,
+                            sse1, distortion, is_scaled);
   }
 }
 
@@ -2334,7 +2615,7 @@
     MV *best_mv, const SubpelMvLimits *mv_limits,
     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
+    unsigned int *sse1, int *distortion, int is_scaled) {
   assert(best_mv->row == this_mv.row + diag_step.row ||
          best_mv->col == this_mv.col + diag_step.col);
   if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
@@ -2365,18 +2646,18 @@
                    mv_cost_params, besterr, sse1, distortion, &has_better_mv);
     }
   } else {
-    check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion,
-                      &has_better_mv);
-    check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion,
-                      &has_better_mv);
+    check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+                      is_scaled);
+    check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+                      is_scaled);
 
     // Do an additional search if the second iteration gives a better mv
     if (has_better_mv) {
-      check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params,
+      check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
                         mv_cost_params, besterr, sse1, distortion,
-                        &has_better_mv);
+                        &has_better_mv, is_scaled);
     }
   }
 }
@@ -2494,80 +2775,18 @@
   return 0;
 }
 
-int av1_find_best_sub_pixel_tree_pruned_evenmore(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
-    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
-  (void)cm;
-  const int allow_hp = ms_params->allow_hp;
-  const int forced_stop = ms_params->forced_stop;
-  const int iters_per_step = ms_params->iters_per_step;
-  const int *cost_list = ms_params->cost_list;
-  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
-  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
-  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
-
-  // The iteration we are current searching for. Iter 0 corresponds to fullpel
-  // mv, iter 1 to half pel, and so on
-  int iter = 0;
-  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
-  unsigned int besterr = INT_MAX;
-  *bestmv = start_mv;
-
-  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
-                               distortion);
-
-  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
-    return INT_MAX;
-  }
-  iter++;
-
-  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
-      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
-      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
-    int ir, ic;
-    int dummy = 0;
-    get_cost_surf_min(cost_list, &ir, &ic, 2);
-    if (ir != 0 || ic != 0) {
-      const MV this_mv = { start_mv.row + 2 * ir, start_mv.col + 2 * ic };
-      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
-                        &besterr, sse1, distortion, &dummy);
-    }
+static AOM_INLINE int setup_center_error_facade(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion,
+    int is_scaled) {
+  if (is_scaled) {
+    return upsampled_setup_center_error(xd, cm, bestmv, var_params,
+                                        mv_cost_params, sse1, distortion);
   } else {
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
-
-    // Each subsequent iteration checks at least one point in common with
-    // the last iteration could be 2 ( if diag selected) 1/4 pel
-    if (forced_stop != HALF_PEL) {
-      if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
-        return INT_MAX;
-      }
-      iter++;
-
-      hstep >>= 1;
-      start_mv = *bestmv;
-      two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                            mv_cost_params, &besterr, sse1, distortion,
-                            iters_per_step);
-    }
+    return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                              distortion);
   }
-
-  if (allow_hp && forced_stop == EIGHTH_PEL) {
-    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
-      return INT_MAX;
-    }
-    iter++;
-
-    hstep >>= 1;
-    start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
-  }
-
-  return besterr;
 }
 
 int av1_find_best_sub_pixel_tree_pruned_more(
@@ -2590,8 +2809,15 @@
   unsigned int besterr = INT_MAX;
   *bestmv = start_mv;
 
-  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
-                               distortion);
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+  besterr = setup_center_error_facade(
+      xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled);
+
+  // If forced_stop is FULL_PEL, return.
+  if (forced_stop == FULL_PEL) return besterr;
 
   if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
     return INT_MAX;
@@ -2607,18 +2833,19 @@
       const MV this_mv = { start_mv.row + ir * hstep,
                            start_mv.col + ic * hstep };
       int dummy = 0;
-      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
-                        &besterr, sse1, distortion, &dummy);
+      check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params,
+                        mv_cost_params, &besterr, sse1, distortion, &dummy,
+                        is_scaled);
     }
   } else {
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   // Each subsequent iteration checks at least one point in common with
   // the last iteration could be 2 ( if diag selected) 1/4 pel
-  if (forced_stop != HALF_PEL) {
+  if (forced_stop < HALF_PEL) {
     if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
       return INT_MAX;
     }
@@ -2626,9 +2853,9 @@
 
     hstep >>= 1;
     start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   if (allow_hp && forced_stop == EIGHTH_PEL) {
@@ -2639,9 +2866,9 @@
 
     hstep >>= 1;
     start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   return besterr;
@@ -2667,8 +2894,16 @@
   unsigned int besterr = INT_MAX;
   *bestmv = start_mv;
 
-  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
-                               distortion);
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+  besterr = setup_center_error_facade(
+      xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled);
+
+  // If forced_stop is FULL_PEL, return.
+  if (forced_stop == FULL_PEL) return besterr;
+
   if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
     return INT_MAX;
   }
@@ -2694,47 +2929,59 @@
 
     switch (whichdir) {
       case 0:  // bottom left quadrant
-        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&bottom_left_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_left_mv, bestmv, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, &dummy, is_scaled);
         break;
       case 1:  // bottom right quadrant
-        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&bottom_right_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_right_mv, bestmv, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, &dummy, is_scaled);
         break;
       case 2:  // top left quadrant
-        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&top_left_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
         break;
       case 3:  // top right quadrant
-        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&top_right_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
         break;
     }
   } else {
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   // Each subsequent iteration checks at least one point in common with
   // the last iteration could be 2 ( if diag selected) 1/4 pel
-  if (forced_stop != HALF_PEL) {
+  if (forced_stop < HALF_PEL) {
     if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
       return INT_MAX;
     }
@@ -2742,9 +2989,9 @@
 
     hstep >>= 1;
     start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   if (allow_hp && forced_stop == EIGHTH_PEL) {
@@ -2755,9 +3002,9 @@
 
     hstep >>= 1;
     start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   return besterr;
@@ -2786,6 +3033,11 @@
 
   *bestmv = start_mv;
 
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+
   if (subpel_search_type != USE_2_TAPS_ORIG) {
     besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
                                            mv_cost_params, sse1, distortion);
@@ -2794,6 +3046,9 @@
                                  distortion);
   }
 
+  // If forced_stop is FULL_PEL, return.
+  if (!round) return besterr;
+
   for (int iter = 0; iter < round; ++iter) {
     MV iter_center_mv = *bestmv;
     if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv,
@@ -2807,16 +3062,16 @@
                                     mv_limits, var_params, mv_cost_params,
                                     &besterr, sse1, distortion);
     } else {
-      diag_step = first_level_check_fast(iter_center_mv, bestmv, hstep,
+      diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep,
                                          mv_limits, var_params, mv_cost_params,
-                                         &besterr, sse1, distortion);
+                                         &besterr, sse1, distortion, is_scaled);
     }
 
     // Check diagonal sub-pixel position
     if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
       second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
                             mv_limits, var_params, mv_cost_params, &besterr,
-                            sse1, distortion);
+                            sse1, distortion, is_scaled);
     }
 
     hstep >>= 1;
@@ -2881,6 +3136,7 @@
   return besterr;
 }
 
+#if !CONFIG_REALTIME_ONLY
 // Computes the cost of the current predictor by going through the whole
 // av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv
 // during motion_mode_rd. We are going through the whole
@@ -2948,9 +3204,10 @@
       if (av1_is_subpelmv_in_range(mv_limits, this_mv)) {
         memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
         memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
-        if (total_samples > 1)
+        if (total_samples > 1) {
           mbmi->num_proj_ref =
               av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+        }
 
         if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
                                  this_mv.row, this_mv.col, &mbmi->wm_params,
@@ -2979,6 +3236,7 @@
   mbmi->num_proj_ref = best_num_proj_ref;
   return bestmse;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 // =============================================================================
 //  Subpixel Motion Search: OBMC
 // =============================================================================
@@ -3324,22 +3582,18 @@
 // =============================================================================
 //  Public cost function: mv_cost + pred error
 // =============================================================================
-int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
-                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV mv = get_mv_from_fullmv(best_mv);
-  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+                       const FULLPEL_MV best_mv,
+                       const aom_variance_fn_ptr_t *vfp,
+                       const struct buf_2d *src, const struct buf_2d *pre) {
+  const MV mv = get_mv_from_fullmv(&best_mv);
   unsigned int sse, var;
 
-  var = vfp->vf(what->buf, what->stride, get_buf_from_fullmv(in_what, best_mv),
-                in_what->stride, &sse);
+  var = vfp->vf(src->buf, src->stride, get_buf_from_fullmv(pre, &best_mv),
+                pre->stride, &sse);
   (void)var;
 
-  return sse + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
-                           CONVERT_TO_CONST_MVCOST(x->mv_cost_stack),
-                           x->errorperbit, mv_cost_type);
+  return sse + mv_err_cost_(&mv, mv_cost_params);
 }
 
 static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params,
@@ -3348,13 +3602,11 @@
                                     const aom_variance_fn_ptr_t *vfp,
                                     const struct buf_2d *src,
                                     const struct buf_2d *pre) {
-  const struct buf_2d *const what = src;
-  const struct buf_2d *const in_what = pre;
   const MV mv = get_mv_from_fullmv(&best_mv);
   unsigned int unused;
 
-  return vfp->svaf(get_buf_from_fullmv(in_what, &best_mv), in_what->stride, 0,
-                   0, what->buf, what->stride, &unused, second_pred) +
+  return vfp->svaf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0,
+                   src->buf, src->stride, &unused, second_pred) +
          mv_err_cost_(&mv, mv_cost_params);
 }
 
@@ -3363,14 +3615,12 @@
     const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
     int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src,
     const struct buf_2d *pre) {
-  const struct buf_2d *const what = src;
-  const struct buf_2d *const in_what = pre;
   const MV mv = get_mv_from_fullmv(&best_mv);
   unsigned int unused;
 
-  return vfp->msvf(what->buf, what->stride, 0, 0,
-                   get_buf_from_fullmv(in_what, &best_mv), in_what->stride,
-                   second_pred, mask, mask_stride, invert_mask, &unused) +
+  return vfp->msvf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0,
+                   src->buf, src->stride, second_pred, mask, mask_stride,
+                   invert_mask, &unused) +
          mv_err_cost_(&mv, mv_cost_params);
 }
 
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 73135d8..8f6085f 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -42,8 +42,9 @@
 } search_site;
 
 typedef struct search_site_config {
-  search_site ss[MAX_MVSEARCH_STEPS * 2][16 + 1];
-  int ss_count;
+  search_site site[MAX_MVSEARCH_STEPS * 2][16 + 1];
+  // Number of search steps.
+  int num_search_steps;
   int searches_per_step[MAX_MVSEARCH_STEPS * 2];
   int radius[MAX_MVSEARCH_STEPS * 2];
   int stride;
@@ -60,21 +61,34 @@
 // =============================================================================
 //  Cost functions
 // =============================================================================
+
+enum {
+  MV_COST_ENTROPY,    // Use the entropy rate of the mv as the cost
+  MV_COST_L1_LOWRES,  // Use the l1 norm of the mv as the cost (<480p)
+  MV_COST_L1_MIDRES,  // Use the l1 norm of the mv as the cost (>=480p)
+  MV_COST_L1_HDRES,   // Use the l1 norm of the mv as the cost (>=720p)
+  MV_COST_NONE        // Use 0 as as cost irrespective of the current mv
+} UENUM1BYTE(MV_COST_TYPE);
+
 typedef struct {
+  // The reference mv used to compute the mv cost
   const MV *ref_mv;
   FULLPEL_MV full_ref_mv;
+  MV_COST_TYPE mv_cost_type;
   const int *mvjcost;
   const int *mvcost[2];
   int error_per_bit;
+  // A multiplier used to convert rate to sad cost
   int sad_per_bit;
-  MV_COST_TYPE mv_cost_type;
 } MV_COST_PARAMS;
 
 int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
                     int *mvcost[2], int weight);
 
-int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
-                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp);
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+                       const FULLPEL_MV best_mv,
+                       const aom_variance_fn_ptr_t *vfp,
+                       const struct buf_2d *src, const struct buf_2d *pre);
 int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params,
                                 const FULLPEL_MV best_mv,
                                 const uint8_t *second_pred, const uint8_t *mask,
@@ -116,23 +130,50 @@
 //  Fullpixel Motion Search
 // =============================================================================
 enum {
+  // Search 8-points in the radius grid around center, up to 11 search stages.
   DIAMOND = 0,
+  // Search 12-points in the radius/tan_radius grid around center,
+  // up to 15 search stages.
   NSTEP = 1,
-  HEX = 2,
-  BIGDIA = 3,
-  SQUARE = 4,
-  FAST_HEX = 5,
-  FAST_DIAMOND = 6
+  // Search 8-points in the radius grid around center, up to 16 search stages.
+  NSTEP_8PT = 2,
+  // Search 8-points in the radius grid around center, upto 11 search stages
+  // with clamping of search radius.
+  CLAMPED_DIAMOND = 3,
+  // Search maximum 8-points in the radius grid around center,
+  // up to 11 search stages. First stage consists of 8 search points
+  // and the rest with 6 search points each in hex shape.
+  HEX = 4,
+  // Search maximum 8-points in the radius grid around center,
+  // up to 11 search stages. First stage consists of 4 search
+  // points and the rest with 8 search points each.
+  BIGDIA = 5,
+  // Search 8-points in the square grid around center, up to 11 search stages.
+  SQUARE = 6,
+  // HEX search with up to 2 stages.
+  FAST_HEX = 7,
+  // BIGDIA search with up to 2 stages.
+  FAST_DIAMOND = 8,
+  // BIGDIA search with up to 3 stages.
+  FAST_BIGDIA = 9,
+  // Total number of search methods.
+  NUM_SEARCH_METHODS,
+  // Number of distinct search methods.
+  NUM_DISTINCT_SEARCH_METHODS = SQUARE + 1,
 } UENUM1BYTE(SEARCH_METHODS);
 
 // This struct holds fullpixel motion search parameters that should be constant
 // during the search
 typedef struct {
   BLOCK_SIZE bsize;
+  // A function pointer to the simd function for fast computation
   const aom_variance_fn_ptr_t *vfp;
 
   MSBuffers ms_buffers;
 
+  // WARNING: search_method should be regarded as a private variable and should
+  // not be modified directly so it is in sync with search_sites. To modify it,
+  // use av1_set_mv_search_method.
   SEARCH_METHODS search_method;
   const search_site_config *search_sites;
   FullMvLimits mv_limits;
@@ -145,26 +186,72 @@
                           // higher than the threshold.
   const struct MESH_PATTERN *mesh_patterns[2];
 
+  // Use maximum search interval of 4 if true. This helps motion search to find
+  // the best motion vector for screen content types.
+  int fine_search_interval;
+
   int is_intra_mode;
 
   int fast_obmc_search;
 
   // For calculating mv cost
   MV_COST_PARAMS mv_cost_params;
+
+  // Stores the function used to compute the sad. This can be different from the
+  // sdf in vfp (e.g. downsampled sad and not sad) to allow speed up.
+  aom_sad_fn_t sdf;
+  aom_sad_multi_d_fn_t sdx4df;
 } FULLPEL_MOTION_SEARCH_PARAMS;
 
-void av1_make_default_fullpel_ms_params(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                                        const struct AV1_COMP *cpi,
-                                        const MACROBLOCK *x, BLOCK_SIZE bsize,
-                                        const MV *ref_mv,
-                                        const search_site_config *search_sites);
+void av1_make_default_fullpel_ms_params(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+    const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+    const search_site_config search_sites[NUM_SEARCH_METHODS],
+    int fine_search_interval);
 
-// Sets up configs for fullpixel diamond search
-void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
-// Sets up configs for firstpass motion search
+// Sets up configs for fullpixel DIAMOND / CLAMPED_DIAMOND search method.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+                                    int level);
+// Sets up configs for firstpass motion search.
 void av1_init_motion_fpf(search_site_config *cfg, int stride);
-// Sets up configs for all other types of motion search
-void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+// Sets up configs for NSTEP / NSTEP_8PT motion search method.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+                                        int level);
+// Sets up configs for BIGDIA / FAST_DIAMOND / FAST_BIGDIA
+// motion search method.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+                                         int level);
+// Sets up configs for HEX or FAST_HEX motion search method.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+                                      int level);
+// Sets up configs for SQUARE motion search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+                                         int level);
+
+// Mv beyond the range do not produce new/different prediction block.
+static INLINE void av1_set_mv_search_method(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const search_site_config search_sites[NUM_SEARCH_METHODS],
+    SEARCH_METHODS search_method) {
+  // Array to inform which all search methods are having
+  // same candidates and different in number of search steps.
+  static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = {
+    DIAMOND,          // DIAMOND
+    NSTEP,            // NSTEP
+    NSTEP_8PT,        // NSTEP_8PT
+    CLAMPED_DIAMOND,  // CLAMPED_DIAMOND
+    HEX,              // HEX
+    BIGDIA,           // BIGDIA
+    SQUARE,           // SQUARE
+    HEX,              // FAST_HEX
+    BIGDIA,           // FAST_DIAMOND
+    BIGDIA            // FAST_BIGDIA
+  };
+
+  ms_params->search_method = search_method;
+  ms_params->search_sites =
+      &search_sites[search_method_lookup[ms_params->search_method]];
+}
 
 // Set up limit values for MV components.
 // Mv beyond the range do not produce new/different prediction block.
@@ -281,7 +368,6 @@
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
-extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore;
 extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
 extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
 extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up;
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index 8db1423..42019ef 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -37,6 +37,84 @@
   return 0;
 }
 
+// Allow more mesh searches for screen content type on the ARF.
+static int use_fine_search_interval(const AV1_COMP *const cpi) {
+  return cpi->is_screen_content_type &&
+         cpi->gf_group.update_type[cpi->gf_group.index] == ARF_UPDATE &&
+         cpi->oxcf.speed <= 2;
+}
+
+// Iterate through the tpl and collect the mvs to be used as candidates
+static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi,
+                                             const MACROBLOCK *x,
+                                             BLOCK_SIZE bsize, int ref,
+                                             cand_mv_t *cand, int *cand_count,
+                                             int *total_cand_weight) {
+  const SuperBlockEnc *sb_enc = &x->sb_enc;
+  if (!sb_enc->tpl_data_count) {
+    return;
+  }
+
+  const AV1_COMMON *cm = &cpi->common;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  const BLOCK_SIZE tpl_bsize =
+      convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
+  const int tplw = mi_size_wide[tpl_bsize];
+  const int tplh = mi_size_high[tpl_bsize];
+  const int nw = mi_size_wide[bsize] / tplw;
+  const int nh = mi_size_high[bsize] / tplh;
+
+  if (nw >= 1 && nh >= 1) {
+    const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
+    const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
+    const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw;
+    int valid = 1;
+
+    // Assign large weight to start_mv, so it is always tested.
+    cand[0].weight = nw * nh;
+
+    for (int k = 0; k < nh; k++) {
+      for (int l = 0; l < nw; l++) {
+        const int_mv mv =
+            sb_enc
+                ->tpl_mv[start + k * sb_enc->tpl_stride + l][ref - LAST_FRAME];
+        if (mv.as_int == INVALID_MV) {
+          valid = 0;
+          break;
+        }
+
+        const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
+                                 GET_MV_RAWPEL(mv.as_mv.col) };
+        int unique = 1;
+        for (int m = 0; m < *cand_count; m++) {
+          if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(cand[m].fmv.row) &&
+              RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(cand[m].fmv.col)) {
+            unique = 0;
+            cand[m].weight++;
+            break;
+          }
+        }
+
+        if (unique) {
+          cand[*cand_count].fmv = fmv;
+          cand[*cand_count].weight = 1;
+          (*cand_count)++;
+        }
+      }
+      if (!valid) break;
+    }
+
+    if (valid) {
+      *total_cand_weight = 2 * nh * nw;
+      if (*cand_count > 2)
+        qsort(cand, *cand_count, sizeof(cand[0]), &compare_weight);
+    }
+  }
+}
+
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                               BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
                               int search_range, inter_mode_info *mode_info,
@@ -53,6 +131,7 @@
       av1_get_scaled_ref_frame(cpi, ref);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
+  const MvCosts *mv_costs = &x->mv_costs;
 
   if (scaled_ref_frame) {
     // Swap out the reference frame for a version that's been scaled to
@@ -80,43 +159,6 @@
     step_param = mv_search_params->mv_step_param;
   }
 
-  if (cpi->sf.mv_sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) {
-    int boffset =
-        2 * (mi_size_wide_log2[cm->seq_params.sb_size] -
-             AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize]));
-    step_param = AOMMAX(step_param, boffset);
-  }
-
-  if (cpi->sf.mv_sf.adaptive_motion_search) {
-    int bwl = mi_size_wide_log2[bsize];
-    int bhl = mi_size_high_log2[bsize];
-    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
-
-    if (tlevel < 5) {
-      step_param += 2;
-      step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1);
-    }
-
-    // prev_mv_sad is not setup for dynamically scaled frames.
-    if (cpi->oxcf.resize_mode != RESIZE_RANDOM) {
-      int i;
-      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
-        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
-          x->pred_mv[ref].row = 0;
-          x->pred_mv[ref].col = 0;
-          best_mv->as_int = INVALID_MV;
-
-          if (scaled_ref_frame) {
-            // Swap back the original buffers before returning.
-            for (int j = 0; j < num_planes; ++j)
-              xd->plane[j].pre[ref_idx] = backup_yv12[j];
-          }
-          return;
-        }
-      }
-    }
-  }
-
   const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
   FULLPEL_MV start_mv;
   if (mbmi->motion_mode != SIMPLE_TRANSLATION)
@@ -125,79 +167,29 @@
     start_mv = get_fullmv_from_mv(&ref_mv);
 
   // cand stores start_mv and all possible MVs in a SB.
-  cand_mv_t cand[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB + 1] = {
-    { { 0, 0 }, 0 }
-  };
+  cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1] = { { { 0, 0 },
+                                                                  0 } };
   cand[0].fmv = start_mv;
   int cnt = 1;
   int total_weight = 0;
 
   if (!cpi->sf.mv_sf.full_pixel_search_level &&
       mbmi->motion_mode == SIMPLE_TRANSLATION) {
-    if (x->valid_cost_b) {
-      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
-      const int tplw = mi_size_wide[tpl_bsize];
-      const int tplh = mi_size_high[tpl_bsize];
-      const int nw = mi_size_wide[bsize] / tplw;
-      const int nh = mi_size_high[bsize] / tplh;
-
-      if (nw >= 1 && nh >= 1) {
-        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
-        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
-        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
-        int valid = 1;
-
-        // Assign large weight to start_mv, so it is always tested.
-        cand[0].weight = nw * nh;
-
-        for (int k = 0; k < nh; k++) {
-          for (int l = 0; l < nw; l++) {
-            const int_mv mv =
-                x->mv_b[start + k * x->cost_stride + l][ref - LAST_FRAME];
-            if (mv.as_int == INVALID_MV) {
-              valid = 0;
-              break;
-            }
-
-            const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
-                                     GET_MV_RAWPEL(mv.as_mv.col) };
-            int unique = 1;
-            for (int m = 0; m < cnt; m++) {
-              if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(cand[m].fmv.row) &&
-                  RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(cand[m].fmv.col)) {
-                unique = 0;
-                cand[m].weight++;
-                break;
-              }
-            }
-
-            if (unique) {
-              cand[cnt].fmv = fmv;
-              cand[cnt].weight = 1;
-              cnt++;
-            }
-          }
-          if (!valid) break;
-        }
-
-        if (valid) {
-          total_weight = 2 * nh * nw;
-          if (cnt > 2) qsort(cand, cnt, sizeof(cand[0]), &compare_weight);
-        }
-      }
-    }
+    get_mv_candidate_from_tpl(cpi, x, bsize, ref, cand, &cnt, &total_weight);
   }
 
   // Further reduce the search range.
   if (search_range < INT_MAX) {
-    const search_site_config *ss_cfg = &mv_search_params->ss_cfg[SS_CFG_SRC];
-    // MAx step_param is ss_cfg->ss_count.
+    const search_site_config *search_site_cfg =
+        &mv_search_params
+             ->search_site_cfg[SS_CFG_SRC][cpi->sf.mv_sf.search_method];
+    // Max step_param is search_site_cfg->num_search_steps.
     if (search_range < 1) {
-      step_param = ss_cfg->ss_count;
+      step_param = search_site_cfg->num_search_steps;
     } else {
-      while (ss_cfg->radius[ss_cfg->ss_count - step_param - 1] >
-                 (search_range << 1) &&
-             ss_cfg->ss_count - step_param - 1 > 0)
+      while (search_site_cfg->radius[search_site_cfg->num_search_steps -
+                                     step_param - 1] > (search_range << 1) &&
+             search_site_cfg->num_search_steps - step_param - 1 > 0)
         step_param++;
     }
   }
@@ -206,17 +198,19 @@
   int_mv second_best_mv;
   best_mv->as_int = second_best_mv.as_int = INVALID_MV;
 
+  // Allow more mesh searches for screen content type on the ARF.
+  const int fine_search_interval = use_fine_search_interval(cpi);
   const search_site_config *src_search_sites =
-      &mv_search_params->ss_cfg[SS_CFG_SRC];
+      mv_search_params->search_site_cfg[SS_CFG_SRC];
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
-                                     src_search_sites);
+                                     src_search_sites, fine_search_interval);
 
   switch (mbmi->motion_mode) {
     case SIMPLE_TRANSLATION: {
+      // Perform a search with the top 2 candidates
       int sum_weight = 0;
-
-      for (int m = 0; m < cnt; m++) {
+      for (int m = 0; m < AOMMIN(2, cnt); m++) {
         FULLPEL_MV smv = cand[m].fmv;
         FULLPEL_MV this_best_mv, this_second_best_mv;
 
@@ -231,7 +225,7 @@
         }
 
         sum_weight += cand[m].weight;
-        if (m >= 2 || 4 * sum_weight > 3 * total_weight) break;
+        if (4 * sum_weight > 3 * total_weight) break;
       }
     } break;
     case OBMC_CAUSAL:
@@ -261,8 +255,8 @@
     this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
     const int ref_mv_idx = mbmi->ref_mv_idx;
     const int this_mv_rate =
-        av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, x->nmv_vec_cost,
-                        x->mv_cost_stack, MV_COST_WEIGHT);
+        av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                        mv_costs->mv_cost_stack, MV_COST_WEIGHT);
     mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
     mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
 
@@ -338,18 +332,14 @@
       default: assert(0 && "Invalid motion mode!\n");
     }
   }
-  *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, x->nmv_vec_cost,
-                             x->mv_cost_stack, MV_COST_WEIGHT);
-
-  if (cpi->sf.mv_sf.adaptive_motion_search &&
-      mbmi->motion_mode == SIMPLE_TRANSLATION)
-    x->pred_mv[ref] = best_mv->as_mv;
+  *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                             mv_costs->mv_cost_stack, MV_COST_WEIGHT);
 }
 
-void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                             BLOCK_SIZE bsize, int_mv *cur_mv,
-                             const uint8_t *mask, int mask_stride,
-                             int *rate_mv) {
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, int_mv *cur_mv,
+                            const uint8_t *mask, int mask_stride,
+                            int *rate_mv) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   const int pw = block_size_wide[bsize];
@@ -361,6 +351,7 @@
   assert(has_second_ref(mbmi));
   const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
   const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+  const MvCosts *mv_costs = &x->mv_costs;
   int_mv ref_mv[2];
   int ite, ref;
 
@@ -442,18 +433,17 @@
     av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv,
                                       &inter_pred_params);
 
-    const int order_idx = id != 0;
-    av1_dist_wtd_comp_weight_assign(
-        cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
-        &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1);
-
     // Do full-pixel compound motion search on the current reference frame.
     if (id) xd->plane[plane].pre[0] = ref_yv12[id];
 
     // Make motion search params
     FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+    const search_site_config *src_search_sites =
+        cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
     av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
-                                       &ref_mv[id].as_mv, NULL);
+                                       &ref_mv[id].as_mv, src_search_sites,
+                                       /*fine_search_interval=*/0);
+
     av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
                              mask_stride, id);
 
@@ -461,14 +451,12 @@
     const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
 
     // Small-range full-pixel motion search.
-    bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
-                                       &best_mv.as_fullmv);
-
-    if (bestsme < INT_MAX) {
-      bestsme = av1_get_mvpred_compound_var(
-          &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask,
-          mask_stride, id, &cpi->fn_ptr[bsize], &x->plane[0].src,
-          &ref_yv12[id]);
+    if (mbmi->interinter_comp.type != COMPOUND_WEDGE) {
+      bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+                                      &best_mv.as_fullmv, NULL);
+    } else {
+      bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
+                                         &best_mv.as_fullmv);
     }
 
     // Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -520,19 +508,21 @@
 
   for (ref = 0; ref < 2; ++ref) {
     const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
-    *rate_mv +=
-        av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmv_vec_cost,
-                        x->mv_cost_stack, MV_COST_WEIGHT);
+    *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+                                mv_costs->nmv_joint_cost,
+                                mv_costs->mv_cost_stack, MV_COST_WEIGHT);
   }
+
+  return AOMMIN(last_besterr[0], last_besterr[1]);
 }
 
 // Search for the best mv for one component of a compound,
 // given that the other component is fixed.
-void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                                       BLOCK_SIZE bsize, MV *this_mv,
-                                       const uint8_t *second_pred,
-                                       const uint8_t *mask, int mask_stride,
-                                       int *rate_mv, int ref_idx) {
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, MV *this_mv,
+                                      const uint8_t *second_pred,
+                                      const uint8_t *mask, int mask_stride,
+                                      int *rate_mv, int ref_idx) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
@@ -540,6 +530,7 @@
   const int ref = mbmi->ref_frame[ref_idx];
   const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
   struct macroblockd_plane *const pd = &xd->plane[0];
+  const MvCosts *mv_costs = &x->mv_costs;
 
   struct buf_2d backup_yv12[MAX_MB_PLANE];
   const YV12_BUFFER_CONFIG *const scaled_ref_frame =
@@ -550,7 +541,6 @@
 
   // Store the first prediction buffer.
   struct buf_2d orig_yv12;
-  struct buf_2d ref_yv12 = pd->pre[ref_idx];
   if (ref_idx) {
     orig_yv12 = pd->pre[0];
     pd->pre[0] = pd->pre[ref_idx];
@@ -575,8 +565,12 @@
 
   // Make motion search params
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  const search_site_config *src_search_sites =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
-                                     &ref_mv.as_mv, NULL);
+                                     &ref_mv.as_mv, src_search_sites,
+                                     /*fine_search_interval=*/0);
+
   av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
                            mask_stride, ref_idx);
 
@@ -584,14 +578,8 @@
   const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
 
   // Small-range full-pixel motion search.
-  bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
-                                     &best_mv.as_fullmv);
-
-  if (bestsme < INT_MAX) {
-    bestsme = av1_get_mvpred_compound_var(
-        &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask,
-        mask_stride, ref_idx, &cpi->fn_ptr[bsize], &x->plane[0].src, &ref_yv12);
-  }
+  bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+                                  &best_mv.as_fullmv, NULL);
 
   if (scaled_ref_frame) {
     // Swap back the original buffers for subpel motion search.
@@ -626,8 +614,9 @@
 
   *rate_mv = 0;
 
-  *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                              x->mv_cost_stack, MV_COST_WEIGHT);
+  *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, mv_costs->nmv_joint_cost,
+                              mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+  return bestsme;
 }
 
 static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi,
@@ -666,15 +655,11 @@
   // Get the prediction block from the 'other' reference frame.
   av1_enc_build_one_inter_predictor(second_pred, pw, other_mv,
                                     &inter_pred_params);
-
-  av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
-                                  &xd->jcp_param.bck_offset,
-                                  &xd->jcp_param.use_dist_wtd_comp_avg, 1);
 }
 
 // Wrapper for av1_compound_single_motion_search, for the common case
 // where the second prediction is also an inter mode.
-void av1_compound_single_motion_search_interinter(
+int av1_compound_single_motion_search_interinter(
     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
     const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) {
   MACROBLOCKD *xd = &x->e_mbd;
@@ -692,8 +677,8 @@
   MV *this_mv = &cur_mv[ref_idx].as_mv;
   const MV *other_mv = &cur_mv[!ref_idx].as_mv;
   build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred);
-  av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred, mask,
-                                    mask_stride, rate_mv, ref_idx);
+  return av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred,
+                                           mask, mask_stride, rate_mv, ref_idx);
 }
 
 static AOM_INLINE void do_masked_motion_search_indexed(
@@ -703,7 +688,7 @@
   // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  BLOCK_SIZE sb_type = mbmi->sb_type;
+  BLOCK_SIZE sb_type = mbmi->bsize;
   const uint8_t *mask;
   const int mask_stride = block_size_wide[bsize];
 
@@ -761,7 +746,7 @@
   set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
 
   MB_MODE_INFO *mbmi = xd->mi[0];
-  mbmi->sb_type = bsize;
+  mbmi->bsize = bsize;
   mbmi->ref_frame[0] = ref;
   mbmi->ref_frame[1] = NONE_FRAME;
   mbmi->motion_mode = SIMPLE_TRANSLATION;
@@ -773,9 +758,12 @@
   struct buf_2d backup_yv12;
   // ref_mv is used to calculate the cost of the motion vector
   const MV ref_mv = kZeroMv;
-  const int step_param = cpi->mv_search_params.mv_step_param;
+  const int step_param =
+      AOMMIN(cpi->mv_search_params.mv_step_param +
+                 cpi->sf.part_sf.simple_motion_search_reduce_search_steps,
+             MAX_MVSEARCH_STEPS - 2);
   const search_site_config *src_search_sites =
-      &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
   int cost_list[5];
   const int ref_idx = 0;
   int var;
@@ -790,9 +778,11 @@
                          num_planes);
   }
 
+  // Allow more mesh searches for screen content type on the ARF.
+  const int fine_search_interval = use_fine_search_interval(cpi);
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
-                                     src_search_sites);
+                                     src_search_sites, fine_search_interval);
 
   var = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                               cond_cost_list(cpi, cost_list),
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index 3b86e93..f0c57a2 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -18,6 +18,8 @@
 extern "C" {
 #endif
 
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_modes_info, which makes this terribly confusing.
 typedef struct {
   int64_t rd;
   int drl_cost;
@@ -34,10 +36,9 @@
                               int search_range, inter_mode_info *mode_info,
                               int_mv *best_mv);
 
-void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                             BLOCK_SIZE bsize, int_mv *cur_mv,
-                             const uint8_t *mask, int mask_stride,
-                             int *rate_mv);
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, int_mv *cur_mv,
+                            const uint8_t *mask, int mask_stride, int *rate_mv);
 
 int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
                                           MACROBLOCK *x,
@@ -45,15 +46,15 @@
                                           const BLOCK_SIZE bsize,
                                           const PREDICTION_MODE this_mode);
 
-void av1_compound_single_motion_search_interinter(
+int av1_compound_single_motion_search_interinter(
     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
     const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx);
 
-void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                                       BLOCK_SIZE bsize, MV *this_mv,
-                                       const uint8_t *second_pred,
-                                       const uint8_t *mask, int mask_stride,
-                                       int *rate_mv, int ref_idx);
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, MV *this_mv,
+                                      const uint8_t *second_pred,
+                                      const uint8_t *mask, int mask_stride,
+                                      int *rate_mv, int ref_idx);
 
 // Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
 // ref. Note that this sets the offset of mbmi, so we will need to reset it
diff --git a/av1/encoder/mv_prec.c b/av1/encoder/mv_prec.c
index 8fcbde9..cc81d72 100644
--- a/av1/encoder/mv_prec.c
+++ b/av1/encoder/mv_prec.c
@@ -224,7 +224,7 @@
   }
 
   // Add texture information
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int num_rows = block_size_high[bsize];
   const int num_cols = block_size_wide[bsize];
   const int y_stride = cpi->source->y_stride;
diff --git a/av1/encoder/mv_prec.h b/av1/encoder/mv_prec.h
index 8df8b96..8564226 100644
--- a/av1/encoder/mv_prec.h
+++ b/av1/encoder/mv_prec.h
@@ -32,15 +32,16 @@
 static AOM_INLINE void av1_set_high_precision_mv(
     AV1_COMP *cpi, int allow_high_precision_mv,
     int cur_frame_force_integer_mv) {
-  MACROBLOCK *const x = &cpi->td.mb;
+  MvCosts *const mv_costs = &cpi->td.mb.mv_costs;
   const int copy_hp = cpi->common.features.allow_high_precision_mv =
       allow_high_precision_mv && !cur_frame_force_integer_mv;
-  x->nmvcost[0] = &x->nmv_costs[0][MV_MAX];
-  x->nmvcost[1] = &x->nmv_costs[1][MV_MAX];
-  x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX];
-  x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX];
-  int *(*src)[2] = copy_hp ? &x->nmvcost_hp : &x->nmvcost;
-  x->mv_cost_stack = *src;
+
+  mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+  mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+  mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+  mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
+  mv_costs->mv_cost_stack =
+      copy_hp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
 }
 
 void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex);
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index a118001..0891323 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -36,6 +36,7 @@
 #include "av1/encoder/reconinter_enc.h"
 
 extern int g_pick_inter_mode_cnt;
+/*!\cond */
 typedef struct {
   uint8_t *data;
   int stride;
@@ -46,10 +47,9 @@
   PRED_BUFFER *best_pred;
   PREDICTION_MODE best_mode;
   TX_SIZE best_tx_size;
-  TX_SIZE best_intra_tx_size;
   MV_REFERENCE_FRAME best_ref_frame;
-  MV_REFERENCE_FRAME best_second_ref_frame;
   uint8_t best_mode_skip_txfm;
+  uint8_t best_mode_initial_skip_flag;
   int_interpfilters best_pred_filter;
 } BEST_PICKMODE;
 
@@ -57,13 +57,16 @@
   MV_REFERENCE_FRAME ref_frame;
   PREDICTION_MODE pred_mode;
 } REF_MODE;
+/*!\endcond */
 
 static const int pos_shift_16x16[4][4] = {
   { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
 };
 
-#define RT_INTER_MODES 9
-static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+#define NUM_INTER_MODES_RT 9
+#define NUM_INTER_MODES_REDUCED 8
+
+static const REF_MODE ref_mode_set_rt[NUM_INTER_MODES_RT] = {
   { LAST_FRAME, NEARESTMV },   { LAST_FRAME, NEARMV },
   { LAST_FRAME, NEWMV },       { GOLDEN_FRAME, NEARESTMV },
   { GOLDEN_FRAME, NEARMV },    { GOLDEN_FRAME, NEWMV },
@@ -71,6 +74,15 @@
   { ALTREF_FRAME, NEWMV }
 };
 
+// GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT
+// mode
+static const REF_MODE ref_mode_set_reduced[NUM_INTER_MODES_REDUCED] = {
+  { LAST_FRAME, GLOBALMV },   { LAST_FRAME, NEARESTMV },
+  { GOLDEN_FRAME, GLOBALMV }, { LAST_FRAME, NEARMV },
+  { LAST_FRAME, NEWMV },      { GOLDEN_FRAME, NEARESTMV },
+  { GOLDEN_FRAME, NEARMV },   { GOLDEN_FRAME, NEWMV }
+};
+
 static const THR_MODES mode_idx[REF_FRAMES][4] = {
   { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
   { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
@@ -108,13 +120,36 @@
   bp->best_mode = NEARESTMV;
   bp->best_ref_frame = LAST_FRAME;
   bp->best_tx_size = TX_8X8;
-  bp->best_intra_tx_size = TX_8X8;
   bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
   bp->best_mode_skip_txfm = 0;
-  bp->best_second_ref_frame = NONE_FRAME;
+  bp->best_mode_initial_skip_flag = 0;
   bp->best_pred = NULL;
 }
 
+/*!\brief Runs Motion Estimation for a specific block and specific ref frame.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by running Motion Estimation for a specific
+ * block and a specific reference frame. Exits early if RDCost of Full Pel part
+ * exceeds best RD Cost fund so far
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    bsize                    Current block size
+ * \param[in]    mi_row                   Row index in 4x4 units
+ * \param[in]    mi_col                   Column index in 4x4 units
+ * \param[in]    tmp_mv                   Pointer to best found New MV
+ * \param[in]    rate_mv                  Pointer to Rate of the best new MV
+ * \param[in]    best_rd_sofar            RD Cost of the best mode found so far
+ * \param[in]    use_base_mv              Flag, indicating that tmp_mv holds
+ *                                        specific MV to start the search with
+ *
+ * \return Returns 0 if ME was terminated after Full Pel Search because too
+ * high RD Cost. Otherwise returns 1. Best New MV is placed into \c tmp_mv.
+ * Rate estimation for this vector is placed to \c rate_mv
+ */
 static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
                                   int_mv *tmp_mv, int *rate_mv,
@@ -124,7 +159,9 @@
   const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *mi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
-  int step_param = cpi->mv_search_params.mv_step_param;
+  int step_param = (cpi->sf.rt_sf.fullpel_search_step_param)
+                       ? cpi->sf.rt_sf.fullpel_search_step_param
+                       : cpi->mv_search_params.mv_step_param;
   FULLPEL_MV start_mv;
   const int ref = mi->ref_frame[0];
   const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
@@ -152,12 +189,12 @@
     center_mv = ref_mv;
   else
     center_mv = tmp_mv->as_mv;
-
   const search_site_config *src_search_sites =
-      &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
-                                     src_search_sites);
+                                     src_search_sites,
+                                     /*fine_search_interval=*/0);
 
   av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                         cond_cost_list(cpi, cost_list), &tmp_mv->as_fullmv,
@@ -166,8 +203,8 @@
   // calculate the bit cost on motion vector
   MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);
 
-  *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->nmv_vec_cost,
-                             x->mv_cost_stack, MV_COST_WEIGHT);
+  *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs.nmv_joint_cost,
+                             x->mv_costs.mv_cost_stack, MV_COST_WEIGHT);
 
   // TODO(kyslov) Account for Rate Mode!
   rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar);
@@ -181,27 +218,59 @@
         xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis,
         &x->pred_sse[ref], NULL);
 
-    *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmv_vec_cost,
-                               x->mv_cost_stack, MV_COST_WEIGHT);
+    *rate_mv =
+        av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs.nmv_joint_cost,
+                        x->mv_costs.mv_cost_stack, MV_COST_WEIGHT);
   }
 
   if (scaled_ref_frame) {
     int i;
     for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
   }
+  // Final MV can not be equal to referance MV as this will trigger assert
+  // later. This can happen if both NEAREST and NEAR modes were skipped
+  rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
   return rv;
 }
 
+/*!\brief Searches for the best New Motion Vector.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by doing Motion Estimation. Uses reduced
+ * complexity ME for non-LAST frames or calls \c combined_motion_search
+ * for LAST reference frame
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    frame_mv                 Array that holds MVs for all modes
+ *                                        and ref frames
+ * \param[in]    ref_frame                Reference freme for which to find
+ *                                        the best New MVs
+ * \param[in]    gf_temporal_ref          Flag, indicating temporal reference
+ *                                        for GOLDEN frame
+ * \param[in]    bsize                    Current block size
+ * \param[in]    mi_row                   Row index in 4x4 units
+ * \param[in]    mi_col                   Column index in 4x4 units
+ * \param[in]    rate_mv                  Pointer to Rate of the best new MV
+ * \param[in]    best_rdc                 Pointer to the RD Cost for the best
+ *                                        mode found so far
+ *
+ * \return Returns -1 if the search was not done, otherwise returns 0.
+ * Best New MV is placed into \c frame_mv array, Rate estimation for this
+ * vector is placed to \c rate_mv
+ */
 static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
                          int_mv frame_mv[][REF_FRAMES],
                          MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
-                         BLOCK_SIZE bsize, int mi_row, int mi_col,
-                         int best_pred_sad, int *rate_mv, RD_STATS *best_rdc) {
+                         BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv,
+                         RD_STATS *best_rdc) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
   AV1_COMMON *cm = &cpi->common;
-  if (ref_frame > LAST_FRAME && gf_temporal_ref &&
-      cpi->oxcf.rc_mode == AOM_CBR) {
+  if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      gf_temporal_ref) {
     int tmp_sad;
     int dis;
     int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
@@ -210,10 +279,9 @@
 
     tmp_sad = av1_int_pro_motion_estimation(
         cpi, x, bsize, mi_row, mi_col,
-        &x->mbmi_ext->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+        &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv);
 
     if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
-    if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1;
 
     frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
     int_mv best_mv = mi->mv[0];
@@ -221,9 +289,9 @@
     best_mv.as_mv.col >>= 3;
     MV ref_mv = av1_get_ref_mv(x, 0).as_mv;
 
-    *rate_mv =
-        av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv,
-                        x->nmv_vec_cost, x->mv_cost_stack, MV_COST_WEIGHT);
+    *rate_mv = av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv,
+                               x->mv_costs.nmv_joint_cost,
+                               x->mv_costs.mv_cost_stack, MV_COST_WEIGHT);
     frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
     frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
 
@@ -244,16 +312,42 @@
   return 0;
 }
 
-static INLINE void find_predictors(
-    AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
-    int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int *ref_frame_skip_mask,
-    const int flag_list[4], TileDataEnc *tile_data,
-    struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
-    int force_skip_low_temp_var) {
+/*!\brief Finds predicted motion vectors for a block.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds predicted motion vectors for a block from a certain reference frame.
+ * First, it fills reference MV stack, then picks the test from the stack and
+ * predicts the final MV for a block for each mode.
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    ref_frame                Reference freme for which to find
+ *                                        ref MVs
+ * \param[in]    frame_mv                 Predicted MVs for a block
+ * \param[in]    tile_data                Pointer to struct holding adaptive
+ *                                        data/contexts/models for the tile
+ *                                        during encoding
+ * \param[in]    yv12_mb                  Buffer to hold predicted block
+ * \param[in]    bsize                    Current block size
+ * \param[in]    force_skip_low_temp_var  Flag indicating possible mode search
+ *                                        prune for low temporal variace  block
+ *
+ * \return Nothing is returned. Instead, predicted MVs are placed into
+ * \c frame_mv array
+ */
+static INLINE void find_predictors(AV1_COMP *cpi, MACROBLOCK *x,
+                                   MV_REFERENCE_FRAME ref_frame,
+                                   int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+                                   TileDataEnc *tile_data,
+                                   struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+                                   BLOCK_SIZE bsize,
+                                   int force_skip_low_temp_var) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
   const int num_planes = av1_num_planes(cm);
   (void)tile_data;
@@ -261,7 +355,8 @@
   x->pred_mv_sad[ref_frame] = INT_MAX;
   frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   // TODO(kyslov) this needs various further optimizations. to be continued..
-  if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+  assert(yv12 != NULL);
+  if (yv12 != NULL) {
     const struct scale_factors *const sf =
         get_ref_scale_factors_const(cm, ref_frame);
     av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
@@ -274,14 +369,13 @@
     av1_find_best_ref_mvs_from_stack(
         cm->features.allow_high_precision_mv, mbmi_ext, ref_frame,
         &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
+    frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame];
     // Early exit for non-LAST frame if force_skip_low_temp_var is set.
     if (!av1_is_scaled(sf) && bsize >= BLOCK_8X8 &&
         !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
       av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
                   bsize);
     }
-  } else {
-    *ref_frame_skip_mask |= (1 << ref_frame);
   }
   av1_count_overlappable_neighbors(cm, xd);
   mbmi->num_proj_ref = 1;
@@ -289,7 +383,8 @@
 
 static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
                                             const MACROBLOCKD *xd,
-                                            const MACROBLOCK *x, int segment_id,
+                                            const ModeCosts *mode_costs,
+                                            int segment_id,
                                             unsigned int *ref_costs_single) {
   int seg_ref_active =
       segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
@@ -297,59 +392,23 @@
     memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
   } else {
     int intra_inter_ctx = av1_get_intra_inter_context(xd);
-    ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
-    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
-
-    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
-      ref_costs_single[i] = base_cost;
-
-    const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
-    const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
-    const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
-    const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
-    const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
-    const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
-
-    // Determine cost of a single ref frame, where frame types are represented
-    // by a tree:
-    // Level 0: add cost whether this ref is a forward or backward ref
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-
-    // Level 1: if this ref is forward ref,
-    // add cost whether it is last/last2 or last3/golden
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
-
-    // Level 1: if this ref is backward ref
-    // then add cost whether this ref is altref or backward ref
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
-    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
-
-    // Level 2: further add cost whether this ref is last or last2
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
-
-    // Level 2: last3 or golden
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
-
-    // Level 2: bwdref or altref2
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];
+    ref_costs_single[INTRA_FRAME] =
+        mode_costs->intra_inter_cost[intra_inter_ctx][0];
+    unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+    ref_costs_single[LAST_FRAME] = base_cost;
+    ref_costs_single[GOLDEN_FRAME] = base_cost;
+    ref_costs_single[ALTREF_FRAME] = base_cost;
+    // add cost for last, golden, altref
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[0][0][0];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][0][1];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][1][0];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][0][1];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][2][0];
   }
 }
 
 static void estimate_comp_ref_frame_costs(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs,
     int segment_id, unsigned int (*ref_costs_comp)[REF_FRAMES]) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
@@ -357,7 +416,7 @@
              REF_FRAMES * sizeof((*ref_costs_comp)[0]));
   } else {
     int intra_inter_ctx = av1_get_intra_inter_context(xd);
-    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+    unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
 
     if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
       // Similar to single ref, determine cost of compound ref frames.
@@ -373,34 +432,42 @@
 
       ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
           ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
-              base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
+              base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
       ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
       ref_bicomp_costs[ALTREF_FRAME] = 0;
 
       // cost of first ref frame
-      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
-      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+      ref_bicomp_costs[LAST_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST2_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST3_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+      ref_bicomp_costs[GOLDEN_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
 
-      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
-      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];
+      ref_bicomp_costs[LAST_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0];
+      ref_bicomp_costs[LAST2_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1];
 
-      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
-      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];
+      ref_bicomp_costs[LAST3_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0];
+      ref_bicomp_costs[GOLDEN_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1];
 
       // cost of second ref frame
       ref_bicomp_costs[BWDREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
       ref_bicomp_costs[ALTREF2_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
       ref_bicomp_costs[ALTREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
 
       ref_bicomp_costs[BWDREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
       ref_bicomp_costs[ALTREF2_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
 
       // cost: if one ref frame is forward ref, the other ref is backward ref
       for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
@@ -415,22 +482,22 @@
       const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
       const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
       ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
       ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
       ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
       ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
     } else {
       for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
         for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
@@ -449,24 +516,27 @@
                                  unsigned int sse) {
   MACROBLOCKD *const xd = &x->e_mbd;
   TX_SIZE tx_size;
-  if (x->tx_mode_search_type == TX_MODE_SELECT) {
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) {
     if (sse > (var << 2))
-      tx_size = AOMMIN(max_txsize_lookup[bsize],
-                       tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+      tx_size =
+          AOMMIN(max_txsize_lookup[bsize],
+                 tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
     else
       tx_size = TX_8X8;
 
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
         cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
       tx_size = TX_8X8;
     else if (tx_size > TX_16X16)
       tx_size = TX_16X16;
   } else {
-    tx_size = AOMMIN(max_txsize_lookup[bsize],
-                     tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+    tx_size =
+        AOMMIN(max_txsize_lookup[bsize],
+               tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
   }
 
-  if (x->tx_mode_search_type != ONLY_4X4 && bsize > BLOCK_32X32)
+  if (txfm_params->tx_mode_search_type != ONLY_4X4 && bsize > BLOCK_32X32)
     tx_size = TX_16X16;
 
   return AOMMIN(tx_size, TX_16X16);
@@ -538,10 +608,8 @@
 
 static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
                                       int mi_row, int mi_col, MACROBLOCK *x,
-                                      MACROBLOCKD *xd, int *out_rate,
-                                      int64_t *out_dist, unsigned int *var_y,
-                                      unsigned int *sse_y, int *early_term,
-                                      int calculate_rd) {
+                                      MACROBLOCKD *xd, RD_STATS *rd_stats,
+                                      int *early_term, int calculate_rd) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -569,8 +637,7 @@
                  4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
   var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
 
-  *var_y = var;
-  *sse_y = sse;
+  rd_stats->sse = sse;
 
   ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
                           cpi->common.height, abs(sum) >> (bw + bh));
@@ -666,27 +733,25 @@
       }
     }
   }
-  if (calculate_rd && out_dist != NULL && out_rate != NULL) {
+  if (calculate_rd) {
     if (!*early_term) {
       const int bwide = block_size_wide[bsize];
       const int bhigh = block_size_high[bsize];
 
       model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh,
-                            out_rate, out_dist);
+                            &rd_stats->rate, &rd_stats->dist);
     }
 
     if (*early_term) {
-      *out_rate = 0;
-      *out_dist = sse << 4;
+      rd_stats->rate = 0;
+      rd_stats->dist = sse << 4;
     }
   }
 }
 
 static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                              MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum,
-                              int64_t *out_dist_sum, int *skip_txfm_sb,
-                              int64_t *skip_sse_sb, unsigned int *var_y,
-                              unsigned int *sse_y, int calculate_rd) {
+                              MACROBLOCK *x, MACROBLOCKD *xd,
+                              RD_STATS *rd_stats, int calculate_rd) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -713,22 +778,41 @@
     rate = INT_MAX;  // this will be overwritten later with block_yrd
     dist = INT_MAX;
   }
-  *var_y = var;
-  *sse_y = sse;
+  rd_stats->sse = sse;
   x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
 
   assert(rate >= 0);
 
-  if (skip_txfm_sb) *skip_txfm_sb = rate == 0;
-  if (skip_sse_sb) *skip_sse_sb = sse << 4;
+  rd_stats->skip_txfm = (rate == 0);
   rate = AOMMIN(rate, INT_MAX);
-  *out_rate_sum = (int)rate;
-  *out_dist_sum = dist;
+  rd_stats->rate = rate;
+  rd_stats->dist = dist;
 }
 
+/*!\brief Calculates RD Cost using Hadamard transform.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost using Hadamard transform. For low bit depth this function
+ * uses low-precision set of functions (16-bit) and 32 bit for high bit depth
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    mi_row         Row index in 4x4 units
+ * \param[in]    mi_col         Column index in 4x4 units
+ * \param[in]    this_rdc       Pointer to calculated RD Cost
+ * \param[in]    skippable      Pointer to a flag indicating possible tx skip
+ * \param[in]    bsize          Current block size
+ * \param[in]    tx_size        Transform size
+ *
+ * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc. \c skippable flag is set if there is no non-zero quantized
+ * coefficients for Hadamard transform
+ */
 static void block_yrd(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
-                      RD_STATS *this_rdc, int *skippable, int64_t *sse,
-                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+                      RD_STATS *this_rdc, int *skippable, BLOCK_SIZE bsize,
+                      TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   const struct macroblockd_plane *pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
@@ -774,11 +858,11 @@
 #if CONFIG_AV1_HIGHBITDEPTH
         tran_low_t *const coeff = p->coeff + block_offset;
         tran_low_t *const qcoeff = p->qcoeff + block_offset;
-        tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+        tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 #else
         int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
         int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
-        int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset;
+        int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;
 #endif
         uint16_t *const eob = &p->eobs[block];
         const int diff_stride = bw;
@@ -807,6 +891,14 @@
                             dqcoeff, p->dequant_QTX, eob, scan_order->scan,
                             scan_order->iscan);
             break;
+          default:
+            assert(tx_size == TX_4X4);
+            aom_fdct4x4(src_diff, coeff, diff_stride);
+            av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
+                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                            scan_order->iscan);
+            break;
 #else
           case TX_16X16:
             aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
@@ -822,25 +914,26 @@
             break;
           default:
             assert(tx_size == TX_4X4);
-            x->fwd_txfm4x4(src_diff, low_coeff, diff_stride);
+            aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
             av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
                             low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                             scan_order->scan);
             break;
 #endif
         }
+        assert(*eob <= 1024);
         *skippable &= (*eob == 0);
         eob_cost += 1;
       }
       block += step;
     }
   }
-  this_rdc->skip = *skippable;
+  this_rdc->skip_txfm = *skippable;
   this_rdc->rate = 0;
-  if (*sse < INT64_MAX) {
-    *sse = (*sse << 6) >> 2;
+  if (this_rdc->sse < INT64_MAX) {
+    this_rdc->sse = (this_rdc->sse << 6) >> 2;
     if (*skippable) {
-      this_rdc->dist = *sse;
+      this_rdc->dist = this_rdc->sse;
       return;
     }
   }
@@ -856,7 +949,7 @@
         int64_t dummy;
         tran_low_t *const coeff = p->coeff + block_offset;
         tran_low_t *const qcoeff = p->qcoeff + block_offset;
-        tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+        tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 
         if (*eob == 1)
           this_rdc->rate += (int)abs(qcoeff[0]);
@@ -868,7 +961,7 @@
 #else
         int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
         int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
-        int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset;
+        int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;
 
         if (*eob == 1)
           this_rdc->rate += (int)abs(low_qcoeff[0]);
@@ -915,20 +1008,22 @@
 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
 #endif  // CONFIG_INTERNAL_STATS
   MACROBLOCKD *const xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
-  ctx->rd_stats.skip = x->force_skip;
+  ctx->rd_stats.skip_txfm = txfm_info->skip_txfm;
+
   memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk);
   memset(ctx->tx_type_map, DCT_DCT,
          sizeof(ctx->tx_type_map[0]) * ctx->num_4x4_blk);
-  ctx->skippable = x->force_skip;
+  ctx->skippable = txfm_info->skip_txfm;
 #if CONFIG_INTERNAL_STATS
   ctx->best_mode_index = mode_index;
 #endif  // CONFIG_INTERNAL_STATS
   ctx->mic = *xd->mi[0];
-  ctx->skippable = x->force_skip;
-  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+  ctx->skippable = txfm_info->skip_txfm;
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
                                       av1_ref_frame_type(xd->mi[0]->ref_frame));
   ctx->comp_pred_diff = 0;
   ctx->hybrid_pred_diff = 0;
@@ -949,10 +1044,10 @@
   if (p != NULL) p->in_use = 0;
 }
 
-static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
                        int16_t mode_context) {
   if (is_inter_compound_mode(mode)) {
-    return x
+    return mode_costs
         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
   }
 
@@ -962,19 +1057,19 @@
   assert(is_inter_mode(mode));
 
   if (mode == NEWMV) {
-    mode_cost = x->newmv_mode_cost[mode_ctx][0];
+    mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
     return mode_cost;
   } else {
-    mode_cost = x->newmv_mode_cost[mode_ctx][1];
+    mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
     mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
 
     if (mode == GLOBALMV) {
-      mode_cost += x->zeromv_mode_cost[mode_ctx][0];
+      mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
       return mode_cost;
     } else {
-      mode_cost += x->zeromv_mode_cost[mode_ctx][1];
+      mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-      mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+      mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
       return mode_cost;
     }
   }
@@ -982,7 +1077,8 @@
 
 static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
                             RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row,
-                            int mv_col, int speed, uint32_t spatial_variance) {
+                            int mv_col, int speed, uint32_t spatial_variance,
+                            CONTENT_STATE_SB content_state_sb) {
   // Bias against MVs associated with NEWMV mode that are very different from
   // top/left neighbors.
   if (this_mode == NEWMV) {
@@ -994,7 +1090,12 @@
     int left_mv_valid = 0;
     int above_row = 0;
     int above_col = 0;
-
+    if (bsize >= BLOCK_64X64 && content_state_sb.source_sad != kHighSad &&
+        spatial_variance < 300 &&
+        (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) {
+      this_rdc->rdcost = this_rdc->rdcost << 2;
+      return;
+    }
     if (xd->above_mbmi) {
       above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV;
       above_row = xd->above_mbmi->mv[0].as_mv.row;
@@ -1035,9 +1136,8 @@
 
 static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
-                               RD_STATS *this_rdc, unsigned int *var_y,
-                               unsigned int *sse_y, int start_plane,
-                               int stop_plane) {
+                               RD_STATS *this_rdc, int64_t *sse_y,
+                               int start_plane, int stop_plane) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -1045,12 +1145,11 @@
   int rate;
   int64_t dist;
   int i;
-  uint32_t tot_var = *var_y;
-  uint32_t tot_sse = *sse_y;
+  int64_t tot_sse = *sse_y;
 
   this_rdc->rate = 0;
   this_rdc->dist = 0;
-  this_rdc->skip = 0;
+  this_rdc->skip_txfm = 0;
 
   for (i = start_plane; i <= stop_plane; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
@@ -1064,7 +1163,6 @@
     var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
                              pd->dst.stride, &sse);
     assert(sse >= var);
-    tot_var += var;
     tot_sse += sse;
 
     av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
@@ -1081,20 +1179,20 @@
   }
 
   if (this_rdc->rate == 0) {
-    this_rdc->skip = 1;
+    this_rdc->skip_txfm = 1;
   }
 
   if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
-      RDCOST(x->rdmult, 0, ((int64_t)tot_sse) << 4)) {
+      RDCOST(x->rdmult, 0, tot_sse << 4)) {
     this_rdc->rate = 0;
     this_rdc->dist = tot_sse << 4;
-    this_rdc->skip = 1;
+    this_rdc->skip_txfm = 1;
   }
 
-  *var_y = tot_var;
   *sse_y = tot_sse;
 }
 
+/*!\cond */
 struct estimate_block_intra_args {
   AV1_COMP *cpi;
   MACROBLOCK *x;
@@ -1102,7 +1200,27 @@
   int skippable;
   RD_STATS *rdc;
 };
+/*!\endcond */
 
+/*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost for an intra mode for a single TX block using Hadamard
+ * transform.
+ * \param[in]    plane          Color plane
+ * \param[in]    block          Index of a TX block in a prediction block
+ * \param[in]    row            Row of a current TX block
+ * \param[in]    col            Column of a current TX block
+ * \param[in]    plane_bsize    Block size of a current prediction block
+ * \param[in]    tx_size        Transform size
+ * \param[in]    arg            Pointer to a structure that holds paramaters
+ *                              for intra mode search
+ *
+ * \return Nothing is returned. Instead, best mode and RD Cost of the best mode
+ * are set in \c args->rdc and \c args->mode
+ */
 static void estimate_block_intra(int plane, int block, int row, int col,
                                  BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                  void *arg) {
@@ -1126,16 +1244,14 @@
   pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
 
   av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+  av1_invalid_rd_stats(&this_rdc);
 
   if (plane == 0) {
-    int64_t this_sse = INT64_MAX;
-    block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, &this_sse, bsize_tx,
+    block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, bsize_tx,
               AOMMIN(tx_size, TX_16X16));
   } else {
-    unsigned int var = 0;
-    unsigned int sse = 0;
-    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane,
-                       plane);
+    int64_t sse = 0;
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &sse, plane, plane);
   }
 
   p->src.buf = src_buf_base;
@@ -1149,14 +1265,18 @@
                                            MV_REFERENCE_FRAME ref_frame,
                                            THR_MODES best_mode_idx,
                                            PREDICTION_MODE mode) {
-  THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
-  int *freq_fact = &x->thresh_freq_fact[bsize][thr_mode_idx];
-  if (thr_mode_idx == best_mode_idx) {
-    *freq_fact -= (*freq_fact >> 4);
-  } else {
-    *freq_fact =
-        AOMMIN(*freq_fact + RD_THRESH_INC,
-               cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+  const THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+  const BLOCK_SIZE min_size = AOMMAX(bsize - 3, BLOCK_4X4);
+  const BLOCK_SIZE max_size = AOMMIN(bsize + 6, BLOCK_128X128);
+  for (BLOCK_SIZE bs = min_size; bs <= max_size; bs += 3) {
+    int *freq_fact = &x->thresh_freq_fact[bs][thr_mode_idx];
+    if (thr_mode_idx == best_mode_idx) {
+      *freq_fact -= (*freq_fact >> 4);
+    } else {
+      *freq_fact =
+          AOMMIN(*freq_fact + RD_THRESH_INC,
+                 cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+    }
   }
 }
 
@@ -1283,51 +1403,75 @@
 }
 
 #define FILTER_SEARCH_SIZE 2
+/*!\brief Searches for the best intrpolation filter
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Iterates through subset of possible interpolation filters (currently
+ * only EIGHTTAP_REGULAR and EIGTHTAP_SMOOTH in both directions) and selects
+ * the one that gives lowest RD cost. RD cost is calculated using curvfit model
+ *
+ * \param[in]    cpi                  Top-level encoder structure
+ * \param[in]    x                    Pointer to structure holding all the
+ *                                    data for the current macroblock
+ * \param[in]    this_rdc             Pointer to calculated RD Cost
+ * \param[in]    mi_row               Row index in 4x4 units
+ * \param[in]    mi_col               Column index in 4x4 units
+ * \param[in]    tmp                  Pointer to a temporary buffer for
+ *                                    prediction re-use
+ * \param[in]    bsize                Current block size
+ * \param[in]    reuse_inter_pred     Flag, indicating prediction re-use
+ * \param[out]   this_mode_pred       Pointer to store prediction buffer
+ *                                    for prediction re-use
+ * \param[out]   this_early_term      Flag, indicating that transform can be
+ *                                    skipped
+ * \param[in]    use_model_yrd_large  Flag, indicating special logic to handle
+ *                                    large blocks
+ *
+ * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc and best filter is placed to \c mi->interp_filters. In case
+ * \c reuse_inter_pred flag is set, this function also ouputs
+ * \c this_mode_pred. Also \c this_early_temp is set if transform can be
+ * skipped
+ */
 static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
                               int mi_row, int mi_col, PRED_BUFFER *tmp,
                               BLOCK_SIZE bsize, int reuse_inter_pred,
-                              PRED_BUFFER **this_mode_pred, unsigned int *var_y,
-                              unsigned int *sse_y, int *this_early_term,
-                              int use_model_yrd_large, int64_t *sse_block_yrd) {
+                              PRED_BUFFER **this_mode_pred,
+                              int *this_early_term, int use_model_yrd_large) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
   MB_MODE_INFO *const mi = xd->mi[0];
   const int bw = block_size_wide[bsize];
-  int pf_rate[FILTER_SEARCH_SIZE] = { 0 };
-  int64_t pf_dist[FILTER_SEARCH_SIZE] = { 0 };
-  unsigned int pf_var[FILTER_SEARCH_SIZE] = { 0 };
-  unsigned int pf_sse[FILTER_SEARCH_SIZE] = { 0 };
-  int64_t pf_sse_block_yrd[FILTER_SEARCH_SIZE] = { 0 };
+  RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE] = { 0 };
   TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE] = { 0 };
   PRED_BUFFER *current_pred = *this_mode_pred;
-  int skip_txfm[FILTER_SEARCH_SIZE] = { 0 };
   int best_skip = 0;
   int best_early_term = 0;
   int64_t best_cost = INT64_MAX;
   int best_filter_index = -1;
   InterpFilter filters[FILTER_SEARCH_SIZE] = { EIGHTTAP_REGULAR,
                                                EIGHTTAP_SMOOTH };
-  int i;
-  for (i = 0; i < FILTER_SEARCH_SIZE; ++i) {
+  for (int i = 0; i < FILTER_SEARCH_SIZE; ++i) {
     int64_t cost;
     InterpFilter filter = filters[i];
     mi->interp_filters = av1_broadcast_interp_filter(filter);
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
     if (use_model_yrd_large)
-      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &pf_rate[i],
-                                &pf_dist[i], &pf_var[i], &pf_sse[i],
-                                this_early_term, 1);
+      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                &pf_rd_stats[i], this_early_term, 1);
     else
-      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[i], &pf_dist[i],
-                        &skip_txfm[i], NULL, &pf_var[i], &pf_sse[i], 1);
-    pf_rate[i] += av1_get_switchable_rate(x, xd, cm->features.interp_filter);
-    cost = RDCOST(x->rdmult, pf_rate[i], pf_dist[i]);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+    pf_rd_stats[i].rate += av1_get_switchable_rate(
+        x, xd, cm->features.interp_filter, cm->seq_params.enable_dual_filter);
+    cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
     pf_tx_size[i] = mi->tx_size;
     if (cost < best_cost) {
       best_filter_index = i;
       best_cost = cost;
-      best_skip = skip_txfm[i];
+      best_skip = pf_rd_stats[i].skip_txfm;
       best_early_term = *this_early_term;
       if (reuse_inter_pred) {
         if (*this_mode_pred != current_pred) {
@@ -1346,12 +1490,10 @@
 
   mi->interp_filters = av1_broadcast_interp_filter(filters[best_filter_index]);
   mi->tx_size = pf_tx_size[best_filter_index];
-  this_rdc->rate = pf_rate[best_filter_index];
-  this_rdc->dist = pf_dist[best_filter_index];
-  *var_y = pf_var[best_filter_index];
-  *sse_y = pf_sse[best_filter_index];
-  *sse_block_yrd = pf_sse_block_yrd[best_filter_index];
-  this_rdc->skip = (best_skip || best_early_term);
+  this_rdc->rate = pf_rd_stats[best_filter_index].rate;
+  this_rdc->dist = pf_rd_stats[best_filter_index].dist;
+  this_rdc->sse = pf_rd_stats[best_filter_index].sse;
+  this_rdc->skip_txfm = (best_skip || best_early_term);
   *this_early_term = best_early_term;
   if (reuse_inter_pred) {
     pd->dst.buf = (*this_mode_pred)->data;
@@ -1415,22 +1557,23 @@
   pd->dst.buf = dst_buf_base;
 }
 
-void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
-                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
   RD_STATS this_rdc, best_rdc;
   struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const TX_SIZE intra_tx_size =
       AOMMIN(max_txsize_lookup[bsize],
-             tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
   int *bmode_costs;
   const MB_MODE_INFO *above_mi = xd->above_mbmi;
   const MB_MODE_INFO *left_mi = xd->left_mbmi;
   const PREDICTION_MODE A = av1_above_block_mode(above_mi);
   const PREDICTION_MODE L = av1_left_block_mode(left_mi);
-  bmode_costs = x->y_mode_costs[A][L];
+  bmode_costs = x->mode_costs.y_mode_costs[A][L];
 
   av1_invalid_rd_stats(&best_rdc);
   av1_invalid_rd_stats(&this_rdc);
@@ -1450,9 +1593,9 @@
     av1_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
                                            &args);
     if (args.skippable) {
-      this_rdc.rate = av1_cost_symbol(av1_get_skip_cdf(xd)[1]);
+      this_rdc.rate = av1_cost_symbol(av1_get_skip_txfm_cdf(xd)[1]);
     } else {
-      this_rdc.rate += av1_cost_symbol(av1_get_skip_cdf(xd)[0]);
+      this_rdc.rate += av1_cost_symbol(av1_get_skip_txfm_cdf(xd)[0]);
     }
     this_rdc.rate += bmode_costs[this_mode];
     this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
@@ -1472,95 +1615,407 @@
 #endif  // CONFIG_INTERNAL_STATS
 }
 
+static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) {
+  struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
+  struct scale_factors *const sf_golden =
+      get_ref_scale_factors(cm, GOLDEN_FRAME);
+  return ((sf_last->x_scale_fp == sf_golden->x_scale_fp) &&
+          (sf_last->y_scale_fp == sf_golden->y_scale_fp));
+}
+
+static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
+                                              MB_MODE_INFO *mi, int mi_row,
+                                              int mi_col, int bsize,
+                                              int gf_temporal_ref,
+                                              int use_ref_frame[],
+                                              int *force_skip_low_temp_var) {
+  AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+
+  // For SVC the usage of alt_ref is determined by the ref_frame_flags.
+  int use_alt_ref_frame = cpi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame;
+  int use_golden_ref_frame = 1;
+
+  use_ref_frame[LAST_FRAME] = 1;  // we never skip LAST
+
+  if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
+    use_golden_ref_frame = 0;
+  }
+
+  if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
+      x->nonrd_prune_ref_frame_search) {
+    if (is_small_sb)
+      *force_skip_low_temp_var = get_force_skip_low_temp_var_small_sb(
+          &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+    else
+      *force_skip_low_temp_var = get_force_skip_low_temp_var(
+          &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+    // If force_skip_low_temp_var is set, skip golden reference.
+    if (*force_skip_low_temp_var) {
+      use_golden_ref_frame = 0;
+      use_alt_ref_frame = 0;
+    }
+  }
+
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+    use_golden_ref_frame = 1;
+    use_alt_ref_frame = 0;
+  }
+
+  use_alt_ref_frame =
+      cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
+  use_golden_ref_frame =
+      cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;
+
+  use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
+  use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
+}
+
+/*!\brief Estimates best intra mode for inter mode search
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ *
+ * Using heuristics based on best inter mode, block size, and other decides
+ * whether to check intra modes. If so, estimates and selects best intra mode
+ * from the reduced set of intra modes (max 4 intra modes checked)
+ *
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    bsize                    Current block size
+ * \param[in]    use_modeled_non_rd_cost  Flag, indicating usage of curvfit
+ *                                        model for RD cost
+ * \param[in]    best_early_term          Flag, indicating that TX for the
+ *                                        best inter mode was skipped
+ * \param[in]    ref_cost_intra           Cost of signalling intra mode
+ * \param[in]    reuse_prediction         Flag, indicating prediction re-use
+ * \param[in]    orig_dst                 Original destination buffer
+ * \param[in]    tmp_buffers              Pointer to a temporary buffers for
+ *                                        prediction re-use
+ * \param[out]   this_mode_pred           Pointer to store prediction buffer
+ *                                        for prediction re-use
+ * \param[in]    best_rdc                 Pointer to RD cost for the best
+ *                                        selected intra mode
+ * \param[in]    best_pickmode            Pointer to a structure containing
+ *                                        best mode picked so far
+ *
+ * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \c best_rdc and best selected mode is placed to \c best_pickmode
+ */
+static void estimate_intra_mode(
+    AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int use_modeled_non_rd_cost,
+    int best_early_term, unsigned int ref_cost_intra, int reuse_prediction,
+    struct buf_2d *orig_dst, PRED_BUFFER *tmp_buffers,
+    PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+    BEST_PICKMODE *best_pickmode) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const unsigned char segment_id = mi->segment_id;
+  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+
+  const CommonQuantParams *quant_params = &cm->quant_params;
+
+  RD_STATS this_rdc;
+
+  int intra_cost_penalty = av1_get_intra_cost_penalty(
+      quant_params->base_qindex, quant_params->y_dc_delta_q,
+      cm->seq_params.bit_depth);
+  int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+  int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
+
+  int do_early_exit_rdthresh = 1;
+
+  uint32_t spatial_var_thresh = 50;
+  int motion_thresh = 32;
+  // Adjust thresholds to make intra mode likely tested if the other
+  // references (golden, alt) are skipped/not checked. For now always
+  // adjust for svc mode.
+  if (cpi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
+                       cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) {
+    spatial_var_thresh = 150;
+    motion_thresh = 0;
+  }
+
+  // Some adjustments to checking intra mode based on source variance.
+  if (x->source_variance < spatial_var_thresh) {
+    // If the best inter mode is large motion or non-LAST ref reduce intra cost
+    // penalty, so intra mode is more likely tested.
+    if (best_pickmode->best_ref_frame != LAST_FRAME ||
+        abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+        abs(mi->mv[0].as_mv.col) >= motion_thresh) {
+      intra_cost_penalty = intra_cost_penalty >> 2;
+      inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+      do_early_exit_rdthresh = 0;
+    }
+    // For big blocks worth checking intra (since only DC will be checked),
+    // even if best_early_term is set.
+    if (bsize >= BLOCK_32X32) best_early_term = 0;
+  } else if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
+             x->content_state_sb.source_sad == kLowSad) {
+    perform_intra_pred = 0;
+  }
+
+  if (cpi->sf.rt_sf.skip_intra_pred_if_tx_skip && best_rdc->skip_txfm &&
+      best_pickmode->best_mode_initial_skip_flag) {
+    perform_intra_pred = 0;
+  }
+
+  if (!(best_rdc->rdcost == INT64_MAX ||
+        (perform_intra_pred && !best_early_term &&
+         best_rdc->rdcost > inter_mode_thresh &&
+         bsize <= cpi->sf.part_sf.max_intra_bsize))) {
+    return;
+  }
+
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  TX_SIZE intra_tx_size = AOMMIN(
+      AOMMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+      TX_16X16);
+
+  PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+  if (reuse_prediction && best_pred != NULL) {
+    const int bh = block_size_high[bsize];
+    const int bw = block_size_wide[bsize];
+    if (best_pred->data == orig_dst->buf) {
+      *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)];
+      aom_convolve_copy(best_pred->data, best_pred->stride,
+                        (*this_mode_pred)->data, (*this_mode_pred)->stride, bw,
+                        bh);
+      best_pickmode->best_pred = *this_mode_pred;
+    }
+  }
+  pd->dst = *orig_dst;
+
+  for (int i = 0; i < 4; ++i) {
+    const PREDICTION_MODE this_mode = intra_mode_list[i];
+    const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+    const int64_t mode_rd_thresh = rd_threshes[mode_index];
+
+    if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
+      continue;
+
+    if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh,
+                            rd_thresh_freq_fact[mode_index]) &&
+        (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
+      continue;
+    }
+    const BLOCK_SIZE uv_bsize = get_plane_block_size(
+        bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+
+    mi->mode = this_mode;
+    mi->ref_frame[0] = INTRA_FRAME;
+    mi->ref_frame[1] = NONE_FRAME;
+
+    av1_invalid_rd_stats(&this_rdc);
+    args.mode = this_mode;
+    args.skippable = 1;
+    args.rdc = &this_rdc;
+    mi->tx_size = intra_tx_size;
+    compute_intra_yprediction(cm, this_mode, bsize, x, xd);
+    // Look into selecting tx_size here, based on prediction residual.
+    if (use_modeled_non_rd_cost)
+      model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, 1);
+    else
+      block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, bsize,
+                mi->tx_size);
+    // TODO(kyslov@) Need to account for skippable
+    if (x->color_sensitivity[0]) {
+      av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
+                                             estimate_block_intra, &args);
+    }
+    if (x->color_sensitivity[1]) {
+      av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2,
+                                             estimate_block_intra, &args);
+    }
+
+    int mode_cost = 0;
+    if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
+      mode_cost +=
+          x->mode_costs.angle_delta_cost[this_mode - V_PRED]
+                                        [MAX_ANGLE_DELTA +
+                                         mi->angle_delta[PLANE_TYPE_Y]];
+    }
+    if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+      mode_cost += x->mode_costs.filter_intra_cost[bsize][0];
+    }
+    this_rdc.rate += ref_cost_intra;
+    this_rdc.rate += intra_cost_penalty;
+    this_rdc.rate += mode_cost;
+    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+    if (this_rdc.rdcost < best_rdc->rdcost) {
+      *best_rdc = this_rdc;
+      best_pickmode->best_mode = this_mode;
+      best_pickmode->best_tx_size = mi->tx_size;
+      best_pickmode->best_ref_frame = INTRA_FRAME;
+      mi->uv_mode = this_mode;
+      mi->mv[0].as_int = INVALID_MV;
+      mi->mv[1].as_int = INVALID_MV;
+    }
+  }
+  mi->tx_size = best_pickmode->best_tx_size;
+}
+
+static AOM_INLINE int is_filter_search_enabled(const AV1_COMP *cpi, int mi_row,
+                                               int mi_col, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int enable_filter_search = 0;
+
+  if (cpi->sf.rt_sf.use_nonrd_filter_search) {
+    enable_filter_search = 1;
+    if (cpi->sf.interp_sf.cb_pred_filter_search) {
+      const int bsl = mi_size_wide_log2[bsize];
+      enable_filter_search =
+          (((mi_row + mi_col) >> bsl) +
+           get_chessboard_index(cm->current_frame.frame_number)) &
+          0x1;
+    }
+  }
+  return enable_filter_search;
+}
+
+static AOM_INLINE int skip_mode_by_threshold(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv,
+    int frames_since_golden, const int *const rd_threshes,
+    const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip,
+    int extra_shift) {
+  int skip_this_mode = 0;
+  const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)];
+  int64_t mode_rd_thresh =
+      best_skip ? ((int64_t)rd_threshes[mode_index]) << (extra_shift + 1)
+                : ((int64_t)rd_threshes[mode_index]) << extra_shift;
+
+  // Increase mode_rd_thresh value for non-LAST for improved encoding
+  // speed
+  if (ref_frame != LAST_FRAME) {
+    mode_rd_thresh = mode_rd_thresh << 1;
+    if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4)
+      mode_rd_thresh = mode_rd_thresh << (extra_shift + 1);
+  }
+
+  if (rd_less_than_thresh(best_cost, mode_rd_thresh,
+                          rd_thresh_freq_fact[mode_index]))
+    if (mv.as_int != 0) skip_this_mode = 1;
+
+  return skip_this_mode;
+}
+
+static AOM_INLINE int skip_mode_by_low_temp(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+    CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) {
+  // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var
+  // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+  // later.
+  if (force_skip_low_temp_var && ref_frame != LAST_FRAME && mv.as_int != 0) {
+    return 1;
+  }
+
+  if (content_state_sb.source_sad != kHighSad && bsize >= BLOCK_64X64 &&
+      force_skip_low_temp_var && mode == NEWMV) {
+    return 1;
+  }
+  return 0;
+}
+
+static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+    int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
+  const unsigned int thresh_skip_golden = 500;
+
+  if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden &&
+      mode == NEWMV)
+    return 1;
+
+  if (bsize == BLOCK_128X128 && mode == NEWMV) return 1;
+
+  // Skip testing non-LAST if this flag is set.
+  if (extra_prune) {
+    if (extra_prune > 1 && ref_frame != LAST_FRAME &&
+        (bsize > BLOCK_64X64 || (bsize > BLOCK_16X16 && mode == NEWMV)))
+      return 1;
+
+    if (ref_frame != LAST_FRAME && mode == NEARMV) return 1;
+
+    if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1;
+  }
+  return 0;
+}
+
 void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                   MACROBLOCK *x, RD_STATS *rd_cost,
-                                  BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far) {
+                                  BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
 
   BEST_PICKMODE best_pickmode;
-  int inter_mode_mask[BLOCK_SIZES];
 #if COLLECT_PICK_MODE_STAT
   static mode_search_stat ms_stat;
 #endif
   MV_REFERENCE_FRAME ref_frame;
-  MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
   int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
   uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
-  struct buf_2d yv12_mb[8][MAX_MB_PLANE];
-  static const int flag_list[8] = { 0, AOM_LAST_FLAG, 0, 0, AOM_GOLD_FLAG, 0,
-                                    0, AOM_ALT_FLAG };
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
   RD_STATS this_rdc, best_rdc;
-  // var_y and sse_y are saved to be used in skipping checking
-  unsigned int sse_y = UINT_MAX;
-  unsigned int var_y = UINT_MAX;
-  const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
+  const unsigned char segment_id = mi->segment_id;
+  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
-  InterpFilter filter_ref;
-  int ref_frame_skip_mask = 0;
-  int best_pred_sad = INT_MAX;
+  const InterpFilter filter_ref = cm->features.interp_filter;
   int best_early_term = 0;
   unsigned int ref_costs_single[REF_FRAMES],
       ref_costs_comp[REF_FRAMES][REF_FRAMES];
   int force_skip_low_temp_var = 0;
-  int skip_ref_find_pred[8] = { 0 };
+  int use_ref_frame_mask[REF_FRAMES] = { 0 };
   unsigned int sse_zeromv_norm = UINT_MAX;
-  const unsigned int thresh_skip_golden = 500;
-  int gf_temporal_ref = 0;
-  const struct segmentation *const seg = &cm->seg;
-  int num_inter_modes = RT_INTER_MODES;
-  unsigned char segment_id = mi->segment_id;
+  const int num_inter_modes = cpi->sf.rt_sf.nonrd_agressive_skip
+                                  ? NUM_INTER_MODES_REDUCED
+                                  : NUM_INTER_MODES_RT;
+  const REF_MODE *const ref_mode_set = cpi->sf.rt_sf.nonrd_agressive_skip
+                                           ? ref_mode_set_reduced
+                                           : ref_mode_set_rt;
   PRED_BUFFER tmp[4];
   DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]);
   PRED_BUFFER *this_mode_pred = NULL;
   const int reuse_inter_pred =
       cpi->sf.rt_sf.reuse_inter_pred_nonrd && cm->seq_params.bit_depth == 8;
+
   const int bh = block_size_high[bsize];
   const int bw = block_size_wide[bsize];
   const int pixels_in_block = bh * bw;
   struct buf_2d orig_dst = pd->dst;
   const CommonQuantParams *quant_params = &cm->quant_params;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
 #if COLLECT_PICK_MODE_STAT
   aom_usec_timer_start(&ms_stat.timer2);
 #endif
-  int intra_cost_penalty = av1_get_intra_cost_penalty(
-      quant_params->base_qindex, quant_params->y_dc_delta_q,
-      cm->seq_params.bit_depth);
-  int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
-  const int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
-  int use_modeled_non_rd_cost = 0;
-  int enable_filter_search = 0;
-  InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
+  const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
   int64_t thresh_sad_pred = INT64_MAX;
-
-  (void)best_rd_so_far;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  int use_modeled_non_rd_cost = 0;
 
   init_best_pickmode(&best_pickmode);
 
-  for (int i = 0; i < BLOCK_SIZES; ++i) inter_mode_mask[i] = INTER_ALL;
+  const ModeCosts *mode_costs = &x->mode_costs;
 
-  // TODO(kyslov) Move this to Speed Features
-  inter_mode_mask[BLOCK_128X128] = INTER_NEAREST_NEAR;
-
-  struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
-  struct scale_factors *const sf_golden =
-      get_ref_scale_factors(cm, GOLDEN_FRAME);
-  gf_temporal_ref = 1;
-  // For temporal long term prediction, check that the golden reference
-  // is same scale as last reference, otherwise disable.
-  if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) ||
-      (sf_last->y_scale_fp != sf_golden->y_scale_fp)) {
-    gf_temporal_ref = 0;
-  }
-
-  av1_collect_neighbors_ref_counts(xd);
-
-  estimate_single_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single);
+  estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id,
+                                  ref_costs_single);
   if (cpi->sf.rt_sf.use_comp_ref_nonrd)
-    estimate_comp_ref_frame_costs(cm, xd, x, segment_id, ref_costs_comp);
+    estimate_comp_ref_frame_costs(cm, xd, mode_costs, segment_id,
+                                  ref_costs_comp);
 
   memset(&mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);
   if (reuse_inter_pred) {
@@ -1574,66 +2029,26 @@
     tmp[3].in_use = 0;
   }
 
-  x->force_skip = 0;
-
-  // Instead of using av1_get_pred_context_switchable_interp(xd) to assign
-  // filter_ref, we use a less strict condition on assigning filter_ref.
-  // This is to reduce the probabily of entering the flow of not assigning
-  // filter_ref and then skip filter search.
-  filter_ref = cm->features.interp_filter;
+  txfm_info->skip_txfm = 0;
 
   // initialize mode decisions
   av1_invalid_rd_stats(&best_rdc);
   av1_invalid_rd_stats(&this_rdc);
   av1_invalid_rd_stats(rd_cost);
-  mi->sb_type = bsize;
+  mi->bsize = bsize;
   mi->ref_frame[0] = NONE_FRAME;
   mi->ref_frame[1] = NONE_FRAME;
 
-  usable_ref_frame =
-      cpi->sf.rt_sf.use_nonrd_altref_frame ? ALTREF_FRAME : GOLDEN_FRAME;
+  const int gf_temporal_ref = is_same_gf_and_last_scale(cm);
 
-  if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
-    skip_ref_find_pred[GOLDEN_FRAME] = 1;
-    if (!cpi->sf.rt_sf.use_nonrd_altref_frame) usable_ref_frame = LAST_FRAME;
-  }
-
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
-  if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
-      x->nonrd_prune_ref_frame_search) {
-    if (is_small_sb)
-      force_skip_low_temp_var = get_force_skip_low_temp_var_small_sb(
-          &x->variance_low[0], mi_row, mi_col, bsize);
-    else
-      force_skip_low_temp_var = get_force_skip_low_temp_var(
-          &x->variance_low[0], mi_row, mi_col, bsize);
-    // If force_skip_low_temp_var is set, skip golden reference.
-    if (force_skip_low_temp_var) {
-      usable_ref_frame = LAST_FRAME;
-    }
-  }
-
-  // If the segment reference frame feature is enabled and it's set to GOLDEN
-  // reference, then make sure we don't skip checking GOLDEN, this is to
-  // prevent possibility of not picking any mode.
-  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
-      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
-    usable_ref_frame = GOLDEN_FRAME;
-    skip_ref_find_pred[GOLDEN_FRAME] = 0;
-  }
+  get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
+                         use_ref_frame_mask, &force_skip_low_temp_var);
 
   for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME;
-       ref_frame_iter <= usable_ref_frame; ++ref_frame_iter) {
-    // Skip find_predictor if the reference frame is not in the
-    // ref_frame_flags (i.e., not used as a reference for this frame).
-    skip_ref_find_pred[ref_frame_iter] =
-        !(cpi->ref_frame_flags & flag_list[ref_frame_iter]);
-    if (!skip_ref_find_pred[ref_frame_iter]) {
-      find_predictors(cpi, x, ref_frame_iter, frame_mv, &ref_frame_skip_mask,
-                      flag_list, tile_data, yv12_mb, bsize,
-                      force_skip_low_temp_var);
+       ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
+    if (use_ref_frame_mask[ref_frame_iter]) {
+      find_predictors(cpi, x, ref_frame_iter, frame_mv, tile_data, yv12_mb,
+                      bsize, force_skip_low_temp_var);
     }
   }
 
@@ -1644,54 +2059,48 @@
 
   const int large_block = bsize >= BLOCK_32X32;
   const int use_model_yrd_large =
-      cpi->oxcf.rc_mode == AOM_CBR && large_block &&
+      cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block &&
       !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
       quant_params->base_qindex && cm->seq_params.bit_depth == 8;
 
+  const int enable_filter_search =
+      is_filter_search_enabled(cpi, mi_row, mi_col, bsize);
+
+  // TODO(marpan): Look into reducing these conditions. For now constrain
+  // it to avoid significant bdrate loss.
+  if (cpi->sf.rt_sf.use_modeled_non_rd_cost) {
+    if (cpi->svc.non_reference_frame)
+      use_modeled_non_rd_cost = 1;
+    else if (cpi->svc.number_temporal_layers > 1 &&
+             cpi->svc.temporal_layer_id == 0)
+      use_modeled_non_rd_cost = 0;
+    else
+      use_modeled_non_rd_cost =
+          (quant_params->base_qindex > 120 && x->source_variance > 100 &&
+           bsize <= BLOCK_16X16 && !x->content_state_sb.lighting_change &&
+           x->content_state_sb.source_sad != kHighSad);
+  }
+
 #if COLLECT_PICK_MODE_STAT
   ms_stat.num_blocks[bsize]++;
 #endif
   init_mbmi(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
-  mi->tx_size =
-      AOMMIN(AOMMIN(max_txsize_lookup[bsize],
-                    tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
-             TX_16X16);
-
-  // TODO(marpan): Look into reducing these conditions. For now constrain
-  // it to avoid significant bdrate loss.
-  if (cpi->sf.rt_sf.use_modeled_non_rd_cost &&
-      quant_params->base_qindex > 120 && x->source_variance > 100 &&
-      bsize <= BLOCK_16X16 && x->content_state_sb != kLowVarHighSumdiff &&
-      x->content_state_sb != kHighSad)
-    use_modeled_non_rd_cost = 1;
-
-  if (cpi->sf.rt_sf.use_nonrd_filter_search) {
-    enable_filter_search = 1;
-    if (cpi->sf.interp_sf.cb_pred_filter_search) {
-      const int bsl = mi_size_wide_log2[bsize];
-      enable_filter_search =
-          (((mi_row + mi_col) >> bsl) +
-           get_chessboard_index(cm->current_frame.frame_number)) &
-          0x1;
-    }
-    if (x->source_variance <=
-        cpi->sf.interp_sf.disable_filter_search_var_thresh)
-      enable_filter_search = 0;
-  }
+  mi->tx_size = AOMMIN(
+      AOMMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+      TX_16X16);
 
   for (int idx = 0; idx < num_inter_modes; ++idx) {
+    const struct segmentation *const seg = &cm->seg;
+
     int rate_mv = 0;
-    int mode_rd_thresh;
-    int mode_index;
-    int64_t this_sse;
     int is_skippable;
     int this_early_term = 0;
     int skip_this_mv = 0;
-    int comp_pred = 0;
-    int force_mv_inter_layer = 0;
     PREDICTION_MODE this_mode;
-    MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-    second_ref_frame = NONE_FRAME;
+    MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+    RD_STATS nonskip_rdc;
+    av1_invalid_rd_stats(&nonskip_rdc);
 
     this_mode = ref_mode_set[idx].pred_mode;
     ref_frame = ref_mode_set[idx].ref_frame;
@@ -1703,8 +2112,7 @@
     mi->mode = this_mode;
     mi->ref_frame[0] = ref_frame;
 
-    if (ref_frame > usable_ref_frame) continue;
-    if (skip_ref_find_pred[ref_frame]) continue;
+    if (!use_ref_frame_mask[ref_frame]) continue;
 
     // Skip non-zero motion for SVC if skip_nonzeromv_ref is set.
     if (cpi->use_svc && frame_mv[this_mode][ref_frame].as_int != 0) {
@@ -1716,93 +2124,56 @@
 
     // If the segment reference frame feature is enabled then do nothing if the
     // current ref frame is not allowed.
-    if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
-        get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
       continue;
 
-    if (ref_frame != LAST_FRAME && cpi->oxcf.rc_mode == AOM_CBR &&
-        sse_zeromv_norm < thresh_skip_golden && this_mode == NEWMV)
+    if (skip_mode_by_bsize_and_ref_frame(
+            this_mode, ref_frame, bsize, x->nonrd_prune_ref_frame_search,
+            sse_zeromv_norm, cpi->sf.rt_sf.nonrd_agressive_skip))
       continue;
 
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
-
-    if (!(inter_mode_mask[bsize] & (1 << this_mode))) continue;
-
-    // Skip testing non-LAST if this flag is set.
-    if (x->nonrd_prune_ref_frame_search) {
-      if (x->nonrd_prune_ref_frame_search > 1 && ref_frame != LAST_FRAME &&
-          (bsize > BLOCK_64X64 || (bsize > BLOCK_16X16 && this_mode == NEWMV)))
-        continue;
-
-      if (ref_frame != LAST_FRAME && this_mode == NEARMV) continue;
-    }
-
-    // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var
-    // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
-    // later.
-    if (!force_mv_inter_layer && force_skip_low_temp_var &&
-        ref_frame != LAST_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) {
+    if (skip_mode_by_low_temp(this_mode, ref_frame, bsize, x->content_state_sb,
+                              frame_mv[this_mode][ref_frame],
+                              force_skip_low_temp_var))
       continue;
-    }
-
-#if 0
-        if (x->content_state_sb != kVeryHighSad &&
-        (cpi->sf.short_circuit_low_temp_var >= 2 ||
-        (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64))
-        && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode ==
-            NEWMV)  {
-          continue;
-        }
-#endif
 
     // Disable this drop out case if the ref frame segment level feature is
     // enabled for this segment. This is to prevent the possibility that we
     // end up unable to pick any mode.
-    if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
+    if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
       // Check for skipping GOLDEN and ALTREF based pred_mv_sad.
       if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0 &&
           x->pred_mv_sad[ref_frame] != INT_MAX && ref_frame != LAST_FRAME) {
-        if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred)
-          ref_frame_skip_mask |= (1 << ref_frame);
+        if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred) continue;
       }
-      if (ref_frame_skip_mask & (1 << ref_frame)) continue;
     }
 
+    if (skip_mode_by_threshold(
+            this_mode, ref_frame, frame_mv[this_mode][ref_frame],
+            cpi->rc.frames_since_golden, rd_threshes, rd_thresh_freq_fact,
+            best_rdc.rdcost, best_pickmode.best_mode_skip_txfm,
+            (cpi->sf.rt_sf.nonrd_agressive_skip ? 1 : 0)))
+      continue;
+
     // Select prediction reference frames.
     for (int i = 0; i < MAX_MB_PLANE; i++) {
       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
     }
 
     mi->ref_frame[0] = ref_frame;
-    mi->ref_frame[1] = second_ref_frame;
-    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+    mi->ref_frame[1] = NONE_FRAME;
+    set_ref_ptrs(cm, xd, ref_frame, NONE_FRAME);
 
-    mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
-    mode_rd_thresh = best_pickmode.best_mode_skip_txfm
-                         ? rd_threshes[mode_index] << 1
-                         : rd_threshes[mode_index];
-
-    // Increase mode_rd_thresh value for non-LAST for improved encoding
-    // speed
-    if (ref_frame != LAST_FRAME) {
-      mode_rd_thresh = mode_rd_thresh << 1;
-      if (ref_frame == GOLDEN_FRAME && cpi->rc.frames_since_golden > 4)
-        mode_rd_thresh = mode_rd_thresh << 1;
-    }
-
-    if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
-                            rd_thresh_freq_fact[mode_index]))
-      if (frame_mv[this_mode][ref_frame].as_int != 0) continue;
-
-    if (this_mode == NEWMV && !force_mv_inter_layer) {
+    if (this_mode == NEWMV) {
       if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize,
-                        mi_row, mi_col, best_pred_sad, &rate_mv, &best_rdc))
+                        mi_row, mi_col, &rate_mv, &best_rdc))
         continue;
     }
 
     for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
          inter_mv_mode++) {
-      if (inter_mv_mode == this_mode || comp_pred) continue;
+      if (inter_mv_mode == this_mode) continue;
       if (mode_checked[inter_mv_mode][ref_frame] &&
           frame_mv[this_mode][ref_frame].as_int ==
               frame_mv[inter_mv_mode][ref_frame].as_int) {
@@ -1832,8 +2203,8 @@
         ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) &&
         (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) {
       search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
-                        reuse_inter_pred, &this_mode_pred, &var_y, &sse_y,
-                        &this_early_term, use_model_yrd_large, &this_sse);
+                        reuse_inter_pred, &this_mode_pred, &this_early_term,
+                        use_model_yrd_large);
     } else {
       mi->interp_filters =
           (filter_ref == SWITCHABLE)
@@ -1841,89 +2212,90 @@
               : av1_broadcast_interp_filter(filter_ref);
       av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
       if (use_model_yrd_large) {
-        model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, NULL, NULL,
-                                  &var_y, &sse_y, &this_early_term,
-                                  use_modeled_non_rd_cost);
+        model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &this_rdc,
+                                  &this_early_term, use_modeled_non_rd_cost);
       } else {
-        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
-                          &this_rdc.skip, NULL, &var_y, &sse_y,
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc,
                           use_modeled_non_rd_cost);
       }
     }
 
     if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) {
       sse_zeromv_norm =
-          sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+          (unsigned int)(this_rdc.sse >> (b_width_log2_lookup[bsize] +
+                                          b_height_log2_lookup[bsize]));
     }
 
-    const int skip_ctx = av1_get_skip_context(xd);
-    const int skip_cost = x->skip_cost[skip_ctx][1];
-    const int no_skip_cost = x->skip_cost[skip_ctx][0];
-    if (!this_early_term) {
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
+    const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1];
+    const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0];
+    if (this_early_term) {
+      this_rdc.skip_txfm = 1;
+      this_rdc.rate = skip_txfm_cost;
+      this_rdc.dist = this_rdc.sse << 4;
+    } else {
       if (use_modeled_non_rd_cost) {
-        if (this_rdc.skip) {
-          this_rdc.rate = skip_cost;
+        if (this_rdc.skip_txfm) {
+          this_rdc.rate = skip_txfm_cost;
         } else {
-          this_rdc.rate += no_skip_cost;
+          this_rdc.rate += no_skip_txfm_cost;
         }
       } else {
-        this_sse = (int64_t)sse_y;
-        block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, &this_sse,
-                  bsize, mi->tx_size);
-        if (this_rdc.skip) {
-          this_rdc.rate = skip_cost;
-        } else {
-          if (RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >=
-              RDCOST(x->rdmult, 0,
-                     this_sse)) {  // this_sse already multiplied by 16 in
-                                   // block_yrd
-            this_rdc.skip = 1;
-            this_rdc.rate = skip_cost;
-            this_rdc.dist = this_sse;
-          } else {
-            this_rdc.rate += no_skip_cost;
+        block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, bsize,
+                  mi->tx_size);
+        if (this_rdc.skip_txfm ||
+            RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >=
+                RDCOST(x->rdmult, 0, this_rdc.sse)) {
+          if (!this_rdc.skip_txfm) {
+            // Need to store "real" rdc for possible furure use if UV rdc
+            // disallows tx skip
+            nonskip_rdc = this_rdc;
+            nonskip_rdc.rate += no_skip_txfm_cost;
           }
+          this_rdc.rate = skip_txfm_cost;
+          this_rdc.skip_txfm = 1;
+          this_rdc.dist = this_rdc.sse;
+        } else {
+          this_rdc.rate += no_skip_txfm_cost;
         }
       }
-    } else {
-      this_rdc.skip = 1;
-      this_rdc.rate = skip_cost;
-      this_rdc.dist = sse_y << 4;
-    }
-
-    if (!this_early_term &&
-        (x->color_sensitivity[0] || x->color_sensitivity[1])) {
-      RD_STATS rdc_uv;
-      const BLOCK_SIZE uv_bsize = get_plane_block_size(
-          bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
-      if (x->color_sensitivity[0]) {
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                      AOM_PLANE_U, AOM_PLANE_U);
+      if ((x->color_sensitivity[0] || x->color_sensitivity[1])) {
+        RD_STATS rdc_uv;
+        const BLOCK_SIZE uv_bsize = get_plane_block_size(
+            bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+        if (x->color_sensitivity[0]) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                        AOM_PLANE_U, AOM_PLANE_U);
+        }
+        if (x->color_sensitivity[1]) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                        AOM_PLANE_V, AOM_PLANE_V);
+        }
+        model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &this_rdc.sse, 1, 2);
+        // Restore Y rdc if UV rdc disallows txfm skip
+        if (this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
+            nonskip_rdc.rate != INT_MAX)
+          this_rdc = nonskip_rdc;
+        this_rdc.rate += rdc_uv.rate;
+        this_rdc.dist += rdc_uv.dist;
+        this_rdc.skip_txfm = this_rdc.skip_txfm && rdc_uv.skip_txfm;
       }
-      if (x->color_sensitivity[1]) {
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                      AOM_PLANE_V, AOM_PLANE_V);
-      }
-      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
-      this_rdc.rate += rdc_uv.rate;
-      this_rdc.dist += rdc_uv.dist;
-      this_rdc.skip = this_rdc.skip && rdc_uv.skip;
     }
 
     // TODO(kyslov) account for UV prediction cost
     this_rdc.rate += rate_mv;
     const int16_t mode_ctx =
         av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
-    this_rdc.rate += cost_mv_ref(x, this_mode, mode_ctx);
+    this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx);
 
     this_rdc.rate += ref_costs_single[ref_frame];
 
     this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-    if (cpi->oxcf.rc_mode == AOM_CBR) {
+    if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
       newmv_diff_bias(xd, this_mode, &this_rdc, bsize,
                       frame_mv[this_mode][ref_frame].as_mv.row,
                       frame_mv[this_mode][ref_frame].as_mv.col, cpi->speed,
-                      x->source_variance);
+                      x->source_variance, x->content_state_sb);
     }
 
     mode_checked[this_mode][ref_frame] = 1;
@@ -1939,8 +2311,10 @@
       best_pickmode.best_pred_filter = mi->interp_filters;
       best_pickmode.best_tx_size = mi->tx_size;
       best_pickmode.best_ref_frame = ref_frame;
-      best_pickmode.best_mode_skip_txfm = this_rdc.skip;
-      best_pickmode.best_second_ref_frame = second_ref_frame;
+      best_pickmode.best_mode_skip_txfm = this_rdc.skip_txfm;
+      best_pickmode.best_mode_initial_skip_flag =
+          (nonskip_rdc.rate == INT_MAX && this_rdc.skip_txfm);
+
       if (reuse_inter_pred) {
         free_pred_buffer(best_pickmode.best_pred);
         best_pickmode.best_pred = this_mode_pred;
@@ -1948,8 +2322,8 @@
     } else {
       if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
     }
-    if (best_early_term && idx > 0) {
-      x->force_skip = 1;
+    if (best_early_term && (idx > 0 || cpi->sf.rt_sf.nonrd_agressive_skip)) {
+      txfm_info->skip_txfm = 1;
       break;
     }
   }
@@ -1961,8 +2335,6 @@
   mi->ref_frame[0] = best_pickmode.best_ref_frame;
   mi->mv[0].as_int =
       frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
-  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
-  x->force_skip = best_rdc.skip;
 
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
@@ -1970,137 +2342,15 @@
   mi->angle_delta[PLANE_TYPE_UV] = 0;
   mi->filter_intra_mode_info.use_filter_intra = 0;
 
-  uint32_t spatial_var_thresh = 50;
-  int motion_thresh = 32;
-  // Adjust thresholds to make intra mode likely tested if the other
-  // references (golden, alt) are skipped/not checked.
-  if (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
-      cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0) {
-    spatial_var_thresh = 150;
-    motion_thresh = 0;
-  }
-  int do_early_exit_rdthresh = 1;
-  // Some adjustments to checking intra mode based on source variance.
-  if (x->source_variance < spatial_var_thresh) {
-    // If the best inter mode is large motion or non-LAST ref reduce intra cost
-    // penalty, so intra mode is more likely tested.
-    if (best_pickmode.best_ref_frame != LAST_FRAME ||
-        abs(mi->mv[0].as_mv.row) >= motion_thresh ||
-        abs(mi->mv[0].as_mv.col) >= motion_thresh) {
-      intra_cost_penalty = intra_cost_penalty >> 2;
-      inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
-      do_early_exit_rdthresh = 0;
-    }
-    // For big blocks worth checking intra (since only DC will be checked),
-    // even if best_early_term is set.
-    if (bsize >= BLOCK_32X32) best_early_term = 0;
-  }
-
-  if (best_rdc.rdcost == INT64_MAX ||
-      (perform_intra_pred && !best_early_term &&
-       best_rdc.rdcost > inter_mode_thresh &&
-       bsize <= cpi->sf.part_sf.max_intra_bsize)) {
-    int64_t this_sse = INT64_MAX;
-    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
-    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
-    TX_SIZE intra_tx_size =
-        AOMMIN(AOMMIN(max_txsize_lookup[bsize],
-                      tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
-               TX_16X16);
-
-    if (reuse_inter_pred && best_pred != NULL) {
-      if (best_pred->data == orig_dst.buf) {
-        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
-        aom_convolve_copy(best_pred->data, best_pred->stride,
-                          this_mode_pred->data, this_mode_pred->stride, 0, 0, 0,
-                          0, bw, bh);
-        best_pickmode.best_pred = this_mode_pred;
-      }
-    }
-    pd->dst = orig_dst;
-
-    for (int i = 0; i < 4; ++i) {
-      const PREDICTION_MODE this_mode = intra_mode_list[i];
-      const THR_MODES mode_index =
-          mode_idx[INTRA_FRAME][mode_offset(this_mode)];
-      const int mode_rd_thresh = rd_threshes[mode_index];
-
-      // Only check DC for blocks >= 32X32.
-      if (this_mode > 0 && bsize >= BLOCK_32X32) continue;
-
-      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
-                              rd_thresh_freq_fact[mode_index]) &&
-          (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
-        continue;
-      }
-      const BLOCK_SIZE uv_bsize = get_plane_block_size(
-          bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
-
-      mi->mode = this_mode;
-      mi->ref_frame[0] = INTRA_FRAME;
-      mi->ref_frame[1] = NONE_FRAME;
-
-      this_rdc.dist = this_rdc.rate = 0;
-      args.mode = this_mode;
-      args.skippable = 1;
-      args.rdc = &this_rdc;
-      mi->tx_size = intra_tx_size;
-      compute_intra_yprediction(cm, this_mode, bsize, x, xd);
-      // Look into selecting tx_size here, based on prediction residual.
-      if (use_modeled_non_rd_cost)
-        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
-                          &this_rdc.skip, NULL, &var_y, &sse_y, 1);
-      else
-        block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, &this_sse,
-                  bsize, mi->tx_size);
-      // TODO(kyslov@) Need to account for skippable
-      if (x->color_sensitivity[0]) {
-        av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
-                                               estimate_block_intra, &args);
-      }
-      if (x->color_sensitivity[1]) {
-        av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2,
-                                               estimate_block_intra, &args);
-      }
-
-      int mode_cost = 0;
-      if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
-        mode_cost += x->angle_delta_cost[this_mode - V_PRED]
-                                        [MAX_ANGLE_DELTA +
-                                         mi->angle_delta[PLANE_TYPE_Y]];
-      }
-      if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
-        mode_cost += x->filter_intra_cost[bsize][0];
-      }
-      this_rdc.rate += ref_costs_single[INTRA_FRAME];
-      this_rdc.rate += intra_cost_penalty;
-      this_rdc.rate += mode_cost;
-      this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-
-      if (this_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = this_rdc;
-        best_pickmode.best_mode = this_mode;
-        best_pickmode.best_intra_tx_size = mi->tx_size;
-        best_pickmode.best_ref_frame = INTRA_FRAME;
-        best_pickmode.best_second_ref_frame = NONE_FRAME;
-        mi->uv_mode = this_mode;
-        mi->mv[0].as_int = INVALID_MV;
-        mi->mv[1].as_int = INVALID_MV;
-      }
-    }
-
-    // Reset mb_mode_info to the best inter mode.
-    if (best_pickmode.best_ref_frame != INTRA_FRAME) {
-      mi->tx_size = best_pickmode.best_tx_size;
-    } else {
-      mi->tx_size = best_pickmode.best_intra_tx_size;
-    }
-  }
+  estimate_intra_mode(cpi, x, bsize, use_modeled_non_rd_cost, best_early_term,
+                      ref_costs_single[INTRA_FRAME], reuse_inter_pred,
+                      &orig_dst, tmp, &this_mode_pred, &best_rdc,
+                      &best_pickmode);
 
   pd->dst = orig_dst;
   mi->mode = best_pickmode.best_mode;
   mi->ref_frame[0] = best_pickmode.best_ref_frame;
-  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+  txfm_info->skip_txfm = best_rdc.skip_txfm;
 
   if (!is_inter_block(mi)) {
     mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
@@ -2110,7 +2360,7 @@
     PRED_BUFFER *const best_pred = best_pickmode.best_pred;
     if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
       aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
-                        pd->dst.stride, 0, 0, 0, 0, bw, bh);
+                        pd->dst.stride, bw, bh);
     }
   }
   if (cpi->sf.inter_sf.adaptive_rd_thresh) {
@@ -2124,13 +2374,10 @@
                                 intra_mode_list[i]);
       }
     } else {
-      for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
-        PREDICTION_MODE this_mode;
-        if (best_pickmode.best_ref_frame != ref_frame) continue;
-        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-          update_thresh_freq_fact(cpi, x, bsize, ref_frame, best_mode_idx,
-                                  this_mode);
-        }
+      PREDICTION_MODE this_mode;
+      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+        update_thresh_freq_fact(cpi, x, bsize, best_pickmode.best_ref_frame,
+                                best_mode_idx, this_mode);
       }
     }
   }
diff --git a/av1/encoder/optical_flow.c b/av1/encoder/optical_flow.c
new file mode 100644
index 0000000..eed1def
--- /dev/null
+++ b/av1/encoder/optical_flow.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/mathutils.h"
+#include "av1/encoder/optical_flow.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "aom_mem/aom_mem.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+  return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+// Helper function to determine whether optical flow method is sparse.
+static INLINE int is_sparse(const OPFL_PARAMS *opfl_params) {
+  return (opfl_params->flags & OPFL_FLAG_SPARSE) ? 1 : 0;
+}
+
+typedef struct LOCALMV {
+  double row;
+  double col;
+} LOCALMV;
+
+void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+                           const YV12_BUFFER_CONFIG *ref_frame,
+                           const double x_coord, const double y_coord,
+                           const int window_size, const int bit_depth,
+                           double *ix, double *iy, double *it, LOCALMV *mv);
+
+// coefficients for bilinear interpolation on unit square
+int pixel_interp(const double x, const double y, const double b00,
+                 const double b01, const double b10, const double b11) {
+  const int xint = (int)x;
+  const int yint = (int)y;
+  const double xdec = x - xint;
+  const double ydec = y - yint;
+  const double a = (1 - xdec) * (1 - ydec);
+  const double b = xdec * (1 - ydec);
+  const double c = (1 - xdec) * ydec;
+  const double d = xdec * ydec;
+  // if x, y are already integers, this results to b00
+  int interp = (int)round(a * b00 + b * b01 + c * b10 + d * b11);
+  return interp;
+}
+// bilinear interpolation to find subpixel values
+int get_subpixels(const YV12_BUFFER_CONFIG *frame, int *pred, const int w,
+                  const int h, LOCALMV mv, const double x_coord,
+                  const double y_coord) {
+  double left = x_coord + mv.row;
+  double top = y_coord + mv.col;
+  const int fromedge = 2;
+  const int height = frame->y_crop_height;
+  const int width = frame->y_crop_width;
+  if (left < 1) left = 1;
+  if (top < 1) top = 1;
+  // could use elements past boundary where stride > width
+  if (top > height - fromedge) top = height - fromedge;
+  if (left > width - fromedge) left = width - fromedge;
+  const uint8_t *buf = frame->y_buffer;
+  const int s = frame->y_stride;
+  int prev = -1;
+
+  int xint;
+  int yint;
+  int idx = 0;
+  for (int y = prev; y < prev + h; y++) {
+    for (int x = prev; x < prev + w; x++) {
+      double xx = left + x;
+      double yy = top + y;
+      xint = (int)xx;
+      yint = (int)yy;
+      int interp = pixel_interp(
+          xx, yy, buf[yint * s + xint], buf[yint * s + (xint + 1)],
+          buf[(yint + 1) * s + xint], buf[(yint + 1) * s + (xint + 1)]);
+      pred[idx++] = interp;
+    }
+  }
+  return 0;
+}
+// Scharr filter to compute spatial gradient
+void spatial_gradient(const YV12_BUFFER_CONFIG *frame, const int x_coord,
+                      const int y_coord, const int direction,
+                      double *derivative) {
+  double *filter;
+  // Scharr filters
+  double gx[9] = { -3, 0, 3, -10, 0, 10, -3, 0, 3 };
+  double gy[9] = { -3, -10, -3, 0, 0, 0, 3, 10, 3 };
+  if (direction == 0) {  // x direction
+    filter = gx;
+  } else {  // y direction
+    filter = gy;
+  }
+  int idx = 0;
+  double d = 0;
+  for (int yy = -1; yy <= 1; yy++) {
+    for (int xx = -1; xx <= 1; xx++) {
+      d += filter[idx] *
+           frame->y_buffer[(y_coord + yy) * frame->y_stride + (x_coord + xx)];
+      idx++;
+    }
+  }
+  // normalization scaling factor for scharr
+  *derivative = d / 32.0;
+}
+// Determine the spatial gradient at subpixel locations
+// For example, when reducing images for pyramidal LK,
+// corners found in original image may be at subpixel locations.
+void gradient_interp(double *fullpel_deriv, const double x_coord,
+                     const double y_coord, const int w, const int h,
+                     double *derivative) {
+  const int xint = (int)x_coord;
+  const int yint = (int)y_coord;
+  double interp;
+  if (xint + 1 > w - 1 || yint + 1 > h - 1) {
+    interp = fullpel_deriv[yint * w + xint];
+  } else {
+    interp = pixel_interp(x_coord, y_coord, fullpel_deriv[yint * w + xint],
+                          fullpel_deriv[yint * w + (xint + 1)],
+                          fullpel_deriv[(yint + 1) * w + xint],
+                          fullpel_deriv[(yint + 1) * w + (xint + 1)]);
+  }
+
+  *derivative = interp;
+}
+
+void temporal_gradient(const YV12_BUFFER_CONFIG *frame,
+                       const YV12_BUFFER_CONFIG *frame2, const double x_coord,
+                       const double y_coord, const int bit_depth,
+                       double *derivative, LOCALMV *mv) {
+  // TODO(any): this is a roundabout way of enforcing build_one_inter_pred
+  // to use the 8-tap filter (instead of lower). it would be more
+  // efficient to apply the filter only at 1 pixel instead of 25 pixels.
+  const int w = 5;
+  const int h = 5;
+  uint8_t pred1[25];
+  uint8_t pred2[25];
+
+  const int y = (int)y_coord;
+  const int x = (int)x_coord;
+  const double ydec = y_coord - y;
+  const double xdec = x_coord - x;
+  const int is_intrabc = 0;  // Is intra-copied?
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame2);
+  const int subsampling_x = 0, subsampling_y = 0;  // for y-buffer
+  const int_interpfilters interp_filters =
+      av1_broadcast_interp_filter(MULTITAP_SHARP);
+  const int plane = 0;  // y-plane
+  const struct buf_2d ref_buf2 = { NULL, frame2->y_buffer, frame2->y_crop_width,
+                                   frame2->y_crop_height, frame2->y_stride };
+  struct scale_factors scale;
+  av1_setup_scale_factors_for_frame(&scale, frame->y_crop_width,
+                                    frame->y_crop_height, frame->y_crop_width,
+                                    frame->y_crop_height);
+  InterPredParams inter_pred_params;
+  av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                        subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+                        &scale, &ref_buf2, interp_filters);
+  inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+  MV newmv = { .row = (int16_t)round((mv->row + xdec) * 8),
+               .col = (int16_t)round((mv->col + ydec) * 8) };
+  av1_enc_build_one_inter_predictor(pred2, w, &newmv, &inter_pred_params);
+  const struct buf_2d ref_buf1 = { NULL, frame->y_buffer, frame->y_crop_width,
+                                   frame->y_crop_height, frame->y_stride };
+  av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                        subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+                        &scale, &ref_buf1, interp_filters);
+  inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+  MV zeroMV = { .row = (int16_t)round(xdec * 8),
+                .col = (int16_t)round(ydec * 8) };
+  av1_enc_build_one_inter_predictor(pred1, w, &zeroMV, &inter_pred_params);
+
+  *derivative = pred2[0] - pred1[0];
+}
+// Numerical differentiate over window_size x window_size surrounding (x,y)
+// location. Alters ix, iy, it to contain numerical partial derivatives
+void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+                           const YV12_BUFFER_CONFIG *ref_frame,
+                           const double x_coord, const double y_coord,
+                           const int window_size, const int bit_depth,
+                           double *ix, double *iy, double *it, LOCALMV *mv) {
+  const double left = x_coord - window_size / 2;
+  const double top = y_coord - window_size / 2;
+  // gradient operators need pixel before and after (start at 1)
+  const double x_start = AOMMAX(1, left);
+  const double y_start = AOMMAX(1, top);
+  const int frame_height = frame->y_crop_height;
+  const int frame_width = frame->y_crop_width;
+  double deriv_x;
+  double deriv_y;
+  double deriv_t;
+
+  const double x_end = AOMMIN(x_coord + window_size / 2, frame_width - 2);
+  const double y_end = AOMMIN(y_coord + window_size / 2, frame_height - 2);
+  const int xs = (int)AOMMAX(1, x_start - 1);
+  const int ys = (int)AOMMAX(1, y_start - 1);
+  const int xe = (int)AOMMIN(x_end + 2, frame_width - 2);
+  const int ye = (int)AOMMIN(y_end + 2, frame_height - 2);
+  // with normalization, gradients may be double values
+  double *fullpel_dx = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_x));
+  double *fullpel_dy = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_y));
+  // TODO(any): This could be more efficient in the case that x_coord
+  // and y_coord are integers.. but it may look more messy.
+
+  // calculate spatial gradients at full pixel locations
+  for (int j = ys; j < ye; j++) {
+    for (int i = xs; i < xe; i++) {
+      spatial_gradient(frame, i, j, 0, &deriv_x);
+      spatial_gradient(frame, i, j, 1, &deriv_y);
+      int idx = (j - ys) * (xe - xs) + (i - xs);
+      fullpel_dx[idx] = deriv_x;
+      fullpel_dy[idx] = deriv_y;
+    }
+  }
+  // compute numerical differentiation for every pixel in window
+  // (this potentially includes subpixels)
+  for (double j = y_start; j < y_end; j++) {
+    for (double i = x_start; i < x_end; i++) {
+      temporal_gradient(frame, ref_frame, i, j, bit_depth, &deriv_t, mv);
+      gradient_interp(fullpel_dx, i - xs, j - ys, xe - xs, ye - ys, &deriv_x);
+      gradient_interp(fullpel_dy, i - xs, j - ys, xe - xs, ye - ys, &deriv_y);
+      int idx = (int)(j - top) * window_size + (int)(i - left);
+      ix[idx] = deriv_x;
+      iy[idx] = deriv_y;
+      it[idx] = deriv_t;
+    }
+  }
+  // TODO(any): to avoid setting deriv arrays to zero for every iteration,
+  // could instead pass these two values back through function call
+  // int first_idx = (int)(y_start - top) * window_size + (int)(x_start - left);
+  // int width = window_size - ((int)(x_start - left) + (int)(left + window_size
+  // - x_end));
+
+  aom_free(fullpel_dx);
+  aom_free(fullpel_dy);
+}
+
+// To compute eigenvalues of 2x2 matrix: Solve for lambda where
+// Determinant(matrix - lambda*identity) == 0
+void eigenvalues_2x2(const double *matrix, double *eig) {
+  const double a = 1;
+  const double b = -1 * matrix[0] - matrix[3];
+  const double c = -1 * matrix[1] * matrix[2] + matrix[0] * matrix[3];
+  // quadratic formula
+  const double discriminant = b * b - 4 * a * c;
+  eig[0] = (-b - sqrt(discriminant)) / (2.0 * a);
+  eig[1] = (-b + sqrt(discriminant)) / (2.0 * a);
+  // double check that eigenvalues are ordered by magnitude
+  if (fabs(eig[0]) > fabs(eig[1])) {
+    double tmp = eig[0];
+    eig[0] = eig[1];
+    eig[1] = tmp;
+  }
+}
+// Shi-Tomasi corner detection criteria
+double corner_score(const YV12_BUFFER_CONFIG *frame_to_filter,
+                    const YV12_BUFFER_CONFIG *ref_frame, const int x,
+                    const int y, double *i_x, double *i_y, double *i_t,
+                    const int n, const int bit_depth) {
+  double eig[2];
+  LOCALMV mv = { .row = 0, .col = 0 };
+  // TODO(any): technically, ref_frame and i_t are not used by corner score
+  // so these could be replaced by dummy variables,
+  // or change this to spatial gradient function over window only
+  gradients_over_window(frame_to_filter, ref_frame, x, y, n, bit_depth, i_x,
+                        i_y, i_t, &mv);
+  double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 };
+  multiply_mat(i_x, i_x, Mres1, 1, n * n, 1);
+  multiply_mat(i_x, i_y, Mres2, 1, n * n, 1);
+  multiply_mat(i_y, i_y, Mres3, 1, n * n, 1);
+  double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] };
+  eigenvalues_2x2(M, eig);
+  return fabs(eig[0]);
+}
+// Finds corners in frame_to_filter
+// For less strict requirements (i.e. more corners), decrease threshold
+int detect_corners(const YV12_BUFFER_CONFIG *frame_to_filter,
+                   const YV12_BUFFER_CONFIG *ref_frame, const int maxcorners,
+                   int *ref_corners, const int bit_depth) {
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  // TODO(any): currently if maxcorners is decreased, then it only means
+  // corners will be omited from bottom-right of image. if maxcorners
+  // is actually used, then this algorithm would need to re-iterate
+  // and choose threshold based on that
+  assert(maxcorners == frame_height * frame_width);
+  int countcorners = 0;
+  const double threshold = 0.1;
+  double score;
+  const int n = 3;
+  double i_x[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  double i_y[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  double i_t[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  const int fromedge = n;
+  double max_score = corner_score(frame_to_filter, ref_frame, fromedge,
+                                  fromedge, i_x, i_y, i_t, n, bit_depth);
+  // rough estimate of max corner score in image
+  for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+    for (int y = fromedge; y < frame_height - fromedge; y += frame_height / 5) {
+      for (int i = 0; i < n * n; i++) {
+        i_x[i] = 0;
+        i_y[i] = 0;
+        i_t[i] = 0;
+      }
+      score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+                           bit_depth);
+      if (score > max_score) {
+        max_score = score;
+      }
+    }
+  }
+  // score all the points and choose corners over threshold
+  for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+    for (int y = fromedge;
+         (y < frame_height - fromedge) && countcorners < maxcorners; y += 1) {
+      for (int i = 0; i < n * n; i++) {
+        i_x[i] = 0;
+        i_y[i] = 0;
+        i_t[i] = 0;
+      }
+      score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+                           bit_depth);
+      if (score > threshold * max_score) {
+        ref_corners[countcorners * 2] = x;
+        ref_corners[countcorners * 2 + 1] = y;
+        countcorners++;
+      }
+    }
+  }
+  return countcorners;
+}
+// weights is an nxn matrix. weights is filled with a gaussian function,
+// with independent variable: distance from the center point.
+void gaussian(const double sigma, const int n, const int normalize,
+              double *weights) {
+  double total_weight = 0;
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      double distance = sqrt(pow(n / 2 - i, 2) + pow(n / 2 - j, 2));
+      double weight = exp(-0.5 * pow(distance / sigma, 2));
+      weights[j * n + i] = weight;
+      total_weight += weight;
+    }
+  }
+  if (normalize == 1) {
+    for (int j = 0; j < n; j++) {
+      weights[j] = weights[j] / total_weight;
+    }
+  }
+}
+double convolve(const double *filter, const int *img, const int size) {
+  double result = 0;
+  for (int i = 0; i < size; i++) {
+    result += filter[i] * img[i];
+  }
+  return result;
+}
+// Applies a Gaussian low-pass smoothing filter to produce
+// a corresponding lower resolution image with halved dimensions
+void reduce(uint8_t *img, int height, int width, int stride,
+            uint8_t *reduced_img) {
+  const int new_width = width / 2;
+  const int window_size = 5;
+  const double gaussian_filter[25] = {
+    1. / 256, 1.0 / 64, 3. / 128, 1. / 64,  1. / 256, 1. / 64, 1. / 16,
+    3. / 32,  1. / 16,  1. / 64,  3. / 128, 3. / 32,  9. / 64, 3. / 32,
+    3. / 128, 1. / 64,  1. / 16,  3. / 32,  1. / 16,  1. / 64, 1. / 256,
+    1. / 64,  3. / 128, 1. / 64,  1. / 256
+  };
+  // filter is 5x5 so need prev and forward 2 pixels
+  int img_section[25];
+  for (int y = 0; y < height - 1; y += 2) {
+    for (int x = 0; x < width - 1; x += 2) {
+      int i = 0;
+      for (int yy = y - window_size / 2; yy <= y + window_size / 2; yy++) {
+        for (int xx = x - window_size / 2; xx <= x + window_size / 2; xx++) {
+          int yvalue = yy;
+          int xvalue = xx;
+          // copied pixels outside the boundary
+          if (yvalue < 0) yvalue = 0;
+          if (xvalue < 0) xvalue = 0;
+          if (yvalue >= height) yvalue = height - 1;
+          if (xvalue >= width) xvalue = width - 1;
+          img_section[i++] = img[yvalue * stride + xvalue];
+        }
+      }
+      reduced_img[(y / 2) * new_width + (x / 2)] = (uint8_t)convolve(
+          gaussian_filter, img_section, (int)pow(window_size, 2));
+    }
+  }
+}
+int cmpfunc(const void *a, const void *b) { return (*(int *)a - *(int *)b); }
+void filter_mvs(const MV_FILTER_TYPE mv_filter, const int frame_height,
+                const int frame_width, LOCALMV *localmvs, MV *mvs) {
+  const int n = 5;  // window size
+  // for smoothing filter
+  const double gaussian_filter[25] = {
+    1. / 256, 1. / 64,  3. / 128, 1. / 64,  1. / 256, 1. / 64, 1. / 16,
+    3. / 32,  1. / 16,  1. / 64,  3. / 128, 3. / 32,  9. / 64, 3. / 32,
+    3. / 128, 1. / 64,  1. / 16,  3. / 32,  1. / 16,  1. / 64, 1. / 256,
+    1. / 64,  3. / 128, 1. / 64,  1. / 256
+  };
+  // for median filter
+  int mvrows[25];
+  int mvcols[25];
+  if (mv_filter != MV_FILTER_NONE) {
+    for (int y = 0; y < frame_height; y++) {
+      for (int x = 0; x < frame_width; x++) {
+        int center_idx = y * frame_width + x;
+        if (fabs(localmvs[center_idx].row) > 0 ||
+            fabs(localmvs[center_idx].col) > 0) {
+          int i = 0;
+          double filtered_row = 0;
+          double filtered_col = 0;
+          for (int yy = y - n / 2; yy <= y + n / 2; yy++) {
+            for (int xx = x - n / 2; xx <= x + n / 2; xx++) {
+              int yvalue = yy + y;
+              int xvalue = xx + x;
+              // copied pixels outside the boundary
+              if (yvalue < 0) yvalue = 0;
+              if (xvalue < 0) xvalue = 0;
+              if (yvalue >= frame_height) yvalue = frame_height - 1;
+              if (xvalue >= frame_width) xvalue = frame_width - 1;
+              int index = yvalue * frame_width + xvalue;
+              if (mv_filter == MV_FILTER_SMOOTH) {
+                filtered_row += mvs[index].row * gaussian_filter[i];
+                filtered_col += mvs[index].col * gaussian_filter[i];
+              } else if (mv_filter == MV_FILTER_MEDIAN) {
+                mvrows[i] = mvs[index].row;
+                mvcols[i] = mvs[index].col;
+              }
+              i++;
+            }
+          }
+
+          MV mv = mvs[center_idx];
+          if (mv_filter == MV_FILTER_SMOOTH) {
+            mv.row = (int16_t)filtered_row;
+            mv.col = (int16_t)filtered_col;
+          } else if (mv_filter == MV_FILTER_MEDIAN) {
+            qsort(mvrows, 25, sizeof(mv.row), cmpfunc);
+            qsort(mvcols, 25, sizeof(mv.col), cmpfunc);
+            mv.row = mvrows[25 / 2];
+            mv.col = mvcols[25 / 2];
+          }
+          LOCALMV localmv = { .row = ((double)mv.row) / 8,
+                              .col = ((double)mv.row) / 8 };
+          localmvs[y * frame_width + x] = localmv;
+          // if mvs array is immediately updated here, then the result may
+          // propagate to other pixels.
+        }
+      }
+    }
+    for (int i = 0; i < frame_height * frame_width; i++) {
+      if (fabs(localmvs[i].row) > 0 || fabs(localmvs[i].col) > 0) {
+        MV mv = { .row = (int16_t)round(8 * localmvs[i].row),
+                  .col = (int16_t)round(8 * localmvs[i].col) };
+        mvs[i] = mv;
+      }
+    }
+  }
+}
+
+// Computes optical flow at a single pyramid level,
+// using Lucas-Kanade algorithm.
+// Modifies mvs array.
+void lucas_kanade(const YV12_BUFFER_CONFIG *frame_to_filter,
+                  const YV12_BUFFER_CONFIG *ref_frame, const int level,
+                  const LK_PARAMS *lk_params, const int num_ref_corners,
+                  int *ref_corners, const int highres_frame_width,
+                  const int bit_depth, LOCALMV *mvs) {
+  assert(lk_params->window_size > 0 && lk_params->window_size % 2 == 0);
+  const int n = lk_params->window_size;
+  // algorithm is sensitive to window size
+  double *i_x = (double *)aom_malloc(n * n * sizeof(double));
+  double *i_y = (double *)aom_malloc(n * n * sizeof(double));
+  double *i_t = (double *)aom_malloc(n * n * sizeof(double));
+  const int expand_multiplier = (int)pow(2, level);
+  double sigma = 0.2 * n;
+  double *weights = (double *)aom_malloc(n * n * sizeof(double));
+  // normalizing doesn't really affect anything since it's applied
+  // to every component of M and b
+  gaussian(sigma, n, 0, weights);
+  for (int i = 0; i < num_ref_corners; i++) {
+    const double x_coord = 1.0 * ref_corners[i * 2] / expand_multiplier;
+    const double y_coord = 1.0 * ref_corners[i * 2 + 1] / expand_multiplier;
+    int highres_x = ref_corners[i * 2];
+    int highres_y = ref_corners[i * 2 + 1];
+    int mv_idx = highres_y * (highres_frame_width) + highres_x;
+    LOCALMV mv_old = mvs[mv_idx];
+    mv_old.row = mv_old.row / expand_multiplier;
+    mv_old.col = mv_old.col / expand_multiplier;
+    // using this instead of memset, since it's not completely
+    // clear if zero memset works on double arrays
+    for (int j = 0; j < n * n; j++) {
+      i_x[j] = 0;
+      i_y[j] = 0;
+      i_t[j] = 0;
+    }
+    gradients_over_window(frame_to_filter, ref_frame, x_coord, y_coord, n,
+                          bit_depth, i_x, i_y, i_t, &mv_old);
+    double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 };
+    double bres1[1] = { 0 }, bres2[1] = { 0 };
+    for (int j = 0; j < n * n; j++) {
+      Mres1[0] += weights[j] * i_x[j] * i_x[j];
+      Mres2[0] += weights[j] * i_x[j] * i_y[j];
+      Mres3[0] += weights[j] * i_y[j] * i_y[j];
+      bres1[0] += weights[j] * i_x[j] * i_t[j];
+      bres2[0] += weights[j] * i_y[j] * i_t[j];
+    }
+    double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] };
+    double b[2] = { -1 * bres1[0], -1 * bres2[0] };
+    double eig[2] = { 1, 1 };
+    eigenvalues_2x2(M, eig);
+    double threshold = 0.1;
+    if (fabs(eig[0]) > threshold) {
+      // if M is not invertible, then displacement
+      // will default to zeros
+      double u[2] = { 0, 0 };
+      linsolve(2, M, 2, b, u);
+      int mult = 1;
+      if (level != 0)
+        mult = expand_multiplier;  // mv doubles when resolution doubles
+      LOCALMV mv = { .row = (mult * (u[0] + mv_old.row)),
+                     .col = (mult * (u[1] + mv_old.col)) };
+      mvs[mv_idx] = mv;
+      mvs[mv_idx] = mv;
+    }
+  }
+  aom_free(weights);
+  aom_free(i_t);
+  aom_free(i_x);
+  aom_free(i_y);
+}
+
+// Apply optical flow iteratively at each pyramid level
+void pyramid_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+                          const YV12_BUFFER_CONFIG *to_frame,
+                          const int bit_depth, const OPFL_PARAMS *opfl_params,
+                          const OPTFLOW_METHOD method, LOCALMV *mvs) {
+  assert(opfl_params->pyramid_levels > 0 &&
+         opfl_params->pyramid_levels <= MAX_PYRAMID_LEVELS);
+  int levels = opfl_params->pyramid_levels;
+  const int frame_height = from_frame->y_crop_height;
+  const int frame_width = from_frame->y_crop_width;
+  if ((frame_height / pow(2.0, levels - 1) < 50 ||
+       frame_height / pow(2.0, levels - 1) < 50) &&
+      levels > 1)
+    levels = levels - 1;
+  uint8_t *images1[MAX_PYRAMID_LEVELS];
+  uint8_t *images2[MAX_PYRAMID_LEVELS];
+  images1[0] = from_frame->y_buffer;
+  images2[0] = to_frame->y_buffer;
+  YV12_BUFFER_CONFIG *buffers1 =
+      aom_malloc(levels * sizeof(YV12_BUFFER_CONFIG));
+  YV12_BUFFER_CONFIG *buffers2 =
+      aom_malloc(levels * sizeof(YV12_BUFFER_CONFIG));
+  buffers1[0] = *from_frame;
+  buffers2[0] = *to_frame;
+  int fw = frame_width;
+  int fh = frame_height;
+  for (int i = 1; i < levels; i++) {
+    images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(uint8_t));
+    images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(uint8_t));
+    int stride;
+    if (i == 1)
+      stride = from_frame->y_stride;
+    else
+      stride = fw;
+    reduce(images1[i - 1], fh, fw, stride, images1[i]);
+    reduce(images2[i - 1], fh, fw, stride, images2[i]);
+    fh /= 2;
+    fw /= 2;
+    YV12_BUFFER_CONFIG a = { .y_buffer = images1[i],
+                             .y_crop_width = fw,
+                             .y_crop_height = fh,
+                             .y_stride = fw };
+    YV12_BUFFER_CONFIG b = { .y_buffer = images2[i],
+                             .y_crop_width = fw,
+                             .y_crop_height = fh,
+                             .y_stride = fw };
+    buffers1[i] = a;
+    buffers2[i] = b;
+  }
+  // Compute corners for specific frame
+  int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height;
+  int *ref_corners = aom_malloc(maxcorners * 2 * sizeof(int));
+  int num_ref_corners = 0;
+  if (is_sparse(opfl_params)) {
+    num_ref_corners = detect_corners(from_frame, to_frame, maxcorners,
+                                     ref_corners, bit_depth);
+  }
+  const int stop_level = 0;
+  for (int i = levels - 1; i >= stop_level; i--) {
+    if (method == LUCAS_KANADE) {
+      assert(is_sparse(opfl_params));
+      lucas_kanade(&buffers1[i], &buffers2[i], i, opfl_params->lk_params,
+                   num_ref_corners, ref_corners, buffers1[0].y_crop_width,
+                   bit_depth, mvs);
+    }
+  }
+  for (int i = 1; i < levels; i++) {
+    aom_free(images1[i]);
+    aom_free(images2[i]);
+  }
+  aom_free(ref_corners);
+}
+// Computes optical flow by applying algorithm at
+// multiple pyramid levels of images (lower-resolution, smoothed images)
+// This accounts for larger motions.
+// Inputs:
+//   from_frame Frame buffer.
+//   to_frame: Frame buffer. MVs point from_frame -> to_frame.
+//   from_frame_idx: Index of from_frame.
+//   to_frame_idx: Index of to_frame. Return all zero MVs when idx are equal.
+//   bit_depth:
+//   opfl_params: contains algorithm-specific parameters.
+//   mv_filter: MV_FILTER_NONE, MV_FILTER_SMOOTH, or MV_FILTER_MEDIAN.
+//   method: LUCAS_KANADE,
+//   mvs: pointer to MVs. Contains initialization, and modified
+//   based on optical flow. Must have
+//   dimensions = from_frame->y_crop_width * from_frame->y_crop_height
+void optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+                  const YV12_BUFFER_CONFIG *to_frame, const int from_frame_idx,
+                  const int to_frame_idx, const int bit_depth,
+                  const OPFL_PARAMS *opfl_params,
+                  const MV_FILTER_TYPE mv_filter, const OPTFLOW_METHOD method,
+                  MV *mvs) {
+  const int frame_height = from_frame->y_crop_height;
+  const int frame_width = from_frame->y_crop_width;
+  // TODO(any): deal with the case where frames are not of the same dimensions
+  assert(frame_height == to_frame->y_crop_height &&
+         frame_width == to_frame->y_crop_width);
+  if (from_frame_idx == to_frame_idx) {
+    // immediately return all zero mvs when frame indices are equal
+    for (int yy = 0; yy < frame_height; yy++) {
+      for (int xx = 0; xx < frame_width; xx++) {
+        MV mv = { .row = 0, .col = 0 };
+        mvs[yy * frame_width + xx] = mv;
+      }
+    }
+    return;
+  }
+
+  // Initialize double mvs based on input parameter mvs array
+  LOCALMV *localmvs = aom_malloc(frame_height * frame_width * sizeof(LOCALMV));
+  for (int i = 0; i < frame_width * frame_height; i++) {
+    MV mv = mvs[i];
+    LOCALMV localmv = { .row = ((double)mv.row) / 8,
+                        .col = ((double)mv.col) / 8 };
+    localmvs[i] = localmv;
+  }
+  // Apply optical flow algorithm
+  pyramid_optical_flow(from_frame, to_frame, bit_depth, opfl_params, method,
+                       localmvs);
+
+  // Update original mvs array
+  for (int j = 0; j < frame_height; j++) {
+    for (int i = 0; i < frame_width; i++) {
+      int idx = j * frame_width + i;
+      int new_x = (int)(localmvs[idx].row + i);
+      int new_y = (int)(localmvs[idx].col + j);
+      if ((fabs(localmvs[idx].row) >= 0.125 ||
+           fabs(localmvs[idx].col) >= 0.125)) {
+        // if mv points outside of frame (lost feature), keep old mv.
+        if (new_x < frame_width && new_x >= 0 && new_y < frame_height &&
+            new_y >= 0) {
+          MV mv = { .row = (int16_t)round(8 * localmvs[idx].row),
+                    .col = (int16_t)round(8 * localmvs[idx].col) };
+          mvs[idx] = mv;
+        }
+      }
+    }
+  }
+
+  filter_mvs(mv_filter, frame_height, frame_width, localmvs, mvs);
+
+  aom_free(localmvs);
+}
+#endif
diff --git a/av1/encoder/optical_flow.h b/av1/encoder/optical_flow.h
new file mode 100644
index 0000000..9b7cd62
--- /dev/null
+++ b/av1/encoder/optical_flow.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+#define AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OPTICAL_FLOW_API
+
+typedef enum { LUCAS_KANADE } OPTFLOW_METHOD;
+
+typedef enum {
+  MV_FILTER_NONE,
+  MV_FILTER_SMOOTH,
+  MV_FILTER_MEDIAN
+} MV_FILTER_TYPE;
+
+#define MAX_PYRAMID_LEVELS 5
+// default options for optical flow
+#define OPFL_WINDOW_SIZE 15
+#define OPFL_PYRAMID_LEVELS 3  // total levels
+
+// parameters specific to Lucas-Kanade
+typedef struct lk_params {
+  int window_size;
+} LK_PARAMS;
+
+// generic structure to contain parameters for all
+// optical flow algorithms
+typedef struct opfl_params {
+  int pyramid_levels;
+  LK_PARAMS *lk_params;
+  int flags;
+} OPFL_PARAMS;
+
+#define OPFL_FLAG_SPARSE 1
+
+void init_opfl_params(OPFL_PARAMS *opfl_params) {
+  opfl_params->pyramid_levels = OPFL_PYRAMID_LEVELS;
+  opfl_params->lk_params = NULL;
+}
+
+void init_lk_params(LK_PARAMS *lk_params) {
+  lk_params->window_size = OPFL_WINDOW_SIZE;
+}
+
+void optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+                  const YV12_BUFFER_CONFIG *to_frame, const int from_frame_idx,
+                  const int to_frame_idx, const int bit_depth,
+                  const OPFL_PARAMS *opfl_params,
+                  const MV_FILTER_TYPE mv_filter, const OPTFLOW_METHOD method,
+                  MV *mvs);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_OPTICAL_FLOW_H_
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index e61cd02c..f821bdc 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -12,9 +12,17 @@
 #include <math.h>
 #include <stdlib.h>
 
+#include "av1/common/pred_common.h"
+
+#include "av1/encoder/block.h"
 #include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/tx_search.h"
 
 #define AV1_K_MEANS_DIM 1
 #include "av1/encoder/k_means_template.h"
@@ -115,7 +123,7 @@
 }
 
 int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
-                             uint16_t *color_cache, int n_cache,
+                             const uint16_t *color_cache, int n_cache,
                              int bit_depth) {
   const int n = pmi->palette_size[0];
   int out_cache_colors[PALETTE_MAX_SIZE];
@@ -129,7 +137,7 @@
 }
 
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
-                              uint16_t *color_cache, int n_cache,
+                              const uint16_t *color_cache, int n_cache,
                               int bit_depth) {
   const int n = pmi->palette_size[1];
   int total_bits = 0;
@@ -152,3 +160,676 @@
   total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw);
   return av1_cost_literal(total_bits);
 }
+
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height'. Extra rows and columns are filled in by copying last valid
+// row/column.
+static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
+                                                int orig_width, int orig_height,
+                                                int new_width, int new_height) {
+  int j;
+  assert(new_width >= orig_width);
+  assert(new_height >= orig_height);
+  if (new_width == orig_width && new_height == orig_height) return;
+
+  for (j = orig_height - 1; j >= 0; --j) {
+    memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
+    // Copy last column to extra columns.
+    memset(color_map + j * new_width + orig_width,
+           color_map[j * new_width + orig_width - 1], new_width - orig_width);
+  }
+  // Copy last row to extra rows.
+  for (j = orig_height; j < new_height; ++j) {
+    memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
+           new_width);
+  }
+}
+
+// Bias toward using colors in the cache.
+// TODO(huisu): Try other schemes to improve compression.
+static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
+                                               int n_cache, int n_colors,
+                                               int stride, int *centroids,
+                                               int bit_depth) {
+  if (n_cache <= 0) return;
+  for (int i = 0; i < n_colors * stride; i += stride) {
+    int min_diff = abs(centroids[i] - (int)color_cache[0]);
+    int idx = 0;
+    for (int j = 1; j < n_cache; ++j) {
+      const int this_diff = abs(centroids[i] - color_cache[j]);
+      if (this_diff < min_diff) {
+        min_diff = this_diff;
+        idx = j;
+      }
+    }
+    const int min_threshold = 4 << (bit_depth - 8);
+    if (min_diff <= min_threshold) centroids[i] = color_cache[idx];
+  }
+}
+
+/*!\brief Calculate the luma palette cost from a given color palette
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * Given the base colors as specified in centroids[], calculate the RD cost
+ * of palette mode.
+ */
+static AOM_INLINE void palette_rd_y(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
+    uint8_t *tx_type_map, int *beat_best_palette_rd) {
+  optimize_palette_colors(color_cache, n_cache, n, 1, centroids,
+                          cpi->common.seq_params.bit_depth);
+  const int num_unique_colors = av1_remove_duplicates(centroids, n);
+  if (num_unique_colors < PALETTE_MIN_SIZE) {
+    // Too few unique colors to create a palette. And DC_PRED will work
+    // well for that case anyway. So skip.
+    return;
+  }
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  if (cpi->common.seq_params.use_highbitdepth) {
+    for (int i = 0; i < num_unique_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel_highbd(
+          (int)centroids[i], cpi->common.seq_params.bit_depth);
+    }
+  } else {
+    for (int i = 0; i < num_unique_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel(centroids[i]);
+    }
+  }
+  pmi->palette_size[0] = num_unique_colors;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
+                   1);
+  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+
+  const int palette_mode_cost =
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
+  if (model_intra_yrd_and_prune(cpi, x, bsize, palette_mode_cost,
+                                best_model_rd)) {
+    return;
+  }
+
+  RD_STATS tokenonly_rd_stats;
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                    *best_rd);
+  if (tokenonly_rd_stats.rate == INT_MAX) return;
+  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
+  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+    tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+  // Collect mode stats for multiwinner mode processing
+  const int txfm_search_done = 1;
+  store_winner_mode_stats(
+      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
+      this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    // Setting beat_best_rd flag because current mode rd is better than best_rd.
+    // This flag need to be updated only for palette evaluation in key frames
+    if (beat_best_rd) *beat_best_rd = 1;
+    memcpy(best_palette_color_map, color_map,
+           block_width * block_height * sizeof(color_map[0]));
+    *best_mbmi = *mbmi;
+    memcpy(blk_skip, x->txfm_search_info.blk_skip,
+           sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    if (rate) *rate = this_rate;
+    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+    if (distortion) *distortion = tokenonly_rd_stats.dist;
+    if (skippable) *skippable = tokenonly_rd_stats.skip_txfm;
+    if (beat_best_palette_rd) *beat_best_palette_rd = 1;
+  }
+}
+
+static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) {
+  assert(step_size != 0);
+  return (step_size > 0) ? curr_idx >= end_idx : curr_idx <= end_idx;
+}
+
+// Performs count-based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last numbers searched in last_n_searched and
+// returns the best number of colors found.
+static AOM_INLINE int perform_top_color_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
+    int start_n, int end_n, int step_size, int *last_n_searched,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  int top_color_winner = end_n;
+  /* clang-format off */
+  assert(IMPLIES(step_size < 0, start_n > end_n));
+  /* clang-format on */
+  assert(IMPLIES(step_size > 0, start_n < end_n));
+  while (!is_iter_over(n, end_n, step_size)) {
+    int beat_best_palette_rd = 0;
+    memcpy(centroids, top_colors, n * sizeof(top_colors[0]));
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_palette_rd);
+    *last_n_searched = n;
+    if (beat_best_palette_rd) {
+      top_color_winner = n;
+    } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+      // At search level 2, we return immediately if we don't see an improvement
+      return top_color_winner;
+    }
+    n += step_size;
+  }
+  return top_color_winner;
+}
+
+// Performs k-means based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last numbers searched in last_n_searched and
+// returns the best number of colors found.
+static AOM_INLINE int perform_k_means_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lower_bound,
+    int upper_bound, int start_n, int end_n, int step_size,
+    int *last_n_searched, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  int top_color_winner = end_n;
+  /* clang-format off */
+  assert(IMPLIES(step_size < 0, start_n > end_n));
+  /* clang-format on */
+  assert(IMPLIES(step_size > 0, start_n < end_n));
+  while (!is_iter_over(n, end_n, step_size)) {
+    int beat_best_palette_rd = 0;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] =
+          lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_palette_rd);
+    *last_n_searched = n;
+    if (beat_best_palette_rd) {
+      top_color_winner = n;
+    } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+      // At search level 2, we return immediately if we don't see an improvement
+      return top_color_winner;
+    }
+    n += step_size;
+  }
+  return top_color_winner;
+}
+
+// Sets the parameters to search the current number of colors +- 1
+static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size,
+                                         int winner, int end_n) {
+  // Set min to winner - 1 unless we are already at the border, then we set it
+  // to winner + 1
+  *min_n = (winner == PALETTE_MIN_SIZE) ? (PALETTE_MIN_SIZE + 1)
+                                        : AOMMAX(winner - 1, PALETTE_MIN_SIZE);
+  // Set max to winner + 1 unless we are already at the border, then we set it
+  // to winner - 1
+  *max_n =
+      (winner == end_n) ? (winner - 1) : AOMMIN(winner + 1, PALETTE_MAX_SIZE);
+
+  // Set the step size to max_n - min_n so we only search those two values.
+  // If max_n == min_n, then set step_size to 1 to avoid infinite loop later.
+  *step_size = AOMMAX(1, *max_n - *min_n);
+}
+
+static AOM_INLINE void fill_data_and_get_bounds(
+    const uint8_t *src, const int src_stride, const int rows, const int cols,
+    const int is_high_bitdepth, int *data, int *lower_bound, int *upper_bound) {
+  if (is_high_bitdepth) {
+    const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+    *lower_bound = *upper_bound = src_ptr[0];
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        const int val = src_ptr[c];
+        data[c] = val;
+        *lower_bound = AOMMIN(*lower_bound, val);
+        *upper_bound = AOMMAX(*upper_bound, val);
+      }
+      src_ptr += src_stride;
+      data += cols;
+    }
+    return;
+  }
+
+  // low bit depth
+  *lower_bound = *upper_bound = src[0];
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      const int val = src[c];
+      data[c] = val;
+      *lower_bound = AOMMIN(*lower_bound, val);
+      *upper_bound = AOMMAX(*upper_bound, val);
+    }
+    src += src_stride;
+    data += cols;
+  }
+}
+
+void av1_rd_pick_palette_intra_sby(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           bsize));
+  assert(PALETTE_MAX_SIZE == 8);
+  assert(PALETTE_MIN_SIZE == 2);
+
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  const int is_hbd = seq_params->use_highbitdepth;
+  const int bit_depth = seq_params->bit_depth;
+  int unused;
+
+  int count_buf[1 << 12];      // Maximum (1 << 12) color levels.
+  int count_buf_8bit[1 << 8];  // Maximum (1 << 8) bins for hbd path.
+  int colors, colors_threshold = 0;
+  if (is_hbd) {
+    av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf,
+                            count_buf_8bit, &colors_threshold, &colors);
+  } else {
+    av1_count_colors(src, src_stride, rows, cols, count_buf, &colors);
+    colors_threshold = colors;
+  }
+
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  if (colors_threshold > 1 && colors_threshold <= 64) {
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[PALETTE_MAX_SIZE];
+    int lower_bound, upper_bound;
+    fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data,
+                             &lower_bound, &upper_bound);
+
+    mbmi->mode = DC_PRED;
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+    // Find the dominant colors, stored in top_colors[].
+    int top_colors[PALETTE_MAX_SIZE] = { 0 };
+    for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
+      int max_count = 0;
+      for (int j = 0; j < (1 << bit_depth); ++j) {
+        if (count_buf[j] > max_count) {
+          max_count = count_buf[j];
+          top_colors[i] = j;
+        }
+      }
+      assert(max_count > 0);
+      count_buf[top_colors[i]] = 0;
+    }
+
+    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+    // where the dominant colors and the k-means results are similar.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
+        (colors > PALETTE_MIN_SIZE)) {
+      // Start index and step size below are chosen to evaluate unique
+      // candidates in neighbor search, in case a winner candidate is found in
+      // coarse search. Example,
+      // 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
+      // size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
+      // If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
+      // (3) and 8 (7).
+      // 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
+      // as for 8 colors) then step size should also be 2, to cover all
+      // candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
+      // 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
+      // coarse search will evaluate 3 and 6. For the winner, unique neighbors
+      // (3: 2,4 or 6: 5,7) would be evaluated.
+
+      // Start index for coarse palette search for dominant colors and k-means
+      const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                   3, 3, 2,
+                                                                   3, 3, 2 };
+      // Step size for coarse palette search for dominant colors and k-means
+      const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                     3, 3, 3,
+                                                                     3, 3, 3 };
+
+      // Choose the start index and step size for coarse search based on number
+      // of colors
+      const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE);
+      const int min_n = start_n_lookup_table[max_n];
+      const int step_size = step_size_lookup_table[max_n];
+      assert(min_n >= PALETTE_MIN_SIZE);
+
+      // Perform top color coarse palette search to find the winner candidate
+      const int top_color_winner = perform_top_color_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+          step_size, &unused, color_cache, n_cache, best_mbmi,
+          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
+      // Evaluate neighbors for the winner color (if winner is found) in the
+      // above coarse search for dominant colors
+      if (top_color_winner <= max_n) {
+        int stage2_min_n, stage2_max_n, stage2_step_size;
+        set_stage2_params(&stage2_min_n, &stage2_max_n, &stage2_step_size,
+                          top_color_winner, max_n);
+        // perform finer search for the winner candidate
+        perform_top_color_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n,
+            stage2_max_n + 1, stage2_step_size, &unused, color_cache, n_cache,
+            best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate,
+            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map);
+      }
+      // K-means clustering.
+      // Perform k-means coarse palette search to find the winner candidate
+      const int k_means_winner = perform_k_means_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+          min_n, max_n + 1, step_size, &unused, color_cache, n_cache, best_mbmi,
+          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+          color_map, rows * cols);
+      // Evaluate neighbors for the winner color (if winner is found) in the
+      // above coarse search for k-means
+      if (k_means_winner <= max_n) {
+        int start_n_stage2, end_n_stage2, step_size_stage2;
+        set_stage2_params(&start_n_stage2, &end_n_stage2, &step_size_stage2,
+                          k_means_winner, max_n);
+        // perform finer search for the winner candidate
+        perform_k_means_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+            start_n_stage2, end_n_stage2 + 1, step_size_stage2, &unused,
+            color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+            best_model_rd, rate, rate_tokenonly, distortion, skippable,
+            beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+            rows * cols);
+      }
+    } else {
+      const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
+                min_n = PALETTE_MIN_SIZE;
+      // Perform top color palette search in descending order
+      int last_n_searched = max_n;
+      perform_top_color_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n, min_n - 1,
+          -1, &last_n_searched, color_cache, n_cache, best_mbmi,
+          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
+
+      if (last_n_searched > min_n) {
+        // Search in ascending order until we get to the previous best
+        perform_top_color_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n,
+            last_n_searched, 1, &unused, color_cache, n_cache, best_mbmi,
+            best_palette_color_map, best_rd, best_model_rd, rate,
+            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map);
+      }
+      // K-means clustering.
+      if (colors == PALETTE_MIN_SIZE) {
+        // Special case: These colors automatically become the centroids.
+        assert(colors == 2);
+        centroids[0] = lower_bound;
+        centroids[1] = upper_bound;
+        palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
+                     color_cache, n_cache, best_mbmi, best_palette_color_map,
+                     best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                     skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                     NULL);
+      } else {
+        // Perform k-means palette search in descending order
+        last_n_searched = max_n;
+        perform_k_means_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+            max_n, min_n - 1, -1, &last_n_searched, color_cache, n_cache,
+            best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate,
+            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map, color_map, rows * cols);
+        if (last_n_searched > min_n) {
+          // Search in ascending order until we get to the previous best
+          perform_k_means_palette_search(
+              cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+              min_n, last_n_searched, 1, &unused, color_cache, n_cache,
+              best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate,
+              rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+              best_blk_skip, tx_type_map, color_map, rows * cols);
+        }
+      }
+    }
+  }
+
+  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           block_width * block_height * sizeof(best_palette_color_map[0]));
+  }
+  *mbmi = *best_mbmi;
+}
+
+void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
+                                    int dc_mode_cost,
+                                    uint8_t *best_palette_color_map,
+                                    MB_MODE_INFO *const best_mbmi,
+                                    int64_t *best_rd, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           mbmi->bsize));
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  int this_rate;
+  int64_t this_rd;
+  int colors_u, colors_v, colors;
+  int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0;
+  const int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  RD_STATS tokenonly_rd_stats;
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  mbmi->uv_mode = UV_DC_PRED;
+  int count_buf[1 << 12];      // Maximum (1 << 12) color levels.
+  int count_buf_8bit[1 << 8];  // Maximum (1 << 8) bins for hbd path.
+  if (seq_params->use_highbitdepth) {
+    av1_count_colors_highbd(src_u, src_stride, rows, cols,
+                            seq_params->bit_depth, count_buf, count_buf_8bit,
+                            &colors_threshold_u, &colors_u);
+    av1_count_colors_highbd(src_v, src_stride, rows, cols,
+                            seq_params->bit_depth, count_buf, count_buf_8bit,
+                            &colors_threshold_v, &colors_v);
+  } else {
+    av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u);
+    av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v);
+    colors_threshold_u = colors_u;
+    colors_threshold_v = colors_v;
+  }
+
+  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+  colors = colors_u > colors_v ? colors_u : colors_v;
+  colors_threshold = colors_threshold_u > colors_threshold_v
+                         ? colors_threshold_u
+                         : colors_threshold_v;
+  if (colors_threshold > 1 && colors_threshold <= 64) {
+    int r, c, n, i, j;
+    const int max_itr = 50;
+    int lb_u, ub_u, val_u;
+    int lb_v, ub_v, val_v;
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[2 * PALETTE_MAX_SIZE];
+
+    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+    if (seq_params->use_highbitdepth) {
+      lb_u = src_u16[0];
+      ub_u = src_u16[0];
+      lb_v = src_v16[0];
+      ub_v = src_v16[0];
+    } else {
+      lb_u = src_u[0];
+      ub_u = src_u[0];
+      lb_v = src_v[0];
+      ub_v = src_v[0];
+    }
+
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+        if (seq_params->use_highbitdepth) {
+          val_u = src_u16[r * src_stride + c];
+          val_v = src_v16[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        } else {
+          val_u = src_u[r * src_stride + c];
+          val_v = src_v[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        }
+        if (val_u < lb_u)
+          lb_u = val_u;
+        else if (val_u > ub_u)
+          ub_u = val_u;
+        if (val_v < lb_v)
+          lb_v = val_v;
+        else if (val_v > ub_v)
+          ub_v = val_v;
+      }
+    }
+
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+         --n) {
+      for (i = 0; i < n; ++i) {
+        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+      }
+      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+      optimize_palette_colors(color_cache, n_cache, n, 2, centroids,
+                              cpi->common.seq_params.bit_depth);
+      // Sort the U channel colors in ascending order.
+      for (i = 0; i < 2 * (n - 1); i += 2) {
+        int min_idx = i;
+        int min_val = centroids[i];
+        for (j = i + 2; j < 2 * n; j += 2)
+          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+        if (min_idx != i) {
+          int temp_u = centroids[i], temp_v = centroids[i + 1];
+          centroids[i] = centroids[min_idx];
+          centroids[i + 1] = centroids[min_idx + 1];
+          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+        }
+      }
+      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+      extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                               plane_block_height);
+      pmi->palette_size[1] = n;
+      for (i = 1; i < 3; ++i) {
+        for (j = 0; j < n; ++j) {
+          if (seq_params->use_highbitdepth)
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+          else
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel((int)centroids[j * 2 + i - 1]);
+        }
+      }
+
+      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+      if (tokenonly_rd_stats.rate == INT_MAX) continue;
+      this_rate = tokenonly_rd_stats.rate +
+                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *best_mbmi = *mbmi;
+        memcpy(best_palette_color_map, color_map,
+               plane_block_width * plane_block_height *
+                   sizeof(best_palette_color_map[0]));
+        *rate = this_rate;
+        *distortion = tokenonly_rd_stats.dist;
+        *rate_tokenonly = tokenonly_rd_stats.rate;
+        *skippable = tokenonly_rd_stats.skip_txfm;
+      }
+    }
+  }
+  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           plane_block_width * plane_block_height *
+               sizeof(best_palette_color_map[0]));
+  }
+}
+
+void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  int *const data = x->palette_buffer->kmeans_data_buf;
+  int centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  int r, c;
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      if (cpi->common.seq_params.use_highbitdepth) {
+        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+      } else {
+        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+      }
+    }
+  }
+
+  for (r = 1; r < 3; ++r) {
+    for (c = 0; c < pmi->palette_size[1]; ++c) {
+      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+    }
+  }
+
+  av1_calc_indices(data, centroids, color_map, rows * cols,
+                   pmi->palette_size[1], 2);
+  extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                           plane_block_height);
+}
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index 8b88c47..b1e1b14 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares functions used in palette search.
+ */
 #ifndef AOM_AV1_ENCODER_PALETTE_H_
 #define AOM_AV1_ENCODER_PALETTE_H_
 
@@ -18,11 +21,13 @@
 extern "C" {
 #endif
 
-#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
+struct AV1_COMP;
+struct PICK_MODE_CONTEXT;
+struct macroblock;
 
-void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data,
-                                             const int *centroids,
-                                             uint8_t *indices, int n, int k);
+/*!\cond */
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c
+
 void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
                                              const int *centroids,
                                              uint8_t *indices, int n, int k);
@@ -32,27 +37,61 @@
 void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
+/*!\endcond */
 
-// Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
-// calculate the centroid 'indices' for the data points.
+/*!\brief Calculates the cluster to which each data point belong.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    data               The data points whose cluster indices are
+ *                                  to be computed. The data layout is
+ *                                  NUM_DATA_POINTS X DATA_DIM.
+ * \param[in]    centroids          Pointer to the centroids. The data layout
+ *                                  is NUM_CENTROIDS X DATA_DIM.
+ * \param[in]    indices            Pointer to store the computed indices.
+ * \param[in]    n                  Number of data points.
+ * \param[in]    k                  Number of clusters.
+ * \param[in]    dim                Data dimension.
+ *
+ * \return Returns nothing, but saves each data's cluster index in indices.
+ */
 static INLINE void av1_calc_indices(const int *data, const int *centroids,
                                     uint8_t *indices, int n, int k, int dim) {
+  assert(n > 0);
+  assert(k > 0);
   if (dim == 1) {
-    AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
+    av1_calc_indices_dim1(data, centroids, indices, n, k);
   } else if (dim == 2) {
-    AV1_K_MEANS_RENAME(av1_calc_indices, 2)(data, centroids, indices, n, k);
+    av1_calc_indices_dim2_c(data, centroids, indices, n, k);
   } else {
     assert(0 && "Untemplated k means dimension");
   }
 }
 
-// Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of
-// dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get
-// updated 'centroids' and the centroid 'indices' for elements in 'data'.
-// Note: the output centroids are rounded off to nearest integers.
+/*!\brief Performs k-means cluster on the data.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    data               The data points to be clustered. The data
+ *                                  layout is NUM_DATA_POINTS X DATA_DIM.
+ * \param[in]    centroids          Pointer to store the computed centroids.
+ *                                  The data layout is
+ *                                  NUM_CENTROIDS X DATA_DIM.
+ * \param[in]    indices            Pointer to store the computed indices. For
+ *                                  each training data.
+ * \param[in]    n                  Number of data points.
+ * \param[in]    k                  Number of clusters.
+ * \param[in]    dim                Data dimension.
+ * \param[in]    max_itr            Maximum number of iterations to run.
+ *
+ * \return Returns nothing, but saves each cluster's centroid in centroids and
+ * each data's cluster index in indices.
+ *
+ * \attention The output centroids are rounded off to nearest integers.
+ */
 static INLINE void av1_k_means(const int *data, int *centroids,
                                uint8_t *indices, int n, int k, int dim,
                                int max_itr) {
+  assert(n > 0);
+  assert(k > 0);
   if (dim == 1) {
     AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr);
   } else if (dim == 2) {
@@ -62,33 +101,116 @@
   }
 }
 
-// Given a list of centroids, returns the unique number of centroids 'k', and
-// puts these unique centroids in first 'k' indices of 'centroids' array.
-// Ideally, the centroids should be rounded to integers before calling this
-// method.
+/*!\brief Removes duplicated centroid indices.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    centroids          A list of centroids index.
+ * \param[in]    num_centroids      Number of centroids.
+ *
+ * \return Returns the number of unique centroids and saves the unique centroids
+ * in beginning of the centroids array.
+ *
+ * \attention The centroids should be rounded to integers before calling this
+ * method.
+ */
 int av1_remove_duplicates(int *centroids, int num_centroids);
 
-// Given a color cache and a set of base colors, find if each cache color is
-// present in the base colors, record the binary results in "cache_color_found".
-// Record the colors that are not in the color cache in "out_cache_colors".
+/*!\brief Checks what colors are in the color cache.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    color_cache          A cache of colors.
+ * \param[in]    n_cache              Number of colors in the cache.
+ * \param[in]    colors               New base colors.
+ * \param[in]    n_colors             Number of new colors.
+ * \param[in]    cache_color_found    Stores what cached colors are presented in
+ *                                    colors.
+ * \param[in]    out_cache_colors     Stores what colors are not in the cache.
+ *
+ * \return Returns the number of colors that are not in cache. In addition,
+ * records whether each cache color is presented in colors in cache_color_found,
+ * and stores and stores the out of cache colors in out_cache_colors.
+ */
 int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
                           const uint16_t *colors, int n_colors,
                           uint8_t *cache_color_found, int *out_cache_colors);
 
-// Return the number of bits used to transmit each v palette color delta;
-// assign zero_count with the number of deltas being 0.
+/*!\brief Gets the rate cost for each delta-encoding v palette.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                  Struct that stores the palette mode info.
+ * \param[in]    bit_depth            Pixel bitdepth of the sequence.
+ * \param[in]    zero_count           Stores the number of zero deltas.
+ * \param[in]    min_bits             Minimum bits for the deltas. Sets to
+ *                                    bit_depth - 4.
+ *
+ * \return Returns the number of bits used to transmit each v palette color
+ * delta and assigns zero_count with the number of deltas being 0.
+ */
 int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
                                  int bit_depth, int *zero_count, int *min_bits);
 
-// Return the rate cost for transmitting luma palette color values.
+/*!\brief Gets the rate cost for transmitting luma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                  Struct that stores the palette mode info.
+ * \param[in]    color_cache          Color cache presented at the decoder.
+ * \param[in]    n_cache              Number of colors in the cache.
+ * \param[in]    bit_depth            Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitted the color map.
+ */
 int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
-                             uint16_t *color_cache, int n_cache, int bit_depth);
+                             const uint16_t *color_cache, int n_cache,
+                             int bit_depth);
 
-// Return the rate cost for transmitting chroma palette color values.
+/*!\brief Gets the rate cost for transmitting luma palette chroma values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                  Struct that stores the palette mode info.
+ * \param[in]    color_cache          Color cache presented at the decoder.
+ * \param[in]    n_cache              Number of colors in the cache.
+ * \param[in]    bit_depth            Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitted the color map.
+ */
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
-                              uint16_t *color_cache, int n_cache,
+                              const uint16_t *color_cache, int n_cache,
                               int bit_depth);
 
+/*!\brief Search for the best palette in the luma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sby(
+    const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
+    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+    int64_t *distortion, int *skippable, int *beat_best_rd,
+    struct PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map);
+
+/*!\brief Search for the best palette in the chroma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi,
+                                    struct macroblock *x, int dc_mode_cost,
+                                    uint8_t *best_palette_color_map,
+                                    MB_MODE_INFO *const best_mbmi,
+                                    int64_t *best_rd, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    int *skippable);
+
+/*!\brief Resets palette color map for chroma channels.
+ */
+void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
new file mode 100644
index 0000000..edb5ab4
--- /dev/null
+++ b/av1/encoder/partition_search.c
@@ -0,0 +1,4201 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/av1_ml_partition_models.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+                              FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+                              int blk_row, int blk_col,
+                              uint8_t allow_update_cdf) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+                                   xd->left_txfm_context + blk_row, mbmi->bsize,
+                                   tx_size);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+  assert(tx_size > TX_4X4);
+
+  if (depth == MAX_VARTX_DEPTH) {
+    // Don't add to counts in this case
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][0];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][1];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+    ++x->txfm_search_info.txb_split_count;
+
+    if (sub_txs == TX_4X4) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        int offsetr = row;
+        int offsetc = col;
+
+        update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+                          blk_col + offsetc, allow_update_cdf);
+      }
+    }
+  }
+}
+
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                      BLOCK_SIZE plane_bsize,
+                                      FRAME_COUNTS *td_counts,
+                                      uint8_t allow_update_cdf) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+                        allow_update_cdf);
+    }
+  }
+}
+
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+                             int blk_col) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  if (tx_size == plane_tx_size) {
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+
+  } else {
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+        set_txfm_context(xd, sub_txs, offsetr, offsetc);
+      }
+    }
+  }
+}
+
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                      MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      set_txfm_context(xd, max_tx_size, idy, idx);
+    }
+  }
+}
+
+static void update_zeromv_cnt(const AV1_COMP *const cpi,
+                              const MB_MODE_INFO *const mi, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MV mv = mi->mv[0].as_mv;
+  const int bw = mi_size_wide[bsize] >> 1;
+  const int bh = mi_size_high[bsize] >> 1;
+  const int xmis = AOMMIN((cm->mi_params.mi_cols - mi_col) >> 1, bw);
+  const int ymis = AOMMIN((cm->mi_params.mi_rows - mi_row) >> 1, bh);
+  const int block_index =
+      (mi_row >> 1) * (cm->mi_params.mi_cols >> 1) + (mi_col >> 1);
+  for (int y = 0; y < ymis; y++)
+    for (int x = 0; x < xmis; x++) {
+      // consec_zero_mv is in the scale of 8x8 blocks
+      const int map_offset = block_index + y * (cm->mi_params.mi_cols >> 1) + x;
+      if (mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) &&
+          mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+        if (abs(mv.row) < 10 && abs(mv.col) < 10) {
+          if (cpi->consec_zero_mv[map_offset] < 255)
+            cpi->consec_zero_mv[map_offset]++;
+        } else {
+          cpi->consec_zero_mv[map_offset] = 0;
+        }
+      }
+    }
+}
+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                              ThreadData *td, TokenExtra **t, RUN_TYPE dry_run,
+                              BLOCK_SIZE bsize, int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO **mi_4x4 = xd->mi;
+  MB_MODE_INFO *mbmi = mi_4x4[0];
+  const int seg_skip =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+  const int mis = cm->mi_params.mi_stride;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int is_inter = is_inter_block(mbmi);
+
+  // Initialize tx_mode and tx_size_search_method
+  TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  set_tx_size_search_method(
+      cm, &cpi->winner_mode_params, txfm_params,
+      cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  if (!is_inter) {
+    xd->cfl.store_y = store_cfl_required(cm, xd);
+    mbmi->skip_txfm = 1;
+    for (int plane = 0; plane < num_planes; ++plane) {
+      av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
+                                   cpi->optimize_seg_arr[mbmi->segment_id]);
+    }
+
+    // If there is at least one lossless segment, force the skip for intra
+    // block to be 0, in order to avoid the segment_id to be changed by in
+    // write_segment_id().
+    if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
+        cpi->enc_seg.has_lossless_segment)
+      mbmi->skip_txfm = 0;
+
+    xd->cfl.store_y = 0;
+    if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+      for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
+        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+          if (!dry_run) {
+            av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
+                                   PALETTE_MAP, tile_data->allow_update_cdf,
+                                   td->counts);
+          } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+            rate +=
+                av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
+          }
+        }
+      }
+    }
+
+    av1_update_intra_mb_txb_context(cpi, td, dry_run, bsize,
+                                    tile_data->allow_update_cdf);
+  } else {
+    int ref;
+    const int is_compound = has_second_ref(mbmi);
+
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      const YV12_BUFFER_CONFIG *cfg =
+          get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
+      assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+      av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+                           xd->block_ref_scale_factors[ref], num_planes);
+    }
+    int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd) ? 1 : 0;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                  start_plane, av1_num_planes(cm) - 1);
+    if (mbmi->motion_mode == OBMC_CAUSAL) {
+      assert(cpi->oxcf.motion_mode_cfg.enable_obmc);
+      av1_build_obmc_inter_predictors_sb(cm, xd);
+    }
+
+#if CONFIG_MISMATCH_DEBUG
+    if (dry_run == OUTPUT_ENABLED) {
+      for (int plane = 0; plane < num_planes; ++plane) {
+        const struct macroblockd_plane *pd = &xd->plane[plane];
+        int pixel_c, pixel_r;
+        mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+                        pd->subsampling_x, pd->subsampling_y);
+        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                                 pd->subsampling_y))
+          continue;
+        mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
+                                  cm->current_frame.order_hint, plane, pixel_c,
+                                  pixel_r, pd->width, pd->height,
+                                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+      }
+    }
+#else
+    (void)num_planes;
+#endif
+
+    av1_encode_sb(cpi, x, bsize, dry_run);
+    av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
+                          tile_data->allow_update_cdf);
+  }
+
+  if (!dry_run) {
+    if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1;
+    if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+        !xd->lossless[mbmi->segment_id] && mbmi->bsize > BLOCK_4X4 &&
+        !(is_inter && (mbmi->skip_txfm || seg_skip))) {
+      if (is_inter) {
+        tx_partition_count_update(cm, x, bsize, td->counts,
+                                  tile_data->allow_update_cdf);
+      } else {
+        if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
+          ++x->txfm_search_info.txb_split_count;
+        if (block_signals_txsize(bsize)) {
+          const int tx_size_ctx = get_tx_size_context(xd);
+          const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+          const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
+          const int max_depths = bsize_to_max_depth(bsize);
+
+          if (tile_data->allow_update_cdf)
+            update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+                       depth, max_depths + 1);
+#if CONFIG_ENTROPY_STATS
+          ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+        }
+      }
+      assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
+    } else {
+      int i, j;
+      TX_SIZE intra_tx_size;
+      // The new intra coding scheme requires no change of transform size
+      if (is_inter) {
+        if (xd->lossless[mbmi->segment_id]) {
+          intra_tx_size = TX_4X4;
+        } else {
+          intra_tx_size =
+              tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+        }
+      } else {
+        intra_tx_size = mbmi->tx_size;
+      }
+
+      for (j = 0; j < mi_height; j++)
+        for (i = 0; i < mi_width; i++)
+          if (mi_col + i < cm->mi_params.mi_cols &&
+              mi_row + j < cm->mi_params.mi_rows)
+            mi_4x4[mis * j + i]->tx_size = intra_tx_size;
+
+      if (intra_tx_size != max_txsize_rect_lookup[bsize])
+        ++x->txfm_search_info.txb_split_count;
+    }
+  }
+
+  if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+      block_signals_txsize(mbmi->bsize) && is_inter &&
+      !(mbmi->skip_txfm || seg_skip) && !xd->lossless[mbmi->segment_id]) {
+    if (dry_run) tx_partition_set_contexts(cm, xd, bsize);
+  } else {
+    TX_SIZE tx_size = mbmi->tx_size;
+    // The new intra coding scheme requires no change of transform size
+    if (is_inter) {
+      if (xd->lossless[mbmi->segment_id]) {
+        tx_size = TX_4X4;
+      } else {
+        tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+      }
+    } else {
+      tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+    }
+    mbmi->tx_size = tx_size;
+    set_txfm_ctxs(tx_size, xd->width, xd->height,
+                  (mbmi->skip_txfm || seg_skip) && is_inter_block(mbmi), xd);
+  }
+
+  if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) {
+    cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
+  }
+  if (!dry_run) {
+    if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 &&
+        cpi->sf.rt_sf.use_temporal_noise_estimate &&
+        (!cpi->use_svc ||
+         (cpi->use_svc &&
+          !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+          cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)))
+      update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize);
+  }
+}
+
+static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
+  x->rdmult = cpi->rd.RDMULT;
+
+  if (aq_mode != NO_AQ) {
+    assert(mbmi != NULL);
+    if (aq_mode == VARIANCE_AQ) {
+      if (cpi->vaq_refresh) {
+        const int energy = bsize <= BLOCK_16X16
+                               ? x->mb_energy
+                               : av1_log_block_var(cpi, x, bsize);
+        mbmi->segment_id = energy;
+      }
+      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+    } else if (aq_mode == COMPLEXITY_AQ) {
+      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+    } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+      // If segment is boosted, use rdmult for that segment.
+      if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+        x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+    }
+  }
+
+  const AV1_COMMON *const cm = &cpi->common;
+  if (cm->delta_q_info.delta_q_present_flag &&
+      !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    x->rdmult =
+        av1_get_hier_tpl_rdmult(cpi, x, bsize, mi_row, mi_col, x->rdmult);
+  }
+
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) {
+    av1_set_ssim_rdmult(cpi, &x->mv_costs, bsize, mi_row, mi_col, &x->rdmult);
+  }
+#if CONFIG_TUNE_VMAF
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+  }
+#endif
+}
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+                                        const TileInfo *const tile,
+                                        MACROBLOCK *const x, int mi_row,
+                                        int mi_col, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
+
+  set_entropy_context(xd, mi_row, mi_col, num_planes);
+  xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  // Set up destination pointers.
+  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes);
+
+  // Set up limit values for MV components.
+  // Mv beyond the range do not produce new/different prediction block.
+  av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+                    mi_width, cpi->oxcf.border_in_pixels);
+
+  set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+
+  // Set up source buffers.
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+
+  // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
+  xd->tile = *tile;
+}
+
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+                     MACROBLOCK *const x, int mi_row, int mi_col,
+                     BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+  // Setup segment ID.
+  mbmi = xd->mi[0];
+  mbmi->segment_id = 0;
+  if (seg->enabled) {
+    if (seg->enabled && !cpi->vaq_refresh) {
+      const uint8_t *const map =
+          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+      mbmi->segment_id =
+          map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0;
+    }
+    av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
+  }
+}
+
+/*!\brief Hybrid intra mode search.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This is top level function for mode srarch for intra frames in non-RD
+ * optimized case. Depending on speed feature, rate control mode and block
+ * size it calls either non-RD or RD optimized intra mode search
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+
+static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi,
+                                                MACROBLOCK *const x,
+                                                RD_STATS *rd_cost,
+                                                BLOCK_SIZE bsize,
+                                                PICK_MODE_CONTEXT *ctx) {
+  // TODO(jianj): Investigate the failure of ScalabilityTest in AOM_Q mode,
+  // which sets base_qindex to 0 on keyframe.
+  if (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+      !cpi->sf.rt_sf.hybrid_intra_pickmode || bsize < BLOCK_16X16)
+    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+  else
+    av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+/*!\brief Interface for AV1 mode search for an individual coding block
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level interface that
+ * directs the encoder to the proper mode search function, among these
+ * implemented for inter/intra + rd/non-rd + non-skip segment/skip segment.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+ *                              data/contexts/models for the tile during
+ *                              encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+ *                              the current macroblock
+ * \param[in]    mi_row         Row coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    mi_col         Column coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    rd_cost        Pointer to structure holding rate and distortion
+ *                              stats for the current block
+ * \param[in]    partition      Partition mode of the parent block
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Pointer to structure holding coding contexts and
+ *                              chosen modes for the current block
+ * \param[in]    best_rd        Upper bound of rd cost of a valid partition
+ *
+ * \return Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, the rate-distortion stats are stored in
+ * rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be
+ * signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                          MACROBLOCK *const x, int mi_row, int mi_col,
+                          RD_STATS *rd_cost, PARTITION_TYPE partition,
+                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                          RD_STATS best_rd) {
+  if (best_rd.rdcost < 0) {
+    ctx->rd_stats.rdcost = INT64_MAX;
+    ctx->rd_stats.skip_txfm = 0;
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
+
+  av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+
+  if (ctx->rd_mode_is_ready) {
+    assert(ctx->mic.bsize == bsize);
+    assert(ctx->mic.partition == partition);
+    rd_cost->rate = ctx->rd_stats.rate;
+    rd_cost->dist = ctx->rd_stats.dist;
+    rd_cost->rdcost = ctx->rd_stats.rdcost;
+    return;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+  int i;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+
+  aom_clear_system_state();
+
+  mbmi = xd->mi[0];
+  mbmi->bsize = bsize;
+  mbmi->partition = partition;
+
+#if CONFIG_RD_DEBUG
+  mbmi->mi_row = mi_row;
+  mbmi->mi_col = mi_col;
+#endif
+
+  // Sets up the tx_type_map buffer in MACROBLOCKD.
+  xd->tx_type_map = txfm_info->tx_type_map_;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+  ctx->skippable = 0;
+  // Set to zero to make sure we do not use the previous encoded frame stats
+  mbmi->skip_txfm = 0;
+  // Reset skip mode flag.
+  mbmi->skip_mode = 0;
+
+  if (is_cur_buf_hbd(xd)) {
+    x->source_variance = av1_high_get_sby_perpixel_variance(
+        cpi, &x->plane[0].src, bsize, xd->bd);
+  } else {
+    x->source_variance =
+        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+
+  // Initialize default mode evaluation params
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Set error per bit for current rdmult
+  av1_set_error_per_bit(&x->mv_costs, x->rdmult);
+  av1_rd_cost_update(x->rdmult, &best_rd);
+
+  // Find best coding mode & reconstruct the MB so it is available
+  // as a predictor for MBs that follow in the SB
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+  } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+                                         rd_cost, bsize, ctx, best_rd.rdcost);
+    } else {
+      av1_rd_pick_inter_mode(cpi, tile_data, x, rd_cost, bsize, ctx,
+                             best_rd.rdcost);
+    }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+  }
+
+  // Examine the resulting rate and for AQ mode 2 make a segment choice.
+  if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ &&
+      bsize >= BLOCK_16X16) {
+    av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+  }
+
+  x->rdmult = orig_rdmult;
+
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handle.
+  if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+  ctx->rd_stats.rate = rd_cost->rate;
+  ctx->rd_stats.dist = rd_cost->dist;
+  ctx->rd_stats.rdcost = rd_cost->rdcost;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_sb_modes_time);
+#endif
+}
+
+static void update_stats(const AV1_COMMON *const cm, ThreadData *td) {
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+  const int seg_ref_active =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
+  if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active &&
+      is_comp_ref_allowed(bsize)) {
+    const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+#if CONFIG_ENTROPY_STATS
+    td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
+#endif
+    update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+  }
+
+  if (!mbmi->skip_mode && !seg_ref_active) {
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
+#if CONFIG_ENTROPY_STATS
+    td->counts->skip_txfm[skip_ctx][mbmi->skip_txfm]++;
+#endif
+    update_cdf(fc->skip_txfm_cdfs[skip_ctx], mbmi->skip_txfm, 2);
+  }
+
+#if CONFIG_ENTROPY_STATS
+  // delta quant applies to both intra and inter
+  const int super_block_upper_left =
+      ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+      ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  if (delta_q_info->delta_q_present_flag &&
+      (bsize != cm->seq_params.sb_size || !mbmi->skip_txfm) &&
+      super_block_upper_left) {
+    const int dq = (mbmi->current_qindex - xd->current_base_qindex) /
+                   delta_q_info->delta_q_res;
+    const int absdq = abs(dq);
+    for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+      td->counts->delta_q[i][1]++;
+    }
+    if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+    if (delta_q_info->delta_lf_present_flag) {
+      if (delta_q_info->delta_lf_multi) {
+        const int frame_lf_count =
+            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+          const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
+                               delta_q_info->delta_lf_res;
+          const int abs_delta_lf = abs(delta_lf);
+          for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+            td->counts->delta_lf_multi[lf_id][i][1]++;
+          }
+          if (abs_delta_lf < DELTA_LF_SMALL)
+            td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
+        }
+      } else {
+        const int delta_lf =
+            (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+            delta_q_info->delta_lf_res;
+        const int abs_delta_lf = abs(delta_lf);
+        for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+          td->counts->delta_lf[i][1]++;
+        }
+        if (abs_delta_lf < DELTA_LF_SMALL)
+          td->counts->delta_lf[abs_delta_lf][0]++;
+      }
+    }
+  }
+#endif
+
+  if (!is_inter_block(mbmi)) {
+    av1_sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
+                        frame_is_intra_only(cm));
+  }
+
+  if (av1_allow_intrabc(cm)) {
+    update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2);
+#if CONFIG_ENTROPY_STATS
+    ++td->counts->intrabc[is_intrabc_block(mbmi)];
+#endif  // CONFIG_ENTROPY_STATS
+  }
+
+  if (frame_is_intra_only(cm) || mbmi->skip_mode) return;
+
+  FRAME_COUNTS *const counts = td->counts;
+  const int inter_block = is_inter_block(mbmi);
+
+  if (!seg_ref_active) {
+#if CONFIG_ENTROPY_STATS
+    counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#endif
+    update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+               inter_block, 2);
+    // If the segment reference feature is enabled we have only a single
+    // reference frame allowed for the segment so exclude it from
+    // the reference frame counts used to work out probabilities.
+    if (inter_block) {
+      const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+      if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+        if (is_comp_ref_allowed(bsize)) {
+#if CONFIG_ENTROPY_STATS
+          counts->comp_inter[av1_get_reference_mode_context(xd)]
+                            [has_second_ref(mbmi)]++;
+#endif  // CONFIG_ENTROPY_STATS
+          update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2);
+        }
+      }
+
+      if (has_second_ref(mbmi)) {
+        const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+                                                      ? UNIDIR_COMP_REFERENCE
+                                                      : BIDIR_COMP_REFERENCE;
+        update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+                   COMP_REFERENCE_TYPES);
+#if CONFIG_ENTROPY_STATS
+        counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+                             [comp_ref_type]++;
+#endif  // CONFIG_ENTROPY_STATS
+
+        if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+          const int bit = (ref0 == BWDREF_FRAME);
+          update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+          counts
+              ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (!bit) {
+            const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+            update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+            counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+                                [bit1]++;
+#endif  // CONFIG_ENTROPY_STATS
+            if (bit1) {
+              update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+                         ref1 == GOLDEN_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+              counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2]
+                                  [ref1 == GOLDEN_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+            }
+          }
+        } else {
+          const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+          update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+          counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (!bit) {
+            update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+                            [ref0 == LAST2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          } else {
+            update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
+                            [ref0 == GOLDEN_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+          update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME,
+                     2);
+#if CONFIG_ENTROPY_STATS
+          counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
+                             [ref1 == ALTREF_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (ref1 != ALTREF_FRAME) {
+            update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+                       ref1 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+                               [ref1 == ALTREF2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+        }
+      } else {
+        const int bit = (ref0 >= BWDREF_FRAME);
+        update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+        counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+        if (bit) {
+          assert(ref0 <= ALTREF_FRAME);
+          update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME,
+                     2);
+#if CONFIG_ENTROPY_STATS
+          counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+                            [ref0 == ALTREF_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (ref0 != ALTREF_FRAME) {
+            update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+                       ref0 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+            counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+                              [ref0 == ALTREF2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+        } else {
+          const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+          update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+          counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (!bit1) {
+            update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+                              [ref0 != LAST_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          } else {
+            update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+                              [ref0 != LAST3_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+        }
+      }
+
+      if (cm->seq_params.enable_interintra_compound &&
+          is_interintra_allowed(mbmi)) {
+        const int bsize_group = size_group_lookup[bsize];
+        if (mbmi->ref_frame[1] == INTRA_FRAME) {
+#if CONFIG_ENTROPY_STATS
+          counts->interintra[bsize_group][1]++;
+#endif
+          update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#if CONFIG_ENTROPY_STATS
+          counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+#endif
+          update_cdf(fc->interintra_mode_cdf[bsize_group],
+                     mbmi->interintra_mode, INTERINTRA_MODES);
+          if (av1_is_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
+            counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#endif
+            update_cdf(fc->wedge_interintra_cdf[bsize],
+                       mbmi->use_wedge_interintra, 2);
+            if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+              counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif
+              update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index,
+                         16);
+            }
+          }
+        } else {
+#if CONFIG_ENTROPY_STATS
+          counts->interintra[bsize_group][0]++;
+#endif
+          update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+        }
+      }
+
+      const MOTION_MODE motion_allowed =
+          cm->features.switchable_motion_mode
+              ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                    cm->features.allow_warped_motion)
+              : SIMPLE_TRANSLATION;
+      if (mbmi->ref_frame[1] != INTRA_FRAME) {
+        if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+          counts->motion_mode[bsize][mbmi->motion_mode]++;
+#endif
+          update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
+                     MOTION_MODES);
+        } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+          counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+#endif
+          update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2);
+        }
+      }
+
+      if (has_second_ref(mbmi)) {
+        assert(current_frame->reference_mode != SINGLE_REFERENCE &&
+               is_inter_compound_mode(mbmi->mode) &&
+               mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+        const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                         cm->seq_params.enable_masked_compound;
+        if (masked_compound_used) {
+          const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+          ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+#endif
+          update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+                     mbmi->comp_group_idx, 2);
+        }
+
+        if (mbmi->comp_group_idx == 0) {
+          const int comp_index_ctx = get_comp_index_context(cm, xd);
+#if CONFIG_ENTROPY_STATS
+          ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
+#endif
+          update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx,
+                     2);
+        } else {
+          assert(masked_compound_used);
+          if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+            ++counts->compound_type[bsize][mbmi->interinter_comp.type -
+                                           COMPOUND_WEDGE];
+#endif
+            update_cdf(fc->compound_type_cdf[bsize],
+                       mbmi->interinter_comp.type - COMPOUND_WEDGE,
+                       MASKED_COMPOUND_TYPES);
+          }
+        }
+      }
+      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+        if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+          counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+#endif
+          update_cdf(fc->wedge_idx_cdf[bsize],
+                     mbmi->interinter_comp.wedge_index, 16);
+        }
+      }
+    }
+  }
+
+  if (inter_block && cm->features.interp_filter == SWITCHABLE &&
+      mbmi->motion_mode != WARPED_CAUSAL &&
+      !is_nontrans_global_motion(xd, mbmi)) {
+    update_filter_type_cdf(xd, mbmi, cm->seq_params.enable_dual_filter);
+  }
+  if (inter_block &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    const PREDICTION_MODE mode = mbmi->mode;
+    const int16_t mode_ctx =
+        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+    if (has_second_ref(mbmi)) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#endif
+      update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+                 INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
+    } else {
+      av1_update_inter_mode_stats(fc, counts, mode, mode_ctx);
+    }
+
+    const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+    if (new_mv) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      for (int idx = 0; idx < 2; ++idx) {
+        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+          const uint8_t drl_ctx =
+              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2);
+#if CONFIG_ENTROPY_STATS
+          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif
+          if (mbmi->ref_mv_idx == idx) break;
+        }
+      }
+    }
+
+    if (have_nearmv_in_inter_mode(mbmi->mode)) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      for (int idx = 1; idx < 3; ++idx) {
+        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+          const uint8_t drl_ctx =
+              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2);
+#if CONFIG_ENTROPY_STATS
+          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+#endif
+          if (mbmi->ref_mv_idx == idx - 1) break;
+        }
+      }
+    }
+    if (have_newmv_in_inter_mode(mbmi->mode)) {
+      const int allow_hp = cm->features.cur_frame_force_integer_mv
+                               ? MV_SUBPEL_NONE
+                               : cm->features.allow_high_precision_mv;
+      if (new_mv) {
+        for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+          const int_mv ref_mv = av1_get_ref_mv(x, ref);
+          av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                              allow_hp);
+        }
+      } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) {
+        const int ref = 1;
+        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                            allow_hp);
+      } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) {
+        const int ref = 0;
+        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                            allow_hp);
+      }
+    }
+  }
+}
+
+/*!\brief Reconstructs an individual coding block
+ *
+ * \ingroup partition_search
+ * Reconstructs an individual coding block by applying the chosen modes stored
+ * in ctx, also updates mode counts and entropy models.
+ *
+ * \param[in]    cpi       Top-level encoder structure
+ * \param[in]    tile_data Pointer to struct holding adaptive
+ *                         data/contexts/models for the tile during encoding
+ * \param[in]    td        Pointer to thread data
+ * \param[in]    tp        Pointer to the starting token
+ * \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]    mi_col    Column coordinate of the block in a step size of
+ *                         MI_SIZE
+ * \param[in]    dry_run   A code indicating whether it is part of the final
+ *                         pass for reconstructing the superblock
+ * \param[in]    bsize     Current block size
+ * \param[in]    partition Partition mode of the parent block
+ * \param[in]    ctx       Pointer to structure holding coding contexts and the
+ *                         chosen modes for the current block
+ * \param[in]    rate      Pointer to the total rate for the current block
+ *
+ * \return Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd. Also, the chosen modes
+ * will be stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0].
+ */
+static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                     ThreadData *td, TokenExtra **tp, int mi_row, int mi_col,
+                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                     PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx,
+                     int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int subsampling_x = cm->seq_params.subsampling_x;
+  const int subsampling_y = cm->seq_params.subsampling_y;
+
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  const int origin_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->partition = partition;
+  av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+
+  if (!dry_run) {
+    set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+                   x->cb_offset[PLANE_TYPE_UV]);
+    assert(x->cb_offset[PLANE_TYPE_Y] <
+           (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
+    assert(x->cb_offset[PLANE_TYPE_UV] <
+           ((1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]) >>
+            (subsampling_x + subsampling_y)));
+  }
+
+  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+
+  if (!dry_run) {
+    update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+    if (bsize == cpi->common.seq_params.sb_size && mbmi->skip_txfm == 1 &&
+        cm->delta_q_info.delta_lf_present_flag) {
+      const int frame_lf_count =
+          av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+      for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+        mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+      mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+    }
+    if (has_second_ref(mbmi)) {
+      if (mbmi->compound_idx == 0 ||
+          mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+        mbmi->comp_group_idx = 0;
+      else
+        mbmi->comp_group_idx = 1;
+    }
+
+    // delta quant applies to both intra and inter
+    const int super_block_upper_left =
+        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+    const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+    if (delta_q_info->delta_q_present_flag &&
+        (bsize != cm->seq_params.sb_size || !mbmi->skip_txfm) &&
+        super_block_upper_left) {
+      xd->current_base_qindex = mbmi->current_qindex;
+      if (delta_q_info->delta_lf_present_flag) {
+        if (delta_q_info->delta_lf_multi) {
+          const int frame_lf_count =
+              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+          }
+        } else {
+          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+        }
+      }
+    }
+
+    RD_COUNTS *rdc = &td->rd_counts;
+    if (mbmi->skip_mode) {
+      assert(!frame_is_intra_only(cm));
+      rdc->skip_mode_used_flag = 1;
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+        assert(has_second_ref(mbmi));
+        rdc->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    } else {
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active) {
+        // If the segment reference feature is enabled we have only a single
+        // reference frame allowed for the segment so exclude it from
+        // the reference frame counts used to work out probabilities.
+        if (is_inter_block(mbmi)) {
+          av1_collect_neighbors_ref_counts(xd);
+          if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+            if (has_second_ref(mbmi)) {
+              // This flag is also updated for 4x4 blocks
+              rdc->compound_ref_used_flag = 1;
+            }
+          }
+          set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        }
+      }
+    }
+
+    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+
+    // Gather obmc and warped motion count to update the probability.
+    if ((!cpi->sf.inter_sf.disable_obmc &&
+         cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) ||
+        (cm->features.allow_warped_motion &&
+         cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) {
+      const int inter_block = is_inter_block(mbmi);
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active && inter_block) {
+        const MOTION_MODE motion_allowed =
+            cm->features.switchable_motion_mode
+                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                      cm->features.allow_warped_motion)
+                : SIMPLE_TRANSLATION;
+
+        if (mbmi->ref_frame[1] != INTRA_FRAME) {
+          if (motion_allowed >= OBMC_CAUSAL) {
+            td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+          }
+          if (motion_allowed == WARPED_CAUSAL) {
+            td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++;
+          }
+        }
+      }
+    }
+  }
+  // TODO(Ravi/Remya): Move this copy function to a better logical place
+  // This function will copy the best mode information from block
+  // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+  // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+  // bitstream preparation.
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  x->rdmult = origin_mult;
+}
+
+/*!\brief Reconstructs a partition (may contain multiple coding blocks)
+ *
+ * \ingroup partition_search
+ * Reconstructs a sub-partition of the superblock by applying the chosen modes
+ * and partition trees stored in pc_tree.
+ *
+ * \param[in]    cpi       Top-level encoder structure
+ * \param[in]    td        Pointer to thread data
+ * \param[in]    tile_data Pointer to struct holding adaptive
+ *                         data/contexts/models for the tile during encoding
+ * \param[in]    tp        Pointer to the starting token
+ * \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]    mi_col    Column coordinate of the block in a step size of
+ *                         MI_SIZE
+ * \param[in]    dry_run   A code indicating whether it is part of the final
+ *                         pass for reconstructing the superblock
+ * \param[in]    bsize     Current block size
+ * \param[in]    pc_tree   Pointer to the PC_TREE node storing the picked
+ *                         partitions and mode info for the current block
+ * \param[in]    rate      Pointer to the total rate for the current block
+ *
+ * \return Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd.
+ */
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+                      TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                      int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                      PC_TREE *pc_tree, int *rate) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int hbs = mi_size_wide[bsize] / 2;
+  const int is_partition_root = bsize >= BLOCK_8X8;
+  const int ctx = is_partition_root
+                      ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                      : -1;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  int quarter_step = mi_size_wide[bsize] / 4;
+  int i;
+  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+  if (subsize == BLOCK_INVALID) return;
+
+  if (!dry_run && ctx >= 0) {
+    const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
+    const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
+
+    if (has_rows && has_cols) {
+#if CONFIG_ENTROPY_STATS
+      td->counts->partition[ctx][partition]++;
+#endif
+
+      if (tile_data->allow_update_cdf) {
+        FRAME_CONTEXT *fc = xd->tile_ctx;
+        update_cdf(fc->partition_cdf[ctx], partition,
+                   partition_cdf_length(bsize));
+      }
+    }
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->none, rate);
+      break;
+    case PARTITION_VERT:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->vertical[0], rate);
+      if (mi_col + hbs < mi_params->mi_cols) {
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                 partition, pc_tree->vertical[1], rate);
+      }
+      break;
+    case PARTITION_HORZ:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontal[0], rate);
+      if (mi_row + hbs < mi_params->mi_rows) {
+        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                 partition, pc_tree->horizontal[1], rate);
+      }
+      break;
+    case PARTITION_SPLIT:
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+                pc_tree->split[0], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                pc_tree->split[1], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                pc_tree->split[2], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
+                subsize, pc_tree->split[3], rate);
+      break;
+
+    case PARTITION_HORZ_A:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+               partition, pc_tree->horizontala[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, pc_tree->horizontala[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontala[2], rate);
+      break;
+    case PARTITION_HORZ_B:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontalb[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, pc_tree->horizontalb[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+               bsize2, partition, pc_tree->horizontalb[2], rate);
+      break;
+    case PARTITION_VERT_A:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+               partition, pc_tree->verticala[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, pc_tree->verticala[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+               partition, pc_tree->verticala[2], rate);
+
+      break;
+    case PARTITION_VERT_B:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->verticalb[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, pc_tree->verticalb[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+               bsize2, partition, pc_tree->verticalb[2], rate);
+      break;
+    case PARTITION_HORZ_4:
+      for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+        encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
+                 partition, pc_tree->horizontal4[i], rate);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+        encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
+                 partition, pc_tree->vertical4[i], rate);
+      }
+      break;
+    default: assert(0 && "Invalid partition type."); break;
+  }
+
+  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+/*!\brief AV1 block partition search (partition estimation and partial search).
+*
+* \ingroup partition_search
+* Encode the block by applying pre-calculated partition patterns that are
+* represented by coding block sizes stored in the mbmi array. Minor partition
+* adjustments are tested and applied if they lead to lower rd costs. The
+* partition types are limited to a basic set: none, horz, vert, and split.
+*
+* \param[in]    cpi       Top-level encoder structure
+* \param[in]    td        Pointer to thread data
+* \param[in]    tile_data Pointer to struct holding adaptive
+data/contexts/models for the tile during encoding
+* \param[in]    mib       Array representing MB_MODE_INFO pointers for mi
+blocks starting from the first pixel of the current
+block
+* \param[in]    tp        Pointer to the starting token
+* \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+* \param[in]    mi_col    Column coordinate of the block in a step size of
+MI_SIZE
+* \param[in]    bsize     Current block size
+* \param[in]    rate      Pointer to the final rate for encoding the current
+block
+* \param[in]    dist      Pointer to the final distortion of the current block
+* \param[in]    do_recon  Whether the reconstruction function needs to be run,
+either for finalizing a superblock or providing
+reference for future sub-partitions
+* \param[in]    pc_tree   Pointer to the PC_TREE node holding the picked
+partitions and mode info for the current block
+*
+* \return Nothing is returned. The pc_tree struct is modified to store the
+* picked partition and modes. The rate and dist are also updated with those
+* corresponding to the best partition found.
+*/
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                          MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+                          int mi_col, BLOCK_SIZE bsize, int *rate,
+                          int64_t *dist, int do_recon, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
+  BLOCK_SIZE sub_subsize = BLOCK_4X4;
+  int splits_below = 0;
+  BLOCK_SIZE bs_type = mib[0]->bsize;
+
+  if (pc_tree->none == NULL) {
+    pc_tree->none = av1_alloc_pmc(cm, bsize, &td->shared_coeff_buf);
+  }
+  PICK_MODE_CONTEXT *ctx_none = pc_tree->none;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  av1_invalid_rd_stats(&last_part_rdc);
+  av1_invalid_rd_stats(&none_rdc);
+  av1_invalid_rd_stats(&chosen_rdc);
+  av1_invalid_rd_stats(&invalid_rdc);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+  }
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+      ((cpi->sf.part_sf.adjust_var_based_rd_partitioning == 2 &&
+        bsize <= BLOCK_32X32) ||
+       (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
+        cm->quant_params.base_qindex > 190 && bsize <= BLOCK_32X32 &&
+        !frame_is_intra_only(cm)))) {
+    // Check if any of the sub blocks are further split.
+    if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+      sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
+      splits_below = 1;
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+        int jj = i >> 1, ii = i & 0x01;
+        MB_MODE_INFO *this_mi = mib[jj * hbs * mi_params->mi_stride + ii * hbs];
+        if (this_mi && this_mi->bsize >= sub_subsize) {
+          splits_below = 0;
+        }
+      }
+    }
+
+    // If partition is not none try none unless each of the 4 splits are split
+    // even further..
+    if (partition != PARTITION_NONE && !splits_below &&
+        mi_row + hbs < mi_params->mi_rows &&
+        mi_col + hbs < mi_params->mi_cols) {
+      pc_tree->partitioning = PARTITION_NONE;
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+                    PARTITION_NONE, bsize, ctx_none, invalid_rdc);
+
+      if (none_rdc.rate < INT_MAX) {
+        none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+      }
+
+      av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      mib[0]->bsize = bs_type;
+      pc_tree->partitioning = partition;
+    }
+  }
+
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+    pc_tree->split[i]->index = i;
+  }
+  switch (partition) {
+    case PARTITION_NONE:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_NONE, bsize, ctx_none, invalid_rdc);
+      break;
+    case PARTITION_HORZ:
+      for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+        pc_tree->horizontal[i] =
+            av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+      }
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_HORZ, subsize, pc_tree->horizontal[0],
+                    invalid_rdc);
+      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+          mi_row + hbs < mi_params->mi_rows) {
+        RD_STATS tmp_rdc;
+        const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0];
+        av1_init_rd_stats(&tmp_rdc);
+        av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+                          NULL);
+        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+                      PARTITION_HORZ, subsize, pc_tree->horizontal[1],
+                      invalid_rdc);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_VERT:
+      for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+        pc_tree->vertical[i] =
+            av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+      }
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc);
+      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+          mi_col + hbs < mi_params->mi_cols) {
+        RD_STATS tmp_rdc;
+        const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0];
+        av1_init_rd_stats(&tmp_rdc);
+        av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+                          NULL);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+                      PARTITION_VERT, subsize,
+                      pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
+          none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) {
+        av1_invalid_rd_stats(&last_part_rdc);
+        break;
+      }
+      last_part_rdc.rate = 0;
+      last_part_rdc.dist = 0;
+      last_part_rdc.rdcost = 0;
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+        int x_idx = (i & 1) * hbs;
+        int y_idx = (i >> 1) * hbs;
+        int jj = i >> 1, ii = i & 0x01;
+        RD_STATS tmp_rdc;
+        if ((mi_row + y_idx >= mi_params->mi_rows) ||
+            (mi_col + x_idx >= mi_params->mi_cols))
+          continue;
+
+        av1_init_rd_stats(&tmp_rdc);
+        av1_rd_use_partition(
+            cpi, td, tile_data,
+            mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+            mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+            &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+      }
+      break;
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
+    default: assert(0); break;
+  }
+
+  if (last_part_rdc.rate < INT_MAX) {
+    last_part_rdc.rate += mode_costs->partition_cost[pl][partition];
+    last_part_rdc.rdcost =
+        RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+  }
+
+  if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+       cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
+      partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+      (mi_row + bs < mi_params->mi_rows ||
+       mi_row + hbs == mi_params->mi_rows) &&
+      (mi_col + bs < mi_params->mi_cols ||
+       mi_col + hbs == mi_params->mi_cols)) {
+    BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    chosen_rdc.rate = 0;
+    chosen_rdc.dist = 0;
+
+    av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+    pc_tree->partitioning = PARTITION_SPLIT;
+
+    // Split partition.
+    for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+      int x_idx = (i & 1) * hbs;
+      int y_idx = (i >> 1) * hbs;
+      RD_STATS tmp_rdc;
+
+      if ((mi_row + y_idx >= mi_params->mi_rows) ||
+          (mi_col + x_idx >= mi_params->mi_cols))
+        continue;
+
+      av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      pc_tree->split[i]->partitioning = PARTITION_NONE;
+      if (pc_tree->split[i]->none == NULL)
+        pc_tree->split[i]->none =
+            av1_alloc_pmc(cm, split_subsize, &td->shared_coeff_buf);
+      pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                    PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none,
+                    invalid_rdc);
+
+      av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+        av1_invalid_rd_stats(&chosen_rdc);
+        break;
+      }
+
+      chosen_rdc.rate += tmp_rdc.rate;
+      chosen_rdc.dist += tmp_rdc.dist;
+
+      if (i != SUB_PARTITIONS_SPLIT - 1)
+        encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+                  OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+      chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+    }
+    if (chosen_rdc.rate < INT_MAX) {
+      chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
+    }
+  }
+
+  // If last_part is better set the partitioning to that.
+  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+    mib[0]->bsize = bsize;
+    if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+    chosen_rdc = last_part_rdc;
+  }
+  // If none was better set the partitioning to that.
+  if (none_rdc.rdcost < chosen_rdc.rdcost) {
+    if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+    chosen_rdc = none_rdc;
+  }
+
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  // We must have chosen a partitioning and encoding or we'll fail later on.
+  // No other opportunities for success.
+  if (bsize == cm->seq_params.sb_size)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+  if (do_recon) {
+    if (bsize == cm->seq_params.sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      set_cb_offsets(x->cb_offset, 0, 0);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
+  x->rdmult = orig_rdmult;
+}
+
+static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                           ThreadData *td, TokenExtra **tp, int mi_row,
+                           int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                           PARTITION_TYPE partition,
+                           PICK_MODE_CONTEXT *const ctx, int *rate) {
+  TileInfo *const tile = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  const int origin_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->partition = partition;
+  // Nonrd pickmode does not currently support second/combined reference.
+  assert(!has_second_ref(mbmi));
+  av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+  const int subsampling_x = cpi->common.seq_params.subsampling_x;
+  const int subsampling_y = cpi->common.seq_params.subsampling_y;
+  if (!dry_run) {
+    set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+                   x->cb_offset[PLANE_TYPE_UV]);
+    assert(x->cb_offset[PLANE_TYPE_Y] <
+           (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
+    assert(x->cb_offset[PLANE_TYPE_UV] <
+           ((1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]) >>
+            (subsampling_x + subsampling_y)));
+  }
+  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+  if (!dry_run) {
+    update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+  }
+  // TODO(Ravi/Remya): Move this copy function to a better logical place
+  // This function will copy the best mode information from block
+  // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+  // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+  // bitstream preparation.
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  x->rdmult = origin_mult;
+}
+/*!\brief Top level function to pick block mode for non-RD optimized case
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level function that is
+ * used for non-RD optimized mode search (controlled by
+ * \c cpi->sf.rt_sf.use_nonrd_pick_mode). Depending on frame type it calls
+ * inter/skip/hybrid-intra mode search functions
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+ *                              data/contexts/models for the tile during
+ *                              encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+ *                              the current macroblock
+ * \param[in]    mi_row         Row coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    mi_col         Column coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    rd_cost        Pointer to structure holding rate and distortion
+ *                              stats for the current block
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Pointer to structure holding coding contexts and
+ *                              chosen modes for the current block
+ *
+ * \return Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, the rate-distortion stats are stored in
+ * rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be
+ * signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                                MACROBLOCK *const x, int mi_row, int mi_col,
+                                RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                PICK_MODE_CONTEXT *ctx) {
+  av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  int i;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+  aom_clear_system_state();
+  // Sets up the tx_type_map buffer in MACROBLOCKD.
+  xd->tx_type_map = txfm_info->tx_type_map_;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  if (is_cur_buf_hbd(xd)) {
+    x->source_variance = av1_high_get_sby_perpixel_variance(
+        cpi, &x->plane[0].src, bsize, xd->bd);
+  } else {
+    x->source_variance =
+        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Set error per bit for current rdmult
+  av1_set_error_per_bit(&x->mv_costs, x->rdmult);
+  // Find best coding mode & reconstruct the MB so it is available
+  // as a predictor for MBs that follow in the SB
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+    hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+  } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      RD_STATS invalid_rd;
+      av1_invalid_rd_stats(&invalid_rd);
+      // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip
+      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+                                         rd_cost, bsize, ctx,
+                                         invalid_rd.rdcost);
+    } else {
+      av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx);
+    }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+  }
+  x->rdmult = orig_rdmult;
+  ctx->rd_stats.rate = rd_cost->rate;
+  ctx->rd_stats.dist = rd_cost->dist;
+  ctx->rd_stats.rdcost = rd_cost->rdcost;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_sb_modes_time);
+#endif
+}
+
+/*!\brief AV1 block partition application (minimal RD search).
+*
+* \ingroup partition_search
+* \callgraph
+* \callergraph
+* Encode the block by applying pre-calculated partition patterns that are
+* represented by coding block sizes stored in the mbmi array. The only
+* partition adjustment allowed is merging leaf split nodes if it leads to a
+* lower rd cost. The partition types are limited to a basic set: none, horz,
+* vert, and split. This function is only used in the real-time mode.
+*
+* \param[in]    cpi       Top-level encoder structure
+* \param[in]    td        Pointer to thread data
+* \param[in]    tile_data Pointer to struct holding adaptive
+data/contexts/models for the tile during encoding
+* \param[in]    mib       Array representing MB_MODE_INFO pointers for mi
+blocks starting from the first pixel of the current
+block
+* \param[in]    tp        Pointer to the starting token
+* \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+* \param[in]    mi_col    Column coordinate of the block in a step size of
+MI_SIZE
+* \param[in]    bsize     Current block size
+* \param[in]    pc_tree   Pointer to the PC_TREE node holding the picked
+partitions and mode info for the current block
+*
+* \return Nothing is returned. The pc_tree struct is modified to store the
+* picked partition and modes.
+*/
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                             TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                             TokenExtra **tp, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  // Only square blocks from 8x8 to 128x128 are supported
+  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  assert(subsize <= BLOCK_LARGEST);
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+
+  RD_STATS dummy_cost;
+  av1_invalid_rd_stats(&dummy_cost);
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  // Initialize default mode evaluation params
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+  switch (partition) {
+    case PARTITION_NONE:
+      pc_tree->none = av1_alloc_pmc(cm, bsize, &td->shared_coeff_buf);
+      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_slipt_check(bsize) &&
+          !frame_is_intra_only(cm)) {
+        RD_STATS split_rdc, none_rdc, block_rdc;
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+
+        av1_init_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+
+        av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+                            pc_tree->none);
+        none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+          av1_invalid_rd_stats(&block_rdc);
+          const int x_idx = (i & 1) * hbs;
+          const int y_idx = (i >> 1) * hbs;
+          if (mi_row + y_idx >= mi_params->mi_rows ||
+              mi_col + x_idx >= mi_params->mi_cols)
+            continue;
+          xd->above_txfm_context =
+              cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+          xd->left_txfm_context =
+              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+          pc_tree->split[i]->partitioning = PARTITION_NONE;
+          pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                              &block_rdc, subsize, pc_tree->split[i]->none);
+          split_rdc.rate += block_rdc.rate;
+          split_rdc.dist += block_rdc.dist;
+
+          encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx,
+                         1, subsize, PARTITION_NONE, pc_tree->split[i]->none,
+                         NULL);
+        }
+        split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+        av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          mib[0]->bsize = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+                         partition, pc_tree->none, NULL);
+        } else {
+          mib[0]->bsize = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+            const int x_idx = (i & 1) * hbs;
+            const int y_idx = (i >> 1) * hbs;
+            if (mi_row + y_idx >= mi_params->mi_rows ||
+                mi_col + x_idx >= mi_params->mi_cols)
+              continue;
+            encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
+                           mi_col + x_idx, 0, subsize, PARTITION_NONE,
+                           pc_tree->split[i]->none, NULL);
+          }
+        }
+
+      } else {
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                            bsize, pc_tree->none);
+        encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+                       partition, pc_tree->none, NULL);
+      }
+      break;
+    case PARTITION_VERT:
+      for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+        pc_tree->vertical[i] =
+            av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+      }
+      pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                          subsize, pc_tree->vertical[0]);
+      encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+                     PARTITION_VERT, pc_tree->vertical[0], NULL);
+      if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col + hbs,
+                            &dummy_cost, subsize, pc_tree->vertical[1]);
+        encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
+                       PARTITION_VERT, pc_tree->vertical[1], NULL);
+      }
+      break;
+    case PARTITION_HORZ:
+      for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+        pc_tree->horizontal[i] =
+            av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+      }
+      pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                          subsize, pc_tree->horizontal[0]);
+      encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+                     PARTITION_HORZ, pc_tree->horizontal[0], NULL);
+
+      if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) {
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + hbs, mi_col,
+                            &dummy_cost, subsize, pc_tree->horizontal[1]);
+        encode_b_nonrd(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
+                       PARTITION_HORZ, pc_tree->horizontal[1], NULL);
+      }
+      break;
+    case PARTITION_SPLIT:
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+        pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+        pc_tree->split[i]->index = i;
+      }
+      if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
+          av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
+          !frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+        RD_STATS split_rdc, none_rdc;
+        av1_invalid_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+        av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        xd->above_txfm_context =
+            cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+        xd->left_txfm_context =
+            xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+        pc_tree->partitioning = PARTITION_NONE;
+        pc_tree->none = av1_alloc_pmc(cm, bsize, &td->shared_coeff_buf);
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+                            pc_tree->none);
+        none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode != 2 ||
+            none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) {
+          av1_init_rd_stats(&split_rdc);
+          for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+            RD_STATS block_rdc;
+            av1_invalid_rd_stats(&block_rdc);
+            int x_idx = (i & 1) * hbs;
+            int y_idx = (i >> 1) * hbs;
+            if ((mi_row + y_idx >= mi_params->mi_rows) ||
+                (mi_col + x_idx >= mi_params->mi_cols))
+              continue;
+            xd->above_txfm_context =
+                cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+            xd->left_txfm_context = xd->left_txfm_context_buffer +
+                                    ((mi_row + y_idx) & MAX_MIB_MASK);
+            if (pc_tree->split[i]->none == NULL)
+              pc_tree->split[i]->none =
+                  av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+            pc_tree->split[i]->partitioning = PARTITION_NONE;
+            pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx,
+                                mi_col + x_idx, &block_rdc, subsize,
+                                pc_tree->split[i]->none);
+            split_rdc.rate += block_rdc.rate;
+            split_rdc.dist += block_rdc.dist;
+
+            encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
+                           mi_col + x_idx, 1, subsize, PARTITION_NONE,
+                           pc_tree->split[i]->none, NULL);
+          }
+          av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+          split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+          split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+        }
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          mib[0]->bsize = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+                         partition, pc_tree->none, NULL);
+        } else {
+          mib[0]->bsize = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+            int x_idx = (i & 1) * hbs;
+            int y_idx = (i >> 1) * hbs;
+            if ((mi_row + y_idx >= mi_params->mi_rows) ||
+                (mi_col + x_idx >= mi_params->mi_cols))
+              continue;
+
+            if (pc_tree->split[i]->none == NULL)
+              pc_tree->split[i]->none =
+                  av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+            encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
+                           mi_col + x_idx, 0, subsize, PARTITION_NONE,
+                           pc_tree->split[i]->none, NULL);
+          }
+        }
+      } else {
+        for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+          int x_idx = (i & 1) * hbs;
+          int y_idx = (i >> 1) * hbs;
+          int jj = i >> 1, ii = i & 0x01;
+          if ((mi_row + y_idx >= mi_params->mi_rows) ||
+              (mi_col + x_idx >= mi_params->mi_cols))
+            continue;
+          av1_nonrd_use_partition(
+              cpi, td, tile_data,
+              mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+              mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]);
+        }
+      }
+      break;
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
+    default: assert(0); break;
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Try searching for an encoding for the given subblock. Returns zero if the
+// rdcost is already too high (to tell the caller not to bother searching for
+// encodings of further subblocks).
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TokenExtra **tp, int is_last,
+                           int mi_row, int mi_col, BLOCK_SIZE subsize,
+                           RD_STATS best_rdcost, RD_STATS *sum_rdc,
+                           PARTITION_TYPE partition,
+                           PICK_MODE_CONTEXT *this_ctx) {
+  MACROBLOCK *const x = &td->mb;
+  const int orig_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL);
+
+  av1_rd_cost_update(x->rdmult, &best_rdcost);
+
+  RD_STATS rdcost_remaining;
+  av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining);
+  RD_STATS this_rdc;
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition,
+                subsize, this_ctx, rdcost_remaining);
+
+  if (this_rdc.rate == INT_MAX) {
+    sum_rdc->rdcost = INT64_MAX;
+  } else {
+    sum_rdc->rate += this_rdc.rate;
+    sum_rdc->dist += this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, sum_rdc);
+  }
+
+  if (sum_rdc->rdcost >= best_rdcost.rdcost) {
+    x->rdmult = orig_mult;
+    return 0;
+  }
+
+  if (!is_last) {
+    av1_update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
+    encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+  }
+
+  x->rdmult = orig_mult;
+  return 1;
+}
+
+// Tests an AB partition, and updates the encoder status, the pick mode
+// contexts, the best rdcost, and the best partition.
+static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
+                               TileDataEnc *tile_data, TokenExtra **tp,
+                               PC_TREE *pc_tree, RD_STATS *best_rdc,
+                               PICK_MODE_CONTEXT *ctxs[SUB_PARTITIONS_AB],
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               PARTITION_TYPE partition,
+                               const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+                               const int ab_mi_pos[SUB_PARTITIONS_AB][2],
+                               const MB_MODE_INFO **mode_cache) {
+  MACROBLOCK *const x = &td->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  RD_STATS sum_rdc;
+  av1_init_rd_stats(&sum_rdc);
+  sum_rdc.rate = x->mode_costs.partition_cost[pl][partition];
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+  // Loop over sub-partitions in AB partition type.
+  for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+    if (mode_cache && mode_cache[i]) {
+      x->use_intermode_cache = 1;
+      x->intermode_cache = mode_cache[i];
+    }
+    const int mode_search_success =
+        rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1,
+                        ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i],
+                        *best_rdc, &sum_rdc, partition, ctxs[i]);
+    x->use_intermode_cache = 0;
+    x->intermode_cache = NULL;
+    if (!mode_search_success) {
+      return false;
+    }
+  }
+
+  av1_rd_cost_update(x->rdmult, &sum_rdc);
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+
+  *best_rdc = sum_rdc;
+  pc_tree->partitioning = partition;
+  return true;
+}
+
+// Initialize state variables of partition search used in
+// av1_rd_pick_partition().
+static void init_partition_search_state_params(
+    MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state,
+    int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+
+  // Initialization of block size related parameters.
+  blk_params->mi_step = mi_size_wide[bsize] / 2;
+  blk_params->mi_row = mi_row;
+  blk_params->mi_col = mi_col;
+  blk_params->mi_row_edge = mi_row + blk_params->mi_step;
+  blk_params->mi_col_edge = mi_col + blk_params->mi_step;
+  blk_params->width = block_size_wide[bsize];
+  blk_params->min_partition_size_1d =
+      block_size_wide[x->sb_enc.min_partition_size];
+  blk_params->subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  blk_params->split_bsize2 = blk_params->subsize;
+  blk_params->bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+  blk_params->bsize = bsize;
+
+  // Check if the partition corresponds to edge block.
+  blk_params->has_rows = (blk_params->mi_row_edge < mi_params->mi_rows);
+  blk_params->has_cols = (blk_params->mi_col_edge < mi_params->mi_cols);
+
+  // Update intra partitioning related info.
+  part_search_state->intra_part_info = &x->part_search_info;
+  // Prepare for segmentation CNN-based partitioning for intra-frame.
+  if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+    part_search_state->intra_part_info->quad_tree_idx = 0;
+    part_search_state->intra_part_info->cnn_output_valid = 0;
+  }
+
+  // Set partition plane context index.
+  part_search_state->pl_ctx_idx =
+      blk_params->bsize_at_least_8x8
+          ? partition_plane_context(xd, mi_row, mi_col, bsize)
+          : 0;
+
+  // Partition cost buffer update
+  ModeCosts *mode_costs = &x->mode_costs;
+  part_search_state->partition_cost =
+      mode_costs->partition_cost[part_search_state->pl_ctx_idx];
+
+  // Initialize HORZ and VERT win flags as true for all split partitions.
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+    part_search_state->split_part_rect_win[i].rect_part_win[HORZ] = true;
+    part_search_state->split_part_rect_win[i].rect_part_win[VERT] = true;
+  }
+
+  // Initialize the rd cost.
+  av1_init_rd_stats(&part_search_state->this_rdc);
+
+  // Initialize RD costs for partition types to 0.
+  part_search_state->none_rd = 0;
+  av1_zero(part_search_state->split_rd);
+  av1_zero(part_search_state->rect_part_rd);
+
+  // Initialize SPLIT partition to be not ready.
+  av1_zero(part_search_state->is_split_ctx_is_ready);
+  // Initialize HORZ and VERT partitions to be not ready.
+  av1_zero(part_search_state->is_rect_ctx_is_ready);
+
+  // Chroma subsampling.
+  part_search_state->ss_x = x->e_mbd.plane[1].subsampling_x;
+  part_search_state->ss_y = x->e_mbd.plane[1].subsampling_y;
+
+  // Initialize partition search flags to defaults.
+  part_search_state->terminate_partition_search = 0;
+  part_search_state->do_square_split = blk_params->bsize_at_least_8x8;
+  part_search_state->do_rectangular_split =
+      cpi->oxcf.part_cfg.enable_rect_partitions;
+  av1_zero(part_search_state->prune_rect_part);
+
+  // Initialize allowed partition types for the partition block.
+  part_search_state->partition_none_allowed =
+      blk_params->has_rows && blk_params->has_cols;
+  part_search_state->partition_rect_allowed[HORZ] =
+      blk_params->has_cols && blk_params->bsize_at_least_8x8 &&
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  part_search_state->partition_rect_allowed[VERT] =
+      blk_params->has_rows && blk_params->bsize_at_least_8x8 &&
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+
+  // Reset the flag indicating whether a partition leading to a rdcost lower
+  // than the bound best_rdc has been found.
+  part_search_state->found_best_partition = false;
+}
+
+// Override partition cost buffer for the edge blocks.
+static void set_partition_cost_for_edge_blk(
+    AV1_COMMON const *cm, PartitionSearchState *part_search_state) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0);
+  const aom_cdf_prob *partition_cdf =
+      cm->fc->partition_cdf[part_search_state->pl_ctx_idx];
+  const int max_cost = av1_cost_symbol(0);
+  for (PARTITION_TYPE i = 0; i < PARTITION_TYPES; ++i)
+    part_search_state->tmp_partition_cost[i] = max_cost;
+  if (blk_params.has_cols) {
+    // At the bottom, the two possibilities are HORZ and SPLIT.
+    aom_cdf_prob bot_cdf[2];
+    partition_gather_vert_alike(bot_cdf, partition_cdf, blk_params.bsize);
+    static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+    av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, bot_cdf,
+                             bot_inv_map);
+  } else if (blk_params.has_rows) {
+    // At the right, the two possibilities are VERT and SPLIT.
+    aom_cdf_prob rhs_cdf[2];
+    partition_gather_horz_alike(rhs_cdf, partition_cdf, blk_params.bsize);
+    static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+    av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, rhs_cdf,
+                             rhs_inv_map);
+  } else {
+    // At the bottom right, we always split.
+    part_search_state->tmp_partition_cost[PARTITION_SPLIT] = 0;
+  }
+  // Override the partition cost buffer.
+  part_search_state->partition_cost = part_search_state->tmp_partition_cost;
+}
+
+// Reset the partition search state flags when
+// must_find_valid_partition is equal to 1.
+static AOM_INLINE void reset_part_limitations(
+    AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int is_rect_part_allowed =
+      blk_params.bsize_at_least_8x8 &&
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      (blk_params.width > blk_params.min_partition_size_1d);
+  part_search_state->do_square_split =
+      blk_params.bsize_at_least_8x8 &&
+      (blk_params.width > blk_params.min_partition_size_1d);
+  part_search_state->partition_none_allowed =
+      blk_params.has_rows && blk_params.has_cols &&
+      (blk_params.width >= blk_params.min_partition_size_1d);
+  part_search_state->partition_rect_allowed[HORZ] =
+      blk_params.has_cols && is_rect_part_allowed &&
+      get_plane_block_size(
+          get_partition_subsize(blk_params.bsize, PARTITION_HORZ),
+          part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+  part_search_state->partition_rect_allowed[VERT] =
+      blk_params.has_rows && is_rect_part_allowed &&
+      get_plane_block_size(
+          get_partition_subsize(blk_params.bsize, PARTITION_VERT),
+          part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+  part_search_state->terminate_partition_search = 0;
+}
+
+// Rectangular partitions evaluation at sub-block level.
+static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                                   MACROBLOCK *x,
+                                   PICK_MODE_CONTEXT *cur_partition_ctx,
+                                   PartitionSearchState *part_search_state,
+                                   RD_STATS *best_rdc, const int idx,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                   PARTITION_TYPE partition_type) {
+  // Obtain the remainder from the best rd cost
+  // for further processing of partition.
+  RD_STATS best_remain_rdcost;
+  av1_rd_stats_subtraction(x->rdmult, best_rdc, &part_search_state->sum_rdc,
+                           &best_remain_rdcost);
+
+  // Obtain the best mode for the partition sub-block.
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &part_search_state->this_rdc,
+                partition_type, bsize, cur_partition_ctx, best_remain_rdcost);
+  av1_rd_cost_update(x->rdmult, &part_search_state->this_rdc);
+
+  // Update the partition rd cost with the current sub-block rd.
+  if (part_search_state->this_rdc.rate == INT_MAX) {
+    part_search_state->sum_rdc.rdcost = INT64_MAX;
+  } else {
+    part_search_state->sum_rdc.rate += part_search_state->this_rdc.rate;
+    part_search_state->sum_rdc.dist += part_search_state->this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+  }
+  const RECT_PART_TYPE rect_part =
+      partition_type == PARTITION_HORZ ? HORZ : VERT;
+  part_search_state->rect_part_rd[rect_part][idx] =
+      part_search_state->this_rdc.rdcost;
+}
+
+typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+// Checks if HORZ / VERT partition search is allowed.
+static AOM_INLINE int is_rect_part_allowed(
+    const AV1_COMP *cpi, PartitionSearchState *part_search_state,
+    active_edge_info *active_edge, RECT_PART_TYPE rect_part, const int mi_pos) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int is_part_allowed =
+      (!part_search_state->terminate_partition_search &&
+       part_search_state->partition_rect_allowed[rect_part] &&
+       !part_search_state->prune_rect_part[rect_part] &&
+       (part_search_state->do_rectangular_split ||
+        active_edge[rect_part](cpi, mi_pos, blk_params.mi_step)));
+  return is_part_allowed;
+}
+
+static void rectangular_partition_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+    RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    RD_RECT_PART_WIN_INFO *rect_part_win_info) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS *sum_rdc = &part_search_state->sum_rdc;
+  const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
+                                                    PARTITION_VERT };
+
+  // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row postion of
+  //                                           HORZ and VERT partition types.
+  // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][1]: mi_col postion of
+  //                                           HORZ and VERT partition types.
+  const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2] = {
+    { { blk_params.mi_row, blk_params.mi_col },
+      { blk_params.mi_row_edge, blk_params.mi_col } },
+    { { blk_params.mi_row, blk_params.mi_col },
+      { blk_params.mi_row, blk_params.mi_col_edge } }
+  };
+
+  // Initialize active edge_type function pointer
+  // for HOZR and VERT partition types.
+  active_edge_info active_edge_type[NUM_RECT_PARTS] = { av1_active_h_edge,
+                                                        av1_active_v_edge };
+
+  // Indicates edge blocks for HORZ and VERT partition types.
+  const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows,
+                                                  blk_params.has_cols };
+
+  // Initialize pc tree context for HORZ and VERT partition types.
+  PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = {
+    { &pc_tree->horizontal[0], &pc_tree->horizontal[1] },
+    { &pc_tree->vertical[0], &pc_tree->vertical[1] }
+  };
+
+  // Loop over rectangular partition types.
+  for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) {
+    assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                   !part_search_state->partition_rect_allowed[i]));
+
+    // Check if the HORZ / VERT partition search is to be performed.
+    if (!is_rect_part_allowed(cpi, part_search_state, active_edge_type, i,
+                              mi_pos_rect[i][0][i]))
+      continue;
+
+    // Sub-partition idx.
+    int sub_part_idx = 0;
+    PARTITION_TYPE partition_type = rect_partition_type[i];
+    blk_params.subsize =
+        get_partition_subsize(blk_params.bsize, partition_type);
+    assert(blk_params.subsize <= BLOCK_LARGEST);
+    av1_init_rd_stats(sum_rdc);
+    for (int j = 0; j < SUB_PARTITIONS_RECT; j++) {
+      if (cur_ctx[i][j][0] == NULL) {
+        cur_ctx[i][j][0] =
+            av1_alloc_pmc(cm, blk_params.subsize, &td->shared_coeff_buf);
+      }
+    }
+    sum_rdc->rate = part_search_state->partition_cost[partition_type];
+    sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_rdc.rdcost - sum_rdc->rdcost >= 0) {
+      partition_attempts[partition_type] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+
+    // First sub-partition evaluation in HORZ / VERT partition type.
+    rd_pick_rect_partition(
+        cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+        best_rdc, 0, mi_pos_rect[i][sub_part_idx][0],
+        mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+
+    // Start of second sub-partition evaluation.
+    // Evaluate second sub-partition if the first sub-partition cost
+    // is less than the best cost and if it is not an edge block.
+    if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[i]) {
+      const MB_MODE_INFO *const mbmi = &cur_ctx[i][sub_part_idx][0]->mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted.
+      if (pmi->palette_size[PLANE_TYPE_Y] == 0 &&
+          pmi->palette_size[PLANE_TYPE_UV] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED)
+          part_search_state->is_rect_ctx_is_ready[i] = 1;
+      }
+      av1_update_state(cpi, td, cur_ctx[i][sub_part_idx][0], blk_params.mi_row,
+                       blk_params.mi_col, blk_params.subsize, DRY_RUN_NORMAL);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
+                        blk_params.subsize, NULL);
+
+      // Second sub-partition evaluation in HORZ / VERT partition type.
+      sub_part_idx = 1;
+      rd_pick_rect_partition(
+          cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+          best_rdc, 1, mi_pos_rect[i][sub_part_idx][0],
+          mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[partition_type] += time;
+      partition_timer_on = 0;
+    }
+#endif
+    // Update HORZ / VERT best partition.
+    if (sum_rdc->rdcost < best_rdc->rdcost) {
+      sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, sum_rdc->dist);
+      if (sum_rdc->rdcost < best_rdc->rdcost) {
+        *best_rdc = *sum_rdc;
+        part_search_state->found_best_partition = true;
+        pc_tree->partitioning = partition_type;
+      }
+    } else {
+      // Update HORZ / VERT win flag.
+      if (rect_part_win_info != NULL)
+        rect_part_win_info->rect_part_win[i] = false;
+    }
+    av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+                        blk_params.bsize, av1_num_planes(cm));
+  }
+}
+
+// AB partition type evaluation.
+static void rd_pick_ab_part(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB],
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+    const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type,
+    const MB_MODE_INFO **mode_cache) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  {
+    RD_STATS tmp_sum_rdc;
+    av1_init_rd_stats(&tmp_sum_rdc);
+    tmp_sum_rdc.rate =
+        x->partition_cost[part_search_state->pl_ctx_idx][part_type];
+    tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+    if (best_rdc->rdcost - tmp_sum_rdc.rdcost >= 0) {
+      partition_attempts[part_type] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+  }
+#endif
+
+  // Test this partition and update the best partition.
+  part_search_state->found_best_partition |= rd_test_partition3(
+      cpi, td, tile_data, tp, pc_tree, best_rdc, dst_ctxs, mi_row, mi_col,
+      bsize, part_type, ab_subsize, ab_mi_pos, mode_cache);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (partition_timer_on) {
+    aom_usec_timer_mark(&partition_timer);
+    int64_t time = aom_usec_timer_elapsed(&partition_timer);
+    partition_times[part_type] += time;
+    partition_timer_on = 0;
+  }
+#endif
+  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// Check if AB partitions search is allowed.
+static AOM_INLINE int is_ab_part_allowed(
+    PartitionSearchState *part_search_state,
+    const int ab_partitions_allowed[NUM_AB_PARTS], const int ab_part_type) {
+  const int is_horz_ab = (ab_part_type >> 1);
+  const int is_part_allowed =
+      (!part_search_state->terminate_partition_search &&
+       part_search_state->partition_rect_allowed[is_horz_ab] &&
+       ab_partitions_allowed[ab_part_type]);
+  return is_part_allowed;
+}
+
+// Set mode search context.
+static AOM_INLINE void set_mode_search_ctx(
+    PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2],
+    PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) {
+  mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0];
+  mode_srch_ctx[VERT_B][0] = &pc_tree->vertical[0];
+
+  if (is_ctx_ready[HORZ_A][0])
+    mode_srch_ctx[HORZ_A][0] = &pc_tree->split[0]->none;
+
+  if (is_ctx_ready[VERT_A][0])
+    mode_srch_ctx[VERT_A][0] = &pc_tree->split[0]->none;
+
+  if (is_ctx_ready[HORZ_A][1])
+    mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none;
+}
+
+static AOM_INLINE void copy_partition_mode_from_mode_context(
+    const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) {
+  if (ctx && ctx->rd_stats.rate < INT_MAX) {
+    *dst_mode = &ctx->mic;
+  } else {
+    *dst_mode = NULL;
+  }
+}
+
+static AOM_INLINE void copy_partition_mode_from_pc_tree(
+    const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) {
+  if (pc_tree) {
+    copy_partition_mode_from_mode_context(dst_mode, pc_tree->none);
+  } else {
+    *dst_mode = NULL;
+  }
+}
+
+static AOM_INLINE void set_mode_cache_for_partition_ab(
+    const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree,
+    AB_PART_TYPE ab_part_type) {
+  switch (ab_part_type) {
+    case HORZ_A:
+      copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+      copy_partition_mode_from_mode_context(&mode_cache[2],
+                                            pc_tree->horizontal[1]);
+      break;
+    case HORZ_B:
+      copy_partition_mode_from_mode_context(&mode_cache[0],
+                                            pc_tree->horizontal[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+      copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+      break;
+    case VERT_A:
+      copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+      copy_partition_mode_from_mode_context(&mode_cache[2],
+                                            pc_tree->vertical[1]);
+      break;
+    case VERT_B:
+      copy_partition_mode_from_mode_context(&mode_cache[0],
+                                            pc_tree->vertical[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+      copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+      break;
+    default: assert(0 && "Invalid ab partition type!\n");
+  }
+}
+
+// AB Partitions type search.
+static void ab_partitions_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PC_TREE *pc_tree, PartitionSearchState *part_search_state,
+    RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info,
+    int pb_source_variance, int ext_partition_allowed) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+
+  int ab_partitions_allowed[NUM_AB_PARTS] = { 1, 1, 1, 1 };
+  // Prune AB partitions
+  av1_prune_ab_partitions(
+      cpi, x, pc_tree, bsize, pb_source_variance, best_rdc->rdcost,
+      part_search_state->rect_part_rd, part_search_state->split_rd,
+      rect_part_win_info, ext_partition_allowed,
+      part_search_state->partition_rect_allowed[HORZ],
+      part_search_state->partition_rect_allowed[VERT],
+      &ab_partitions_allowed[HORZ_A], &ab_partitions_allowed[HORZ_B],
+      &ab_partitions_allowed[VERT_A], &ab_partitions_allowed[VERT_B]);
+
+  // Flags to indicate whether the mode search is done.
+  const int is_ctx_ready[NUM_AB_PARTS][2] = {
+    { part_search_state->is_split_ctx_is_ready[0],
+      part_search_state->is_split_ctx_is_ready[1] },
+    { part_search_state->is_rect_ctx_is_ready[HORZ], 0 },
+    { part_search_state->is_split_ctx_is_ready[0], 0 },
+    { part_search_state->is_rect_ctx_is_ready[VERT], 0 }
+  };
+
+  // Current partition context.
+  PICK_MODE_CONTEXT **cur_part_ctxs[NUM_AB_PARTS] = { pc_tree->horizontala,
+                                                      pc_tree->horizontalb,
+                                                      pc_tree->verticala,
+                                                      pc_tree->verticalb };
+
+  // Context of already evaluted partition types.
+  PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2];
+  // Set context of already evaluted partition types.
+  set_mode_search_ctx(pc_tree, is_ctx_ready, mode_srch_ctx);
+
+  // Array of sub-partition size of AB partition types.
+  const BLOCK_SIZE ab_subsize[NUM_AB_PARTS][SUB_PARTITIONS_AB] = {
+    { blk_params.split_bsize2, blk_params.split_bsize2,
+      get_partition_subsize(bsize, PARTITION_HORZ_A) },
+    { get_partition_subsize(bsize, PARTITION_HORZ_B), blk_params.split_bsize2,
+      blk_params.split_bsize2 },
+    { blk_params.split_bsize2, blk_params.split_bsize2,
+      get_partition_subsize(bsize, PARTITION_VERT_A) },
+    { get_partition_subsize(bsize, PARTITION_VERT_B), blk_params.split_bsize2,
+      blk_params.split_bsize2 }
+  };
+
+  // Array of mi_row, mi_col positions corresponds to each sub-partition in AB
+  // partition types.
+  const int ab_mi_pos[NUM_AB_PARTS][SUB_PARTITIONS_AB][2] = {
+    { { mi_row, mi_col },
+      { mi_row, blk_params.mi_col_edge },
+      { blk_params.mi_row_edge, mi_col } },
+    { { mi_row, mi_col },
+      { blk_params.mi_row_edge, mi_col },
+      { blk_params.mi_row_edge, blk_params.mi_col_edge } },
+    { { mi_row, mi_col },
+      { blk_params.mi_row_edge, mi_col },
+      { mi_row, blk_params.mi_col_edge } },
+    { { mi_row, mi_col },
+      { mi_row, blk_params.mi_col_edge },
+      { blk_params.mi_row_edge, blk_params.mi_col_edge } }
+  };
+
+  // Loop over AB partition types.
+  for (AB_PART_TYPE ab_part_type = 0; ab_part_type < NUM_AB_PARTS;
+       ab_part_type++) {
+    const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A;
+
+    // Check if the AB partition search is to be performed.
+    if (!is_ab_part_allowed(part_search_state, ab_partitions_allowed,
+                            ab_part_type))
+      continue;
+
+    blk_params.subsize = get_partition_subsize(bsize, part_type);
+    for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+      // Set AB partition context.
+      cur_part_ctxs[ab_part_type][i] =
+          av1_alloc_pmc(cm, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+      // Set mode as not ready.
+      cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
+    }
+
+    // We can copy directly the mode search results if we have already searched
+    // the current block and the contexts match.
+    if (is_ctx_ready[ab_part_type][0]) {
+      av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
+                            mode_srch_ctx[ab_part_type][0][0]);
+      cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
+      cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
+      if (is_ctx_ready[ab_part_type][1]) {
+        av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
+                              mode_srch_ctx[ab_part_type][1][0]);
+        cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
+        cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+      }
+    }
+
+    // Even if the contexts don't match, we can still speed up by reusing the
+    // previous prediction mode.
+    const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL };
+    if (cpi->sf.inter_sf.reuse_best_prediction_for_part_ab) {
+      set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type);
+    }
+
+    // Evaluation of AB partition type.
+    rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree,
+                    cur_part_ctxs[ab_part_type], part_search_state, best_rdc,
+                    ab_subsize[ab_part_type], ab_mi_pos[ab_part_type],
+                    part_type, mode_cache);
+  }
+}
+
+// Set mi positions for HORZ4 / VERT4 sub-block partitions.
+static void set_mi_pos_partition4(const int inc_step[NUM_PART4_TYPES],
+                                  int mi_pos[SUB_PARTITIONS_PART4][2],
+                                  const int mi_row, const int mi_col) {
+  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; i++) {
+    mi_pos[i][0] = mi_row + i * inc_step[HORZ4];
+    mi_pos[i][1] = mi_col + i * inc_step[VERT4];
+  }
+}
+
+// Set context and RD cost for HORZ4 / VERT4 partition types.
+static void set_4_part_ctx_and_rdcost(
+    MACROBLOCK *x, const AV1_COMMON *const cm, ThreadData *td,
+    PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+    PartitionSearchState *part_search_state, PARTITION_TYPE partition_type,
+    BLOCK_SIZE bsize) {
+  // Initialize sum_rdc RD cost structure.
+  av1_init_rd_stats(&part_search_state->sum_rdc);
+  const int subsize = get_partition_subsize(bsize, partition_type);
+  part_search_state->sum_rdc.rate =
+      part_search_state->partition_cost[partition_type];
+  part_search_state->sum_rdc.rdcost =
+      RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0);
+  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i)
+    cur_part_ctx[i] = av1_alloc_pmc(cm, subsize, &td->shared_coeff_buf);
+}
+
+// Partition search of HORZ4 / VERT4 partition types.
+static void rd_pick_4partition(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PC_TREE *pc_tree, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    const int inc_step[NUM_PART4_TYPES], PARTITION_TYPE partition_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  // mi positions needed for HORZ4 and VERT4 partition types.
+  int mi_pos_check[NUM_PART4_TYPES] = { cm->mi_params.mi_rows,
+                                        cm->mi_params.mi_cols };
+  const PART4_TYPES part4_idx = (partition_type != PARTITION_HORZ_4);
+  int mi_pos[SUB_PARTITIONS_PART4][2];
+
+  blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type);
+  // Set partition context and RD cost.
+  set_4_part_ctx_and_rdcost(x, cm, td, cur_part_ctx, part_search_state,
+                            partition_type, blk_params.bsize);
+  // Set mi positions for sub-block sizes.
+  set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col);
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (best_rdc.rdcost - part_search_state->sum_rdc.rdcost >= 0) {
+    partition_attempts[partition_type] += 1;
+    aom_usec_timer_start(&partition_timer);
+    partition_timer_on = 1;
+  }
+#endif
+  // Loop over sub-block partitions.
+  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    if (i > 0 && mi_pos[i][part4_idx] >= mi_pos_check[part4_idx]) break;
+
+    // Sub-block evaluation of Horz4 / Vert4 partition type.
+    cur_part_ctx[i]->rd_mode_is_ready = 0;
+    if (!rd_try_subblock(
+            cpi, td, tile_data, tp, (i == SUB_PARTITIONS_PART4 - 1),
+            mi_pos[i][0], mi_pos[i][1], blk_params.subsize, *best_rdc,
+            &part_search_state->sum_rdc, partition_type, cur_part_ctx[i])) {
+      av1_invalid_rd_stats(&part_search_state->sum_rdc);
+      break;
+    }
+  }
+
+  // Calculate the total cost and update the best partition.
+  av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+  if (part_search_state->sum_rdc.rdcost < best_rdc->rdcost) {
+    *best_rdc = part_search_state->sum_rdc;
+    part_search_state->found_best_partition = true;
+    pc_tree->partitioning = partition_type;
+  }
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (partition_timer_on) {
+    aom_usec_timer_mark(&partition_timer);
+    int64_t time = aom_usec_timer_elapsed(&partition_timer);
+    partition_times[partition_type] += time;
+    partition_timer_on = 0;
+  }
+#endif
+  av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+                      blk_params.bsize, av1_num_planes(cm));
+}
+
+// Prune 4-way partitions based on the number of horz/vert wins
+// in the current block and sub-blocks in PARTITION_SPLIT.
+static void prune_4_partition_using_split_info(
+    AV1_COMP *const cpi, MACROBLOCK *x, PartitionSearchState *part_search_state,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  PART4_TYPES cur_part[NUM_PART4_TYPES] = { HORZ4, VERT4 };
+  // Count of child blocks in which HORZ or VERT partition has won
+  int num_child_rect_win[NUM_RECT_PARTS] = { 0, 0 };
+  // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of
+  // split partiitons.
+  // Conservative pruning for high quantizers.
+  const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
+
+  for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) {
+    if (!(cpi->sf.part_sf.prune_4_partition_using_split_info &&
+          part4_search_allowed[cur_part[i]]))
+      continue;
+    // Loop over split partitions.
+    // Get rectangular partitions winner info of split partitions.
+    for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; idx++)
+      num_child_rect_win[i] +=
+          (part_search_state->split_part_rect_win[idx].rect_part_win[i]) ? 1
+                                                                         : 0;
+    if (num_child_rect_win[i] < num_win_thresh) {
+      part4_search_allowed[cur_part[i]] = 0;
+    }
+  }
+}
+
+// Prune 4-way partition search.
+static void prune_4_way_partition_search(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    int pb_source_variance, int ext_partition_allowed,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+  PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
+                                               PARTITION_VERT_4 };
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+  // PARTITION_VERT_4 for this block. This is almost the same as
+  // ext_partition_allowed, except that we don't allow 128x32 or 32x128
+  // blocks, so we require that bsize is not BLOCK_128X128.
+  const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+
+  for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
+    part4_search_allowed[i] =
+        partition4_allowed && part_search_state->partition_rect_allowed[i] &&
+        get_plane_block_size(get_partition_subsize(bsize, cur_part[i]),
+                             part_search_state->ss_x,
+                             part_search_state->ss_y) != BLOCK_INVALID;
+  }
+  // Pruning: pruning out 4-way partitions based on the current best partition.
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+    part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                    pc_tree->partitioning == PARTITION_HORZ_A ||
+                                    pc_tree->partitioning == PARTITION_HORZ_B ||
+                                    pc_tree->partitioning == PARTITION_SPLIT ||
+                                    pc_tree->partitioning == PARTITION_NONE);
+    part4_search_allowed[VERT4] &= (pc_tree->partitioning == PARTITION_VERT ||
+                                    pc_tree->partitioning == PARTITION_VERT_A ||
+                                    pc_tree->partitioning == PARTITION_VERT_B ||
+                                    pc_tree->partitioning == PARTITION_SPLIT ||
+                                    pc_tree->partitioning == PARTITION_NONE);
+  }
+
+  // Pruning: pruning out some 4-way partitions using a DNN taking rd costs of
+  // sub-blocks from basic partition types.
+  if (cpi->sf.part_sf.ml_prune_4_partition && partition4_allowed &&
+      part_search_state->partition_rect_allowed[HORZ] &&
+      part_search_state->partition_rect_allowed[VERT]) {
+    av1_ml_prune_4_partition(
+        cpi, x, bsize, pc_tree->partitioning, best_rdc->rdcost,
+        part_search_state->rect_part_rd, part_search_state->split_rd,
+        &part4_search_allowed[HORZ4], &part4_search_allowed[VERT4],
+        pb_source_variance, mi_row, mi_col);
+  }
+
+  // Pruning: pruning out 4-way partitions based on the number of horz/vert wins
+  // in the current block and sub-blocks in PARTITION_SPLIT.
+  prune_4_partition_using_split_info(cpi, x, part_search_state,
+                                     part4_search_allowed);
+}
+
+// Set PARTITION_NONE allowed flag.
+static AOM_INLINE void set_part_none_allowed_flag(
+    AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  if ((blk_params.width <= blk_params.min_partition_size_1d) &&
+      blk_params.has_rows && blk_params.has_cols)
+    part_search_state->partition_none_allowed = 1;
+  assert(part_search_state->terminate_partition_search == 0);
+
+  // Set PARTITION_NONE for screen content.
+  if (cpi->use_screen_content_tools)
+    part_search_state->partition_none_allowed =
+        blk_params.has_rows && blk_params.has_cols;
+}
+
+// Set params needed for PARTITION_NONE search.
+static void set_none_partition_params(const AV1_COMMON *const cm,
+                                      ThreadData *td, MACROBLOCK *x,
+                                      PC_TREE *pc_tree,
+                                      PartitionSearchState *part_search_state,
+                                      RD_STATS *best_remain_rdcost,
+                                      RD_STATS *best_rdc, int *pt_cost) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS partition_rdcost;
+  // Set PARTITION_NONE context.
+  if (pc_tree->none == NULL)
+    pc_tree->none = av1_alloc_pmc(cm, blk_params.bsize, &td->shared_coeff_buf);
+
+  // Set PARTITION_NONE type cost.
+  if (part_search_state->partition_none_allowed) {
+    if (blk_params.bsize_at_least_8x8) {
+      *pt_cost = part_search_state->partition_cost[PARTITION_NONE] < INT_MAX
+                     ? part_search_state->partition_cost[PARTITION_NONE]
+                     : 0;
+    }
+
+    // Initialize the RD stats structure.
+    av1_init_rd_stats(&partition_rdcost);
+    partition_rdcost.rate = *pt_cost;
+    av1_rd_cost_update(x->rdmult, &partition_rdcost);
+    av1_rd_stats_subtraction(x->rdmult, best_rdc, &partition_rdcost,
+                             best_remain_rdcost);
+  }
+}
+
+// Skip other partitions based on PARTITION_NONE rd cost.
+static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                        PICK_MODE_CONTEXT *ctx_none,
+                                        PartitionSearchState *part_search_state,
+                                        RD_STATS *best_rdc,
+                                        unsigned int *pb_source_variance) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  RD_STATS *this_rdc = &part_search_state->this_rdc;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  if (!frame_is_intra_only(cm) &&
+      (part_search_state->do_square_split ||
+       part_search_state->do_rectangular_split) &&
+      !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+    const int use_ml_based_breakout =
+        bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
+        bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1;
+    if (use_ml_based_breakout) {
+      if (av1_ml_predict_breakout(cpi, bsize, x, this_rdc, *pb_source_variance,
+                                  xd->bd)) {
+        part_search_state->do_square_split = 0;
+        part_search_state->do_rectangular_split = 0;
+      }
+    }
+
+    // Adjust dist breakout threshold according to the partition size.
+    const int64_t dist_breakout_thr =
+        cpi->sf.part_sf.partition_search_breakout_dist_thr >>
+        ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+         (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+    const int rate_breakout_thr =
+        cpi->sf.part_sf.partition_search_breakout_rate_thr *
+        num_pels_log2_lookup[bsize];
+    // If all y, u, v transform blocks in this partition are skippable,
+    // and the dist & rate are within the thresholds, the partition
+    // search is terminated for current branch of the partition search
+    // tree. The dist & rate thresholds are set to 0 at speed 0 to
+    // disable the early termination at that speed.
+    if (best_rdc->dist < dist_breakout_thr &&
+        best_rdc->rate < rate_breakout_thr) {
+      part_search_state->do_square_split = 0;
+      part_search_state->do_rectangular_split = 0;
+    }
+  }
+
+  // Early termination: using simple_motion_search features and the
+  // rate, distortion, and rdcost of PARTITION_NONE, a DNN will make a
+  // decision on early terminating at PARTITION_NONE.
+  if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame &&
+      !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
+      blk_params.mi_row_edge < mi_params->mi_rows &&
+      blk_params.mi_col_edge < mi_params->mi_cols &&
+      this_rdc->rdcost < INT64_MAX && this_rdc->rdcost >= 0 &&
+      this_rdc->rate < INT_MAX && this_rdc->rate >= 0 &&
+      (part_search_state->do_square_split ||
+       part_search_state->do_rectangular_split)) {
+    av1_simple_motion_search_early_term_none(
+        cpi, x, sms_tree, blk_params.mi_row, blk_params.mi_col, bsize, this_rdc,
+        &part_search_state->terminate_partition_search);
+  }
+}
+
+// Decide early termination and rectangular partition pruning
+// based on PARTITION_NONE and PARTITION_SPLIT costs.
+static void prune_partitions_after_split(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    int64_t part_none_rd, int64_t part_split_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  // Early termination: using the rd costs of PARTITION_NONE and subblocks
+  // from PARTITION_SPLIT to determine an early breakout.
+  if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      !frame_is_intra_only(cm) &&
+      !part_search_state->terminate_partition_search &&
+      part_search_state->do_rectangular_split &&
+      (part_search_state->partition_rect_allowed[HORZ] ||
+       part_search_state->partition_rect_allowed[VERT])) {
+    av1_ml_early_term_after_split(
+        cpi, x, sms_tree, bsize, best_rdc->rdcost, part_none_rd, part_split_rd,
+        part_search_state->split_rd, mi_row, mi_col,
+        &part_search_state->terminate_partition_search);
+  }
+
+  // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT
+  // to prune out rectangular partitions in some directions.
+  if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      cpi->sf.part_sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+      (part_search_state->partition_rect_allowed[HORZ] ||
+       part_search_state->partition_rect_allowed[VERT]) &&
+      !(part_search_state->prune_rect_part[HORZ] ||
+        part_search_state->prune_rect_part[VERT]) &&
+      !part_search_state->terminate_partition_search) {
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm),
+                         bsize);
+    av1_ml_prune_rect_partition(
+        cpi, x, bsize, best_rdc->rdcost, part_search_state->none_rd,
+        part_search_state->split_rd, &part_search_state->prune_rect_part[HORZ],
+        &part_search_state->prune_rect_part[VERT]);
+  }
+}
+
+// PARTITION_NONE search.
+static void none_partition_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x,
+    PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    unsigned int *pb_source_variance, int64_t *none_rd, int64_t *part_none_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS *this_rdc = &part_search_state->this_rdc;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  // Set PARTITION_NONE allowed flag.
+  set_part_none_allowed_flag(cpi, part_search_state);
+  if (!part_search_state->partition_none_allowed) return;
+
+  int pt_cost = 0;
+  RD_STATS best_remain_rdcost;
+
+  // Set PARTITION_NONE context and cost.
+  set_none_partition_params(cm, td, x, pc_tree, part_search_state,
+                            &best_remain_rdcost, best_rdc, &pt_cost);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  // Timer start for partition None.
+  if (best_remain_rdcost >= 0) {
+    partition_attempts[PARTITION_NONE] += 1;
+    aom_usec_timer_start(&partition_timer);
+    partition_timer_on = 1;
+  }
+#endif
+  // PARTITION_NONE evaluation and cost update.
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE,
+                bsize, pc_tree->none, best_remain_rdcost);
+
+  av1_rd_cost_update(x->rdmult, this_rdc);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  // Timer end for partition None.
+  if (partition_timer_on) {
+    aom_usec_timer_mark(&partition_timer);
+    int64_t time = aom_usec_timer_elapsed(&partition_timer);
+    partition_times[PARTITION_NONE] += time;
+    partition_timer_on = 0;
+  }
+#endif
+  *pb_source_variance = x->source_variance;
+  if (none_rd) *none_rd = this_rdc->rdcost;
+  part_search_state->none_rd = this_rdc->rdcost;
+  if (this_rdc->rate != INT_MAX) {
+    // Record picked ref frame to prune ref frames for other partition types.
+    if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
+      const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame);
+      av1_update_picked_ref_frames_mask(
+          x, ref_type, bsize, cm->seq_params.mib_size, mi_row, mi_col);
+    }
+
+    // Calculate the total cost and update the best partition.
+    if (blk_params.bsize_at_least_8x8) {
+      this_rdc->rate += pt_cost;
+      this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist);
+    }
+    *part_none_rd = this_rdc->rdcost;
+    if (this_rdc->rdcost < best_rdc->rdcost) {
+      *best_rdc = *this_rdc;
+      part_search_state->found_best_partition = true;
+      if (blk_params.bsize_at_least_8x8) {
+        pc_tree->partitioning = PARTITION_NONE;
+      }
+
+      // Disable split and rectangular partition search
+      // based on PARTITION_NONE cost.
+      prune_partitions_after_none(cpi, x, sms_tree, pc_tree->none,
+                                  part_search_state, best_rdc,
+                                  pb_source_variance);
+    }
+  }
+  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// PARTITION_SPLIT search.
+static void split_partition_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+    SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    SB_MULTI_PASS_MODE multi_pass_mode, int64_t *part_split_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+  RD_STATS sum_rdc = part_search_state->sum_rdc;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  // Check if partition split is allowed.
+  if (part_search_state->terminate_partition_search ||
+      !part_search_state->do_square_split)
+    return;
+
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    if (pc_tree->split[i] == NULL)
+      pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+    pc_tree->split[i]->index = i;
+  }
+
+  // Initialization of this partition RD stats.
+  av1_init_rd_stats(&sum_rdc);
+  sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT];
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+  int idx;
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (best_rdc->rdcost - sum_rdc.rdcost >= 0) {
+    partition_attempts[PARTITION_SPLIT] += 1;
+    aom_usec_timer_start(&partition_timer);
+    partition_timer_on = 1;
+  }
+#endif
+  // Recursive partition search on 4 sub-blocks.
+  for (idx = 0; idx < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc->rdcost;
+       ++idx) {
+    const int x_idx = (idx & 1) * blk_params.mi_step;
+    const int y_idx = (idx >> 1) * blk_params.mi_step;
+
+    if (mi_row + y_idx >= mi_params->mi_rows ||
+        mi_col + x_idx >= mi_params->mi_cols)
+      continue;
+
+    pc_tree->split[idx]->index = idx;
+    int64_t *p_split_rd = &part_search_state->split_rd[idx];
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc,
+                             &best_remain_rdcost);
+
+    int curr_quad_tree_idx = 0;
+    if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+      curr_quad_tree_idx = part_search_state->intra_part_info->quad_tree_idx;
+      part_search_state->intra_part_info->quad_tree_idx =
+          4 * curr_quad_tree_idx + idx + 1;
+    }
+    // Split partition evaluation of corresponding idx.
+    // If the RD cost exceeds the best cost then do not
+    // evaluate other split sub-partitions.
+    if (!av1_rd_pick_partition(
+            cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+            &part_search_state->this_rdc, best_remain_rdcost,
+            pc_tree->split[idx], sms_tree->split[idx], p_split_rd,
+            multi_pass_mode, &part_search_state->split_part_rect_win[idx])) {
+      av1_invalid_rd_stats(&sum_rdc);
+      break;
+    }
+    if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+      part_search_state->intra_part_info->quad_tree_idx = curr_quad_tree_idx;
+    }
+
+    sum_rdc.rate += part_search_state->this_rdc.rate;
+    sum_rdc.dist += part_search_state->this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
+
+    // Set split ctx as ready for use.
+    if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+                     pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+      const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none->mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted.
+      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED)
+          part_search_state->is_split_ctx_is_ready[idx] = 1;
+      }
+    }
+  }
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (partition_timer_on) {
+    aom_usec_timer_mark(&partition_timer);
+    int64_t time = aom_usec_timer_elapsed(&partition_timer);
+    partition_times[PARTITION_SPLIT] += time;
+    partition_timer_on = 0;
+  }
+#endif
+  const int reached_last_index = (idx == SUB_PARTITIONS_SPLIT);
+
+  // Calculate the total cost and update the best partition.
+  *part_split_rd = sum_rdc.rdcost;
+  if (reached_last_index && sum_rdc.rdcost < best_rdc->rdcost) {
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+    if (sum_rdc.rdcost < best_rdc->rdcost) {
+      *best_rdc = sum_rdc;
+      part_search_state->found_best_partition = true;
+      pc_tree->partitioning = PARTITION_SPLIT;
+    }
+  } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
+    // Skip rectangular partition test when partition type none gives better
+    // rd than partition type split.
+    if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+      const int partition_none_valid = part_search_state->none_rd > 0;
+      const int partition_none_better =
+          part_search_state->none_rd < sum_rdc.rdcost;
+      part_search_state->do_rectangular_split &=
+          !(partition_none_valid && partition_none_better);
+    }
+  }
+  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+/*!\brief AV1 block partition search (full search).
+*
+* \ingroup partition_search
+* \callgraph
+* Searches for the best partition pattern for a block based on the
+* rate-distortion cost, and returns a bool value to indicate whether a valid
+* partition pattern is found. The partition can recursively go down to the
+* smallest block size.
+*
+* \param[in]    cpi                Top-level encoder structure
+* \param[in]    td                 Pointer to thread data
+* \param[in]    tile_data          Pointer to struct holding adaptive
+data/contexts/models for the tile during
+encoding
+* \param[in]    tp                 Pointer to the starting token
+* \param[in]    mi_row             Row coordinate of the block in a step size
+of MI_SIZE
+* \param[in]    mi_col             Column coordinate of the block in a step
+size of MI_SIZE
+* \param[in]    bsize              Current block size
+* \param[in]    rd_cost            Pointer to the final rd cost of the block
+* \param[in]    best_rdc           Upper bound of rd cost of a valid partition
+* \param[in]    pc_tree            Pointer to the PC_TREE node storing the
+picked partitions and mode info for the
+current block
+* \param[in]    sms_tree           Pointer to struct holding simple motion
+search data for the current block
+* \param[in]    none_rd            Pointer to the rd cost in the case of not
+splitting the current block
+* \param[in]    multi_pass_mode    SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
+* \param[in]    rect_part_win_info Pointer to struct storing whether horz/vert
+partition outperforms previously tested
+partitions
+*
+* \return A bool value is returned indicating if a valid partition is found.
+* The pc_tree struct is modified to store the picked partition and modes.
+* The rd_cost struct is also updated with the RD stats corresponding to the
+* best partition found.
+*/
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                           int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+                           RD_STATS best_rdc, PC_TREE *pc_tree,
+                           SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+                           SB_MULTI_PASS_MODE multi_pass_mode,
+                           RD_RECT_PART_WIN_INFO *rect_part_win_info) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  const TokenExtra *const tp_orig = *tp;
+  PartitionSearchState part_search_state;
+  // Initialization of state variables used in partition search.
+  init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+                                     bsize);
+  PartitionBlkParams blk_params = part_search_state.part_blk_params;
+
+  sms_tree->partitioning = PARTITION_NONE;
+  if (best_rdc.rdcost < 0) {
+    av1_invalid_rd_stats(rd_cost);
+    return part_search_state.found_best_partition;
+  }
+  if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
+
+  // Override skipping rectangular partition operations for edge blocks.
+  if (none_rd) *none_rd = 0;
+  (void)*tp_orig;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  int partition_decisions[EXT_PARTITION_TYPES] = { 0 };
+  int partition_attempts[EXT_PARTITION_TYPES] = { 0 };
+  int64_t partition_times[EXT_PARTITION_TYPES] = { 0 };
+  struct aom_usec_timer partition_timer = { 0 };
+  int partition_timer_on = 0;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  PartitionStats *part_stats = &cpi->partition_stats;
+#endif
+#endif
+
+  // Override partition costs at the edges of the frame in the same
+  // way as in read_partition (see decodeframe.c).
+  if (!(blk_params.has_rows && blk_params.has_cols))
+    set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+  // Disable rectangular partitions for inner blocks when the current block is
+  // forced to only use square partitions.
+  if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
+    part_search_state.partition_rect_allowed[HORZ] &= !blk_params.has_rows;
+    part_search_state.partition_rect_allowed[VERT] &= !blk_params.has_cols;
+  }
+
+#ifndef NDEBUG
+  // Nothing should rely on the default value of this array (which is just
+  // leftover from encoding the previous block. Setting it to fixed pattern
+  // when debugging.
+  // bit 0, 1, 2 are blk_skip of each plane
+  // bit 4, 5, 6 are initialization checking of each plane
+  memset(x->txfm_search_info.blk_skip, 0x77,
+         sizeof(x->txfm_search_info.blk_skip));
+#endif  // NDEBUG
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  // Set buffers and offsets.
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  // Update rd cost of the bound using the current multiplier.
+  av1_rd_cost_update(x->rdmult, &best_rdc);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+  // Set the context.
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_prune_partitions_time);
+#endif
+  int *partition_horz_allowed = &part_search_state.partition_rect_allowed[HORZ];
+  int *partition_vert_allowed = &part_search_state.partition_rect_allowed[VERT];
+  int *prune_horz = &part_search_state.prune_rect_part[HORZ];
+  int *prune_vert = &part_search_state.prune_rect_part[VERT];
+  // Pruning: before searching any partition type, using source and simple
+  // motion search results to prune out unlikely partitions.
+  av1_prune_partitions_before_search(
+      cpi, x, mi_row, mi_col, bsize, sms_tree,
+      &part_search_state.partition_none_allowed, partition_horz_allowed,
+      partition_vert_allowed, &part_search_state.do_rectangular_split,
+      &part_search_state.do_square_split, prune_horz, prune_vert);
+
+  // Pruning: eliminating partition types leading to coding block sizes outside
+  // the min and max bsize limitations set from the encoder.
+  av1_prune_partitions_by_max_min_bsize(
+      &x->sb_enc, bsize, blk_params.has_rows && blk_params.has_cols,
+      &part_search_state.partition_none_allowed, partition_horz_allowed,
+      partition_vert_allowed, &part_search_state.do_square_split);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_prune_partitions_time);
+#endif
+
+  // Partition search
+BEGIN_PARTITION_SEARCH:
+  // If a valid partition is required, usually when the first round cannot find
+  // a valid one under the cost limit after pruning, reset the limitations on
+  // partition types and intra cnn output.
+  if (x->must_find_valid_partition) {
+    reset_part_limitations(cpi, &part_search_state);
+    // Invalidate intra cnn output for key frames.
+    if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+      part_search_state.intra_part_info->quad_tree_idx = 0;
+      part_search_state.intra_part_info->cnn_output_valid = 0;
+    }
+  }
+  // Partition block source pixel variance.
+  unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, none_partition_search_time);
+#endif
+  // PARTITION_NONE search stage.
+  int64_t part_none_rd = INT64_MAX;
+  none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+                        &part_search_state, &best_rdc, &pb_source_variance,
+                        none_rd, &part_none_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, none_partition_search_time);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, split_partition_search_time);
+#endif
+  // PARTITION_SPLIT search stage.
+  int64_t part_split_rd = INT64_MAX;
+  split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx,
+                         &part_search_state, &best_rdc, multi_pass_mode,
+                         &part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, split_partition_search_time);
+#endif
+  // Terminate partition search for child partition,
+  // when NONE and SPLIT partition rd_costs are INT64_MAX.
+  if (cpi->sf.part_sf.early_term_after_none_split &&
+      part_none_rd == INT64_MAX && part_split_rd == INT64_MAX &&
+      !x->must_find_valid_partition && (bsize != cm->seq_params.sb_size)) {
+    part_search_state.terminate_partition_search = 1;
+  }
+
+  // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT.
+  prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc,
+                               part_none_rd, part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rectangular_partition_search_time);
+#endif
+  // Rectangular partitions search stage.
+  rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+                               &part_search_state, &best_rdc,
+                               rect_part_win_info);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rectangular_partition_search_time);
+#endif
+
+  if (pb_source_variance == UINT_MAX) {
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+    if (is_cur_buf_hbd(xd)) {
+      pb_source_variance = av1_high_get_sby_perpixel_variance(
+          cpi, &x->plane[0].src, bsize, xd->bd);
+    } else {
+      pb_source_variance =
+          av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+    }
+  }
+
+  assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                 !part_search_state.do_rectangular_split));
+
+  const int ext_partition_allowed =
+      part_search_state.do_rectangular_split &&
+      bsize > cpi->sf.part_sf.ext_partition_eval_thresh &&
+      blk_params.has_rows && blk_params.has_cols;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, ab_partitions_search_time);
+#endif
+  // AB partitions search stage.
+  ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                       &part_search_state, &best_rdc, rect_part_win_info,
+                       pb_source_variance, ext_partition_allowed);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, ab_partitions_search_time);
+#endif
+
+  // 4-way partitions search stage.
+  int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
+
+  // Disable 4-way partition search flags for width less than twice the minimum
+  // width.
+  if (blk_params.width < (blk_params.min_partition_size_1d << 2)) {
+    part4_search_allowed[HORZ4] = 0;
+    part4_search_allowed[VERT4] = 0;
+  } else {
+    // Prune 4-way partition search.
+    prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc,
+                                 pb_source_variance, ext_partition_allowed,
+                                 part4_search_allowed);
+  }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_4partition_time);
+#endif
+  // PARTITION_HORZ_4
+  assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                 !part4_search_allowed[HORZ4]));
+  if (!part_search_state.terminate_partition_search &&
+      part4_search_allowed[HORZ4] && blk_params.has_rows &&
+      (part_search_state.do_rectangular_split ||
+       av1_active_h_edge(cpi, mi_row, blk_params.mi_step))) {
+    const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4,
+                                            0 };
+    // Evaluation of Horz4 partition type.
+    rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                       pc_tree->horizontal4, &part_search_state, &best_rdc,
+                       inc_step, PARTITION_HORZ_4);
+  }
+
+  // PARTITION_VERT_4
+  assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                 !part4_search_allowed[VERT4]));
+  if (!part_search_state.terminate_partition_search &&
+      part4_search_allowed[VERT4] && blk_params.has_cols &&
+      (part_search_state.do_rectangular_split ||
+       av1_active_v_edge(cpi, mi_row, blk_params.mi_step))) {
+    const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] /
+                                                   4 };
+    // Evaluation of Vert4 partition type.
+    rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                       pc_tree->vertical4, &part_search_state, &best_rdc,
+                       inc_step, PARTITION_VERT_4);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_4partition_time);
+#endif
+
+  if (bsize == cm->seq_params.sb_size &&
+      !part_search_state.found_best_partition) {
+    // Did not find a valid partition, go back and search again, with less
+    // constraint on which partition types to search.
+    x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+    part_stats->partition_redo += 1;
+#endif
+    goto BEGIN_PARTITION_SEARCH;
+  }
+
+  // Store the final rd cost
+  *rd_cost = best_rdc;
+
+  // Also record the best partition in simple motion data tree because it is
+  // necessary for the related speed features.
+  sms_tree->partitioning = pc_tree->partitioning;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+    partition_decisions[pc_tree->partitioning] += 1;
+  }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 1
+  // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+  // prediction block.
+  FILE *f = fopen("data.csv", "a");
+  fprintf(f, "%d,%d,%d,", bsize, cm->show_frame, frame_is_intra_only(cm));
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", partition_decisions[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", partition_attempts[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%ld,", partition_times[idx]);
+  }
+  fprintf(f, "\n");
+  fclose(f);
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print out the stats for
+  // the whole clip. So we need to pass the information upstream to the encoder.
+  const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+  int *agg_attempts = part_stats->partition_attempts[bsize_idx];
+  int *agg_decisions = part_stats->partition_decisions[bsize_idx];
+  int64_t *agg_times = part_stats->partition_times[bsize_idx];
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    agg_attempts[idx] += partition_attempts[idx];
+    agg_decisions[idx] += partition_decisions[idx];
+    agg_times[idx] += partition_times[idx];
+  }
+#endif
+
+  // Reset the PC_TREE deallocation flag.
+  int pc_tree_dealloc = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_sb_time);
+#endif
+  // If a valid partition is found and reconstruction is required for future
+  // sub-blocks in the same group.
+  if (part_search_state.found_best_partition && pc_tree->index != 3) {
+    if (bsize == cm->seq_params.sb_size) {
+      // Encode the superblock.
+      const int emit_output = multi_pass_mode != SB_DRY_PASS;
+      const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
+
+      set_cb_offsets(x->cb_offset, 0, 0);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
+                pc_tree, NULL);
+      // Dealloc the whole PC_TREE after a superblock is done.
+      av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0);
+      pc_tree_dealloc = 1;
+    } else {
+      // Encode the smaller blocks in DRY_RUN mode.
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_sb_time);
+#endif
+
+  // If the tree still exists (non-superblock), dealloc most nodes, only keep
+  // nodes for the best partition and PARTITION_NONE.
+  if (pc_tree_dealloc == 0)
+    av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1);
+
+  if (bsize == cm->seq_params.sb_size) {
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+
+  // Restore the rd multiplier.
+  x->rdmult = orig_rdmult;
+  return part_search_state.found_best_partition;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+#define FEATURES 6
+#define LABELS 2
+static int ml_predict_var_paritioning(AV1_COMP *cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, int mi_row,
+                                      int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const NN_CONFIG *nn_config = NULL;
+  const float *means = NULL;
+  const float *vars = NULL;
+  switch (bsize) {
+    case BLOCK_64X64:
+      nn_config = &av1_var_part_nnconfig_64;
+      means = av1_var_part_means_64;
+      vars = av1_var_part_vars_64;
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_var_part_nnconfig_32;
+      means = av1_var_part_means_32;
+      vars = av1_var_part_vars_32;
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_var_part_nnconfig_16;
+      means = av1_var_part_means_16;
+      vars = av1_var_part_vars_16;
+      break;
+    case BLOCK_8X8:
+    default: assert(0 && "Unexpected block size."); return -1;
+  }
+
+  if (!nn_config) return -1;
+
+  aom_clear_system_state();
+
+  {
+    const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f;
+    float features[FEATURES] = { 0.0f };
+    const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                                      cm->seq_params.bit_depth);
+    int feature_idx = 0;
+    float score[LABELS];
+
+    features[feature_idx] =
+        (logf((float)(dc_q * dc_q) / 256.0f + 1.0f) - means[feature_idx]) /
+        sqrtf(vars[feature_idx]);
+    feature_idx++;
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+    {
+      const int bs = block_size_wide[bsize];
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      const int sb_offset_row = 4 * (mi_row & 15);
+      const int sb_offset_col = 4 * (mi_col & 15);
+      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      int i;
+      // Variance of whole block.
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx] = (logf((float)var + 1.0f) - means[feature_idx]) /
+                              sqrtf(vars[feature_idx]);
+      feature_idx++;
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx] =
+            (var_ratio - means[feature_idx]) / sqrtf(vars[feature_idx]);
+        feature_idx++;
+      }
+    }
+    //    for (int i = 0; i<FEATURES; i++)
+    //      printf("F_%d, %f; ", i, features[i]);
+    assert(feature_idx == FEATURES);
+    av1_nn_predict(features, nn_config, 1, score);
+    //    printf("Score %f, thr %f ", (float)score[0], thresh);
+    if (score[0] > thresh) return PARTITION_SPLIT;
+    if (score[0] < -thresh) return PARTITION_NONE;
+    return -1;
+  }
+}
+#undef FEATURES
+#undef LABELS
+
+// Uncomment for collecting data for ML-based partitioning
+// #define _COLLECT_GROUND_TRUTH_
+
+#ifdef _COLLECT_GROUND_TRUTH_
+static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                int mi_row, int mi_col, PARTITION_TYPE part) {
+  AV1_COMMON *const cm = &cpi->common;
+  char fname[128];
+  switch (bsize) {
+    case BLOCK_64X64: sprintf(fname, "data_64x64.txt"); break;
+    case BLOCK_32X32: sprintf(fname, "data_32x32.txt"); break;
+    case BLOCK_16X16: sprintf(fname, "data_16x16.txt"); break;
+    case BLOCK_8X8: sprintf(fname, "data_8x8.txt"); break;
+    default: assert(0 && "Unexpected block size."); return -1;
+  }
+
+  float features[6];  // DC_Q, VAR, VAR_RATIO-0..3
+
+  FILE *f = fopen(fname, "a");
+
+  aom_clear_system_state();
+
+  {
+    const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                                      cm->seq_params.bit_depth);
+    int feature_idx = 0;
+
+    features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+    {
+      const int bs = block_size_wide[bsize];
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      const int sb_offset_row = 4 * (mi_row & 15);
+      const int sb_offset_col = 4 * (mi_col & 15);
+      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      int i;
+      // Variance of whole block.
+      /*
+                if (bs == 8)
+                {
+                  int r, c;
+                  printf("%d %d\n", mi_row, mi_col);
+                  for (r = 0; r < bs; ++r) {
+                    for (c = 0; c < bs; ++c) {
+                      printf("%3d ",
+                             src[r * src_stride + c] - pred[64 * r + c]);
+                    }
+                    printf("\n");
+                  }
+                  printf("\n");
+                }
+      */
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx++] = logf((float)var + 1.0f);
+
+      fprintf(f, "%f,%f,", features[0], features[1]);
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx++] = var_ratio;
+        fprintf(f, "%f,", var_ratio);
+      }
+
+      fprintf(f, "%d\n", part == PARTITION_NONE ? 0 : 1);
+    }
+
+    fclose(f);
+    return -1;
+  }
+}
+#endif
+
+static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                      int mi_row, int mi_col,
+                                      BLOCK_SIZE bsize) {
+  const int block_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int block_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  const int mi_stride = xd->mi_stride;
+  MB_MODE_INFO *const src_mi = xd->mi[0];
+  int i, j;
+
+  for (j = 0; j < block_height; ++j)
+    for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi;
+}
+
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+    MB_MODE_INFO_EXT *const mbmi_ext,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) {
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+         sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+         sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+         sizeof(mbmi_ext->global_mvs));
+}
+
+static void fill_mode_info_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int hbs = mi_size_wide[bsize] >> 1;
+  PARTITION_TYPE partition = pc_tree->partitioning;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+  assert(bsize >= BLOCK_8X8);
+
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+    return;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      set_mode_info_offsets(&cm->mi_params, &cpi->mbmi_ext_info, x, xd, mi_row,
+                            mi_col);
+      *(xd->mi[0]) = pc_tree->none->mic;
+      copy_mbmi_ext_frame_to_mbmi_ext(
+          &x->mbmi_ext, &pc_tree->none->mbmi_ext_best, LAST_FRAME);
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+      break;
+    case PARTITION_SPLIT: {
+      fill_mode_info_sb(cpi, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+      fill_mode_info_sb(cpi, x, mi_row, mi_col + hbs, subsize,
+                        pc_tree->split[1]);
+      fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col, subsize,
+                        pc_tree->split[2]);
+      fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col + hbs, subsize,
+                        pc_tree->split[3]);
+      break;
+    }
+    default: break;
+  }
+}
+
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+                              TileDataEnc *tile_data, TokenExtra **tp,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+                              PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int hbs = mi_size_wide[bsize] >> 1;
+  TokenExtra *tp_orig = *tp;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  RD_STATS this_rdc, best_rdc;
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  int do_split = bsize > BLOCK_8X8;
+  // Override skipping rectangular partition operations for edge blocks
+  const int force_horz_split = (mi_row + 2 * hbs > cm->mi_params.mi_rows);
+  const int force_vert_split = (mi_col + 2 * hbs > cm->mi_params.mi_cols);
+
+  int partition_none_allowed = !force_horz_split && !force_vert_split;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);  // Square partition only
+  assert(cm->seq_params.sb_size == BLOCK_64X64);       // Small SB so far
+
+  (void)*tp_orig;
+
+  av1_invalid_rd_stats(&best_rdc);
+  best_rdc.rdcost = best_rd;
+#ifndef _COLLECT_GROUND_TRUTH_
+  if (partition_none_allowed && do_split) {
+    const int ml_predicted_partition =
+        ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col);
+    if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
+    if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
+  }
+#endif
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+  // PARTITION_NONE
+  if (partition_none_allowed) {
+    pc_tree->none = av1_alloc_pmc(cm, bsize, &td->shared_coeff_buf);
+    PICK_MODE_CONTEXT *ctx = pc_tree->none;
+
+// Flip for RDO based pick mode
+#if 0
+    RD_STATS dummy;
+    av1_invalid_rd_stats(&dummy);
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+                  PARTITION_NONE, bsize, ctx, dummy);
+#else
+    pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize,
+                        ctx);
+#endif
+    if (this_rdc.rate != INT_MAX) {
+      const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+      this_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+      this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = this_rdc;
+        if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+      }
+    }
+  }
+
+  // PARTITION_SPLIT
+  if (do_split) {
+    RD_STATS sum_rdc;
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+    av1_init_rd_stats(&sum_rdc);
+
+    for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+      pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+      pc_tree->split[i]->index = i;
+    }
+
+    int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+    sum_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+    for (int i = 0;
+         i < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
+      const int x_idx = (i & 1) * hbs;
+      const int y_idx = (i >> 1) * hbs;
+
+      if (mi_row + y_idx >= cm->mi_params.mi_rows ||
+          mi_col + x_idx >= cm->mi_params.mi_cols)
+        continue;
+      av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+                               mi_col + x_idx, subsize, &this_rdc, i < 3,
+                               best_rdc.rdcost - sum_rdc.rdcost,
+                               pc_tree->split[i]);
+
+      if (this_rdc.rate == INT_MAX) {
+        av1_invalid_rd_stats(&sum_rdc);
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+    }
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      pc_tree->partitioning = PARTITION_SPLIT;
+    }
+  }
+
+#ifdef _COLLECT_GROUND_TRUTH_
+  store_partition_data(cpi, x, bsize, mi_row, mi_col, pc_tree->partitioning);
+#endif
+
+  *rd_cost = best_rdc;
+
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+  if (best_rdc.rate == INT_MAX) {
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
+
+  // update mode info array
+  fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree);
+
+  if (do_recon) {
+    if (bsize == cm->seq_params.sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      set_cb_offsets(x->cb_offset, 0, 0);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+
+  if (bsize == BLOCK_64X64 && do_recon) {
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+}
diff --git a/av1/encoder/partition_search.h b/av1/encoder/partition_search.h
new file mode 100644
index 0000000..b77d4a2
--- /dev/null
+++ b/av1/encoder/partition_search.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+#define AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+                                        const TileInfo *const tile,
+                                        MACROBLOCK *const x, int mi_row,
+                                        int mi_col, BLOCK_SIZE bsize);
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+                     MACROBLOCK *const x, int mi_row, int mi_col,
+                     BLOCK_SIZE bsize);
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                          MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+                          int mi_col, BLOCK_SIZE bsize, int *rate,
+                          int64_t *dist, int do_recon, PC_TREE *pc_tree);
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                             TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                             TokenExtra **tp, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, PC_TREE *pc_tree);
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+                              TileDataEnc *tile_data, TokenExtra **tp,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+                              PC_TREE *pc_tree);
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                           int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+                           RD_STATS best_rdc, PC_TREE *pc_tree,
+                           SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+                           SB_MULTI_PASS_MODE multi_pass_mode,
+                           RD_RECT_PART_WIN_INFO *rect_part_win_info);
+
+static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset,
+                                      const uint16_t cb_offset_y,
+                                      const uint16_t cb_offset_uv) {
+  cb_offset[PLANE_TYPE_Y] = cb_offset_y;
+  cb_offset[PLANE_TYPE_UV] = cb_offset_uv;
+}
+
+static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize,
+                                         const int subsampling_x,
+                                         const int subsampling_y) {
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, subsampling_x, subsampling_y);
+  x->cb_offset[PLANE_TYPE_Y] += block_size_wide[bsize] * block_size_high[bsize];
+  if (x->e_mbd.is_chroma_ref)
+    x->cb_offset[PLANE_TYPE_UV] +=
+        block_size_wide[plane_bsize] * block_size_high[plane_bsize];
+}
+
+#endif  // AOM_AV1_ENCODER_PARTITION_SEARCH_H_
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index cc820ba..e3eceb9 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -31,9 +31,9 @@
 
 #if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void simple_motion_search_prune_part_features(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get);
-#endif
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+    int features_to_get);
 
 static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
   switch (bsize) {
@@ -45,6 +45,7 @@
     default: assert(0 && "Invalid bsize"); return -1;
   }
 }
+#endif
 
 #if !CONFIG_REALTIME_ONLY
 // TODO(chiyotsai@google.com): This is very much a work in progress. We still
@@ -68,8 +69,10 @@
     return;
   }
 
+  PartitionSearchInfo *part_info = &x->part_search_info;
+
   // Precompute the CNN part and cache the result in MACROBLOCK
-  if (bsize == BLOCK_64X64 && !x->cnn_output_valid) {
+  if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) {
     aom_clear_system_state();
     const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
 
@@ -82,7 +85,7 @@
     float *output_buffer[CNN_TOT_OUT_CH];
 
     float **cur_output_buf = output_buffer;
-    float *curr_buf_ptr = x->cnn_buffer;
+    float *curr_buf_ptr = part_info->cnn_buffer;
     for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
       const int num_chs = out_chs[output_idx];
       const int ch_size = output_dims[output_idx] * output_dims[output_idx];
@@ -105,9 +108,10 @@
     const int bit_depth = xd->bd;
     const int dc_q =
         av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
-    x->log_q = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
-    x->log_q = (x->log_q - av1_intra_mode_cnn_partition_mean[0]) /
-               av1_intra_mode_cnn_partition_std[0];
+    part_info->log_q = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+    part_info->log_q =
+        (part_info->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+        av1_intra_mode_cnn_partition_std[0];
 
     const int width = 65, height = 65,
               stride = x->plane[AOM_PLANE_Y].src.stride;
@@ -127,10 +131,10 @@
                                     &thread_data, &output);
     }
 
-    x->cnn_output_valid = 1;
+    part_info->cnn_output_valid = 1;
   }
 
-  if (!x->cnn_output_valid) {
+  if (!part_info->cnn_output_valid) {
     return;
   }
 
@@ -148,7 +152,7 @@
   float dnn_features[100];
   float logits[4] = { 0.0f };
 
-  const float *branch_0 = x->cnn_buffer;
+  const float *branch_0 = part_info->cnn_buffer;
   const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE;
   const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE;
   const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE;
@@ -165,7 +169,7 @@
         dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride];
       }
     }
-    dnn_features[f_idx++] = x->log_q;
+    dnn_features[f_idx++] = part_info->log_q;
   } else if (bsize == BLOCK_32X32) {
     int f_idx = 0;
     for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) {
@@ -177,7 +181,7 @@
     for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
       dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride];
     }
-    dnn_features[f_idx++] = x->log_q;
+    dnn_features[f_idx++] = part_info->log_q;
   } else if (bsize == BLOCK_16X16) {
     int f_idx = 0;
     const int prev_quad_idx = (quad_tree_idx - 1) / 4;
@@ -192,7 +196,7 @@
     for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
       dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride];
     }
-    dnn_features[f_idx++] = x->log_q;
+    dnn_features[f_idx++] = part_info->log_q;
   } else if (bsize == BLOCK_8X8) {
     int f_idx = 0;
     const int prev_quad_idx = (quad_tree_idx - 1) / 4;
@@ -207,7 +211,7 @@
     for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) {
       dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride];
     }
-    dnn_features[f_idx++] = x->log_q;
+    dnn_features[f_idx++] = part_info->log_q;
   } else {
     assert(0 && "Invalid bsize in intra_cnn partition");
   }
@@ -249,8 +253,8 @@
 }
 
 void av1_simple_motion_search_based_split(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
     int *partition_horz_allowed, int *partition_vert_allowed,
     int *do_rectangular_split, int *do_square_split) {
   aom_clear_system_state();
@@ -277,7 +281,7 @@
       av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx];
 
   float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f };
-  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
                                            FEATURE_SMS_SPLIT_MODEL_FLAG);
   for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
@@ -306,12 +310,12 @@
 // the refs and returns the ref with the smallest sse. Returns -1 if none of the
 // ref in the list is available. Also stores the best sse and var in best_sse,
 // best_var, respectively. If save_mv is 0, don't update mv_ref_fulls in
-// pc_tree. If save_mv is 1, update mv_ref_fulls under pc_tree and the
+// sms_tree. If save_mv is 1, update mv_ref_fulls under sms_tree and the
 // subtrees.
 static int simple_motion_search_get_best_ref(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs,
-    int use_subpixel, int save_mv, unsigned int *best_sse,
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, const int *const refs,
+    int num_refs, int use_subpixel, int save_mv, unsigned int *best_sse,
     unsigned int *best_var) {
   const AV1_COMMON *const cm = &cpi->common;
   int best_ref = -1;
@@ -336,7 +340,7 @@
     const int ref = refs[ref_idx];
 
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
-      const FULLPEL_MV *start_mvs = pc_tree->start_mvs;
+      const FULLPEL_MV *start_mvs = sms_tree->start_mvs;
       unsigned int curr_sse = 0, curr_var = 0;
       int_mv best_mv =
           av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
@@ -351,14 +355,14 @@
       }
 
       if (save_mv) {
-        pc_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
-        pc_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
+        sms_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
+        sms_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
 
         if (bsize >= BLOCK_8X8) {
-          for (int r_idx = 0; r_idx < 4; r_idx++) {
+          for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
             // Propagate the new motion vectors to a lower level
-            PC_TREE *sub_tree = pc_tree->split[r_idx];
-            sub_tree->start_mvs[ref] = pc_tree->start_mvs[ref];
+            SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
+            sub_tree->start_mvs[ref] = sms_tree->start_mvs[ref];
           }
         }
       }
@@ -369,10 +373,10 @@
 }
 
 // Collects features using simple_motion_search and store them in features. The
-// features are also cached in PC_TREE. By default, the features collected are
-// the sse and var from the subblocks flagged by features_to_get. Furthermore,
-// if features is not NULL, then 7 more features are appended to the end of
-// features:
+// features are also cached in SIMPLE_MOTION_DATA_TREE. By default, the features
+// collected are the sse and var from the subblocks flagged by features_to_get.
+// Furthermore, if features is not NULL, then 7 more features are appended to
+// the end of features:
 //  - log(1.0 + dc_q ** 2)
 //  - whether an above macroblock exists
 //  - width of above macroblock
@@ -381,8 +385,9 @@
 //  - width of left macroblock
 //  - height of left macroblock
 static AOM_INLINE void simple_motion_search_prune_part_features(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get) {
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+    int features_to_get) {
   const int w_mi = mi_size_wide[bsize];
   const int h_mi = mi_size_high[bsize];
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
@@ -396,21 +401,21 @@
   const int use_subpixel = 1;
 
   // Doing whole block first to update the mv
-  if (!pc_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) {
-    simple_motion_search_get_best_ref(cpi, x, pc_tree, mi_row, mi_col, bsize,
+  if (!sms_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) {
+    simple_motion_search_get_best_ref(cpi, x, sms_tree, mi_row, mi_col, bsize,
                                       ref_list, num_refs, use_subpixel, 1,
-                                      &pc_tree->sms_none_feat[0],
-                                      &pc_tree->sms_none_feat[1]);
-    pc_tree->sms_none_valid = 1;
+                                      &sms_tree->sms_none_feat[0],
+                                      &sms_tree->sms_none_feat[1]);
+    sms_tree->sms_none_valid = 1;
   }
 
   // Split subblocks
   if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
     const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    for (int r_idx = 0; r_idx < 4; r_idx++) {
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
       const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
       const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
-      PC_TREE *sub_tree = pc_tree->split[r_idx];
+      SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
 
       if (!sub_tree->sms_none_valid) {
         simple_motion_search_get_best_ref(
@@ -423,31 +428,31 @@
   }
 
   // Rectangular subblocks
-  if (!pc_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) {
+  if (!sms_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) {
     // Horz subblock
     BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
-    for (int r_idx = 0; r_idx < 2; r_idx++) {
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
       const int sub_mi_col = mi_col + 0;
       const int sub_mi_row = mi_row + r_idx * h_mi / 2;
 
       simple_motion_search_get_best_ref(
-          cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
-          use_subpixel, 0, &pc_tree->sms_rect_feat[2 * r_idx],
-          &pc_tree->sms_rect_feat[2 * r_idx + 1]);
+          cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+          use_subpixel, 0, &sms_tree->sms_rect_feat[2 * r_idx],
+          &sms_tree->sms_rect_feat[2 * r_idx + 1]);
     }
 
     // Vert subblock
     subsize = get_partition_subsize(bsize, PARTITION_VERT);
-    for (int r_idx = 0; r_idx < 2; r_idx++) {
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
       const int sub_mi_col = mi_col + r_idx * w_mi / 2;
       const int sub_mi_row = mi_row + 0;
 
       simple_motion_search_get_best_ref(
-          cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
-          use_subpixel, 0, &pc_tree->sms_rect_feat[4 + 2 * r_idx],
-          &pc_tree->sms_rect_feat[4 + 2 * r_idx + 1]);
+          cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+          use_subpixel, 0, &sms_tree->sms_rect_feat[4 + 2 * r_idx],
+          &sms_tree->sms_rect_feat[4 + 2 * r_idx + 1]);
     }
-    pc_tree->sms_rect_valid = 1;
+    sms_tree->sms_rect_valid = 1;
   }
 
   if (!features) return;
@@ -456,13 +461,13 @@
   int f_idx = 0;
   if (features_to_get & FEATURE_SMS_NONE_FLAG) {
     for (int sub_idx = 0; sub_idx < 2; sub_idx++) {
-      features[f_idx++] = logf(1.0f + pc_tree->sms_none_feat[sub_idx]);
+      features[f_idx++] = logf(1.0f + sms_tree->sms_none_feat[sub_idx]);
     }
   }
 
   if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
-    for (int sub_idx = 0; sub_idx < 4; sub_idx++) {
-      PC_TREE *sub_tree = pc_tree->split[sub_idx];
+    for (int sub_idx = 0; sub_idx < SUB_PARTITIONS_SPLIT; sub_idx++) {
+      SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[sub_idx];
       features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[0]);
       features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[1]);
     }
@@ -470,9 +475,10 @@
 
   if (features_to_get & FEATURE_SMS_RECT_FLAG) {
     for (int sub_idx = 0; sub_idx < 8; sub_idx++) {
-      features[f_idx++] = logf(1.0f + pc_tree->sms_rect_feat[sub_idx]);
+      features[f_idx++] = logf(1.0f + sms_tree->sms_rect_feat[sub_idx]);
     }
   }
+  aom_clear_system_state();
 
   const MACROBLOCKD *xd = &x->e_mbd;
   set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
@@ -484,8 +490,8 @@
   // Neighbor stuff
   const int has_above = !!xd->above_mbmi;
   const int has_left = !!xd->left_mbmi;
-  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize;
-  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->sb_type : bsize;
+  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : bsize;
+  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->bsize : bsize;
   features[f_idx++] = (float)has_above;
   features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
   features[f_idx++] = (float)mi_size_high_log2[above_bsize];
@@ -494,12 +500,10 @@
   features[f_idx++] = (float)mi_size_high_log2[left_bsize];
 }
 
-void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
-                                         PC_TREE *pc_tree, int mi_row,
-                                         int mi_col, BLOCK_SIZE bsize,
-                                         int *partition_horz_allowed,
-                                         int *partition_vert_allowed,
-                                         int *prune_horz, int *prune_vert) {
+void av1_simple_motion_search_prune_rect(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, int partition_horz_allowed,
+    int partition_vert_allowed, int *prune_horz, int *prune_vert) {
   aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
   const int bsize_idx = convert_bsize_to_idx(bsize);
@@ -525,7 +529,7 @@
 
   // Get features
   float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
-  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
                                            FEATURE_SMS_PRUNE_PART_FLAG);
   for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
@@ -547,7 +551,7 @@
   // Determine if we should prune rectangular partitions.
   if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
       !frame_is_intra_only(cm) &&
-      (*partition_horz_allowed || *partition_vert_allowed) &&
+      (partition_horz_allowed || partition_vert_allowed) &&
       bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
     *prune_horz = probs[PARTITION_HORZ] <= prune_thresh;
     *prune_vert = probs[PARTITION_VERT] <= prune_thresh;
@@ -560,16 +564,14 @@
 //  - The frame is not intra only
 //  - The current bsize is > BLOCK_8X8
 //  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
-void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
-                                              MACROBLOCK *x, PC_TREE *pc_tree,
-                                              int mi_row, int mi_col,
-                                              BLOCK_SIZE bsize,
-                                              const RD_STATS *none_rdc,
-                                              int *early_terminate) {
+void av1_simple_motion_search_early_term_none(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
+    int *early_terminate) {
   // TODO(chiyotsai@google.com): There are other features we can extract from
   // PARTITION_NONE. Play with this later.
   float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f };
-  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
                                            FEATURE_SMS_PRUNE_PART_FLAG);
   int f_idx = FEATURE_SIZE_SMS_PRUNE_PART;
@@ -714,7 +716,8 @@
   assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED);
 }
 
-BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
                                      const float *features) {
   float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f },
         probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
@@ -750,7 +753,7 @@
   } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
              ADAPT_PRED) {
     const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
-    MACROBLOCKD *const xd = &x->e_mbd;
+    const MACROBLOCKD *const xd = &x->e_mbd;
     // TODO(debargha): x->source_variance is unavailable at this point,
     // so compute. The redundant recomputation later can be removed.
     const unsigned int source_variance =
@@ -774,24 +777,24 @@
 }
 
 // Get the minimum partition block width and height(in log scale) under a
-// PC_TREE.
-static AOM_INLINE void get_min_bsize(const PC_TREE *pc_tree, int *min_bw,
-                                     int *min_bh) {
-  if (!pc_tree) return;
+// SIMPLE_MOTION_DATA_TREE.
+static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                     int *min_bw, int *min_bh) {
+  if (!sms_tree) return;
 
-  const BLOCK_SIZE bsize = pc_tree->block_size;
+  const BLOCK_SIZE bsize = sms_tree->block_size;
   if (bsize == BLOCK_4X4) {
     *min_bw = 0;
     *min_bh = 0;
     return;
   }
 
-  PARTITION_TYPE part_type = pc_tree->partitioning;
+  PARTITION_TYPE part_type = sms_tree->partitioning;
   if (part_type == PARTITION_INVALID) return;
 
   if (part_type == PARTITION_SPLIT) {
-    for (int i = 0; i < 4; ++i) {
-      get_min_bsize(pc_tree->split[i], min_bw, min_bh);
+    for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+      get_min_bsize(sms_tree->split[i], min_bw, min_bh);
     }
   } else {
     if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B ||
@@ -815,9 +818,9 @@
 
 #define FEATURES 31
 void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
-                                   PC_TREE *const pc_tree, BLOCK_SIZE bsize,
-                                   int64_t best_rd, int64_t part_none_rd,
-                                   int64_t part_split_rd,
+                                   SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                   BLOCK_SIZE bsize, int64_t best_rd,
+                                   int64_t part_none_rd, int64_t part_split_rd,
                                    int64_t *split_block_rd, int mi_row,
                                    int mi_col,
                                    int *const terminate_partition_search) {
@@ -870,30 +873,30 @@
   add_rd_feature(part_none_rd, best_rd, features, &f_idx);
   add_rd_feature(part_split_rd, best_rd, features, &f_idx);
 
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     add_rd_feature(split_block_rd[i], best_rd, features, &f_idx);
     int min_bw = MAX_SB_SIZE_LOG2;
     int min_bh = MAX_SB_SIZE_LOG2;
-    get_min_bsize(pc_tree->split[i], &min_bw, &min_bh);
+    get_min_bsize(sms_tree->split[i], &min_bw, &min_bh);
     features[f_idx++] = (float)min_bw;
     features[f_idx++] = (float)min_bh;
   }
 
-  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, NULL,
                                            FEATURE_SMS_PRUNE_PART_FLAG);
 
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_none_feat[1]);
 
-  features[f_idx++] = logf(1.0f + (float)pc_tree->split[0]->sms_none_feat[1]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->split[1]->sms_none_feat[1]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->split[2]->sms_none_feat[1]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->split[3]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->split[0]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->split[1]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->split[2]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->split[3]->sms_none_feat[1]);
 
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[1]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[3]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[5]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[7]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_rect_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_rect_feat[3]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_rect_feat[5]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_rect_feat[7]);
 
   assert(f_idx == FEATURES);
 
@@ -947,7 +950,7 @@
   for (int i = 0; i < 5; i++) features[i] = 1.0f;
   if (none_rd > 0 && none_rd < 1000000000)
     features[0] = (float)none_rd / (float)best_rd;
-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
     if (split_rd[i] > 0 && split_rd[i] < 1000000000)
       features[1 + i] = (float)split_rd[i] / (float)best_rd;
   }
@@ -964,12 +967,12 @@
   }
   whole_block_variance = AOMMAX(whole_block_variance, 1);
 
-  int split_variance[4];
+  int split_variance[SUB_PARTITIONS_SPLIT];
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
   struct buf_2d buf;
   buf.stride = x->plane[0].src.stride;
   const int bw = block_size_wide[bsize];
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     const int x_idx = (i & 1) * bw / 2;
     const int y_idx = (i >> 1) * bw / 2;
     buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
@@ -981,7 +984,7 @@
     }
   }
 
-  for (int i = 0; i < 4; i++)
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++)
     features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
 
   // 2. Do the prediction and prune 0-2 partitions based on their probabilities
@@ -999,13 +1002,12 @@
 
 // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
 // considered.
-void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
-                               int64_t best_rd, int64_t horz_rd[2],
-                               int64_t vert_rd[2], int64_t split_rd[4],
-                               int *const horza_partition_allowed,
-                               int *const horzb_partition_allowed,
-                               int *const verta_partition_allowed,
-                               int *const vertb_partition_allowed) {
+void av1_ml_prune_ab_partition(
+    BLOCK_SIZE bsize, int part_ctx, int var_ctx, int64_t best_rd,
+    int64_t horz_rd[SUB_PARTITIONS_RECT], int64_t vert_rd[SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const horza_partition_allowed,
+    int *const horzb_partition_allowed, int *const verta_partition_allowed,
+    int *const vertb_partition_allowed) {
   if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
   const NN_CONFIG *nn_config = NULL;
   switch (bsize) {
@@ -1028,17 +1030,17 @@
   const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
   int sub_block_rdcost[8] = { 0 };
   int rd_index = 0;
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
     if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)horz_rd[i];
     ++rd_index;
   }
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
     if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)vert_rd[i];
     ++rd_index;
   }
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     if (split_rd[i] > 0 && split_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)split_rd[i];
     ++rd_index;
@@ -1087,15 +1089,16 @@
 #define FEATURES 18
 #define LABELS 4
 // Use a ML model to predict if horz4 and vert4 should be considered.
-void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              BLOCK_SIZE bsize, int part_ctx, int64_t best_rd,
-                              int64_t horz_rd[2], int64_t vert_rd[2],
-                              int64_t split_rd[4],
-                              int *const partition_horz4_allowed,
-                              int *const partition_vert4_allowed,
-                              unsigned int pb_source_variance, int mi_row,
-                              int mi_col) {
+void av1_ml_prune_4_partition(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    int part_ctx, int64_t best_rd,
+    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+    int *const partition_vert4_allowed, unsigned int pb_source_variance,
+    int mi_row, int mi_col) {
   if (best_rd >= 1000000000) return;
+  int64_t *horz_rd = rect_part_rd[HORZ];
+  int64_t *vert_rd = rect_part_rd[VERT];
   const NN_CONFIG *nn_config = NULL;
   switch (bsize) {
     case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
@@ -1116,17 +1119,17 @@
   const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
   int sub_block_rdcost[8] = { 0 };
   int rd_index = 0;
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
     if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)horz_rd[i];
     ++rd_index;
   }
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
     if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)vert_rd[i];
     ++rd_index;
   }
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     if (split_rd[i] > 0 && split_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)split_rd[i];
     ++rd_index;
@@ -1140,8 +1143,8 @@
   }
 
   // Get variance of the 1:4 and 4:1 sub-blocks.
-  unsigned int horz_4_source_var[4] = { 0 };
-  unsigned int vert_4_source_var[4] = { 0 };
+  unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
   {
     BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
     BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
@@ -1155,7 +1158,7 @@
     horz_4_src.stride = src_stride;
     vert_4_src.stride = src_stride;
 
-    for (int i = 0; i < 4; ++i) {
+    for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
       horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
       vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
 
@@ -1176,14 +1179,14 @@
   const float denom = (float)(pb_source_variance + 1);
   const float low_b = 0.1f;
   const float high_b = 10.0f;
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
     // Ratio between the 4:1 sub-block variance and the whole-block variance.
     float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
     if (var_ratio < low_b) var_ratio = low_b;
     if (var_ratio > high_b) var_ratio = high_b;
     features[feature_index++] = var_ratio;
   }
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
     // Ratio between the 1:4 sub-block RD and the whole-block RD.
     float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
     if (var_ratio < low_b) var_ratio = low_b;
@@ -1227,7 +1230,7 @@
 int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                             const MACROBLOCK *const x,
                             const RD_STATS *const rd_stats,
-                            unsigned int pb_source_variance) {
+                            unsigned int pb_source_variance, int bit_depth) {
   const NN_CONFIG *nn_config = NULL;
   int thresh = 0;
   switch (bsize) {
@@ -1255,6 +1258,11 @@
   }
   if (!nn_config || thresh < 0) return 0;
 
+  const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f };
+  thresh = (int)((float)thresh *
+                 ml_predict_breakout_thresh_scale
+                     [cpi->sf.part_sf.ml_predict_breakout_level - 1]);
+
   // Generate feature values.
   float features[FEATURES];
   int feature_index = 0;
@@ -1272,7 +1280,7 @@
 
   features[feature_index++] = (float)pb_source_variance;
 
-  const int dc_q = (int)x->plane[0].dequant_QTX[0];
+  const int dc_q = (int)x->plane[0].dequant_QTX[0] >> (bit_depth - 8);
   features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
   assert(feature_index == FEATURES);
 
@@ -1285,4 +1293,317 @@
   return (int)(score * 100) >= thresh;
 }
 #undef FEATURES
+
+void av1_prune_partitions_before_search(
+    AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col,
+    BLOCK_SIZE bsize, SIMPLE_MOTION_DATA_TREE *const sms_tree,
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split, int *prune_horz, int *prune_vert) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  // Prune rectangular, AB and 4-way partition based on q index and block size
+  if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx) {
+    // Enumeration difference between two square partitions
+    const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16;
+    int max_bsize =
+        BLOCK_32X32 - (x->qindex * 3 / QINDEX_RANGE) * sqr_bsize_step;
+    max_bsize = AOMMAX(max_bsize, BLOCK_4X4);
+    const BLOCK_SIZE max_prune_bsize =
+        (BLOCK_SIZE)AOMMIN(max_bsize, BLOCK_32X32);
+
+    // Prune partition
+    // qidx 0 to 85: prune bsize below BLOCK_32X32
+    // qidx 86 to 170: prune bsize below BLOCK_16X16
+    // qidx 171 to 255: prune bsize below BLOCK_8X8
+    if (bsize < max_prune_bsize) {
+      *do_rectangular_split = 0;
+      *partition_horz_allowed = 0;
+      *partition_vert_allowed = 0;
+    }
+  }
+
+  if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) {
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    int prune_sub_8x8 = 1;
+    if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 1) {
+      int num_neighbors_lt_8x8 = 0;
+      if (xd->left_available)
+        num_neighbors_lt_8x8 += (xd->left_mbmi->bsize <= BLOCK_8X8);
+      if (xd->up_available)
+        num_neighbors_lt_8x8 += (xd->above_mbmi->bsize <= BLOCK_8X8);
+      // Evaluate only if both left and above blocks are of size <= BLOCK_8X8.
+      if (num_neighbors_lt_8x8 == 2) {
+        prune_sub_8x8 = 0;
+      }
+    }
+    if (prune_sub_8x8) {
+      *partition_horz_allowed = 0;
+      *partition_vert_allowed = 0;
+      *do_square_split = 0;
+    }
+  }
+
+  // A CNN-based speed feature pruning out either split or all non-split
+  // partition in INTRA frame coding.
+  const int try_intra_cnn_split =
+      !cpi->use_screen_content_tools && frame_is_intra_only(cm) &&
+      cpi->sf.part_sf.intra_cnn_split &&
+      cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+      bsize >= BLOCK_8X8 &&
+      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+
+  if (try_intra_cnn_split) {
+    av1_intra_mode_cnn_partition(
+        &cpi->common, x, bsize, x->part_search_info.quad_tree_idx,
+        partition_none_allowed, partition_horz_allowed, partition_vert_allowed,
+        do_rectangular_split, do_square_split);
+  }
+
+  // Use simple motion search to prune out split or non-split partitions. This
+  // must be done prior to PARTITION_SPLIT to propagate the initial mvs to a
+  // smaller blocksize.
+  const int try_split_only =
+      !cpi->use_screen_content_tools &&
+      cpi->sf.part_sf.simple_motion_search_split && *do_square_split &&
+      bsize >= BLOCK_8X8 &&
+      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols &&
+      !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
+
+  if (try_split_only) {
+    av1_simple_motion_search_based_split(
+        cpi, x, sms_tree, mi_row, mi_col, bsize, partition_none_allowed,
+        partition_horz_allowed, partition_vert_allowed, do_rectangular_split,
+        do_square_split);
+  }
+
+  // Use simple motion search to prune out rectangular partition in some
+  // direction. The results are stored in prune_horz and prune_vert in order to
+  // bypass future related pruning checks if a pruning decision has been made.
+  const int try_prune_rect =
+      !cpi->use_screen_content_tools &&
+      cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) && *do_rectangular_split &&
+      (*do_square_split || *partition_none_allowed ||
+       (*prune_horz && *prune_vert)) &&
+      (*partition_horz_allowed || *partition_vert_allowed) &&
+      bsize >= BLOCK_8X8;
+
+  if (try_prune_rect) {
+    av1_simple_motion_search_prune_rect(
+        cpi, x, sms_tree, mi_row, mi_col, bsize, *partition_horz_allowed,
+        *partition_vert_allowed, prune_horz, prune_vert);
+  }
+}
+
+#ifndef NDEBUG
+static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) {
+  return block_size_wide[bsize] == block_size_high[bsize];
+}
+#endif  // NDEBUG
+
+void av1_prune_partitions_by_max_min_bsize(
+    SuperBlockEnc *sb_enc, BLOCK_SIZE bsize, int is_not_edge_block,
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_square_split) {
+  assert(is_bsize_square(sb_enc->max_partition_size));
+  assert(is_bsize_square(sb_enc->min_partition_size));
+  assert(sb_enc->min_partition_size <= sb_enc->max_partition_size);
+  assert(is_bsize_square(bsize));
+  const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size];
+  const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size];
+  const int bsize_1d = block_size_wide[bsize];
+  assert(min_partition_size_1d <= max_partition_size_1d);
+  const int is_le_min_sq_part = bsize_1d <= min_partition_size_1d;
+  const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d;
+  if (is_gt_max_sq_part) {
+    // If current block size is larger than max, only allow split.
+    *partition_none_allowed = 0;
+    *partition_horz_allowed = 0;
+    *partition_vert_allowed = 0;
+    *do_square_split = 1;
+  } else if (is_le_min_sq_part) {
+    // If current block size is less or equal to min, only allow none if valid
+    // block large enough; only allow split otherwise.
+    *partition_horz_allowed = 0;
+    *partition_vert_allowed = 0;
+    // only disable square split when current block is not at the picture
+    // boundary. otherwise, inherit the square split flag from previous logic
+    if (is_not_edge_block) *do_square_split = 0;
+    *partition_none_allowed = !(*do_square_split);
+  }
+}
+
+// Decide whether to evaluate the AB partition specified by part_type based on
+// split and HORZ/VERT info
+int evaluate_ab_partition_based_on_split(
+    const PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+    const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+    int split_idx2) {
+  int num_win = 0;
+  // Threshold for number of winners
+  // Conservative pruning for high quantizers
+  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
+  int sub_part_win = (rect_part_win_info == NULL)
+                         ? (pc_tree->partitioning == rect_part)
+                         : (rect_part == PARTITION_HORZ)
+                               ? rect_part_win_info->rect_part_win[HORZ]
+                               : rect_part_win_info->rect_part_win[VERT];
+  num_win += (sub_part_win) ? 1 : 0;
+  if (pc_tree->split[split_idx1]) {
+    num_win +=
+        (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
+  } else {
+    num_win += 1;
+  }
+  if (pc_tree->split[split_idx2]) {
+    num_win +=
+        (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
+  } else {
+    num_win += 1;
+  }
+  if (num_win < num_win_thresh) {
+    return 0;
+  }
+  return 1;
+}
+
+void av1_prune_ab_partitions(
+    const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree,
+    BLOCK_SIZE bsize, int pb_source_variance, int64_t best_rdcost,
+    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT],
+    const RD_RECT_PART_WIN_INFO *rect_part_win_info, int ext_partition_allowed,
+    int partition_horz_allowed, int partition_vert_allowed,
+    int *horza_partition_allowed, int *horzb_partition_allowed,
+    int *verta_partition_allowed, int *vertb_partition_allowed) {
+  int64_t *horz_rd = rect_part_rd[HORZ];
+  int64_t *vert_rd = rect_part_rd[VERT];
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  // The standard AB partitions are allowed initially if ext-partition-types are
+  // allowed.
+  int horzab_partition_allowed =
+      ext_partition_allowed & part_cfg->enable_ab_partitions;
+  int vertab_partition_allowed =
+      ext_partition_allowed & part_cfg->enable_ab_partitions;
+
+  // Pruning: pruning out AB partitions on one main direction based on the
+  // current best partition and source variance.
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+    if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) {
+      // TODO(debargha,huisu@google.com): may need to tune the threshold for
+      // pb_source_variance.
+      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                   (pc_tree->partitioning == PARTITION_NONE &&
+                                    pb_source_variance < 32) ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                   (pc_tree->partitioning == PARTITION_NONE &&
+                                    pb_source_variance < 32) ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+    } else {
+      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+    }
+    horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
+    horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
+    vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
+    vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
+    split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
+    split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
+    split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
+    split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+  }
+
+  // Pruning: pruning out horz_a or horz_b if the combined rdcost of its
+  // subblocks estimated from previous partitions is much higher than the best
+  // rd so far.
+  *horza_partition_allowed = horzab_partition_allowed;
+  *horzb_partition_allowed = horzab_partition_allowed;
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+    const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
+    const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
+    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+      case 1:
+        *horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdcost);
+        *horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdcost);
+        break;
+      case 2:
+      default:
+        *horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdcost);
+        *horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdcost);
+        break;
+    }
+  }
+
+  // Pruning: pruning out vert_a or vert_b if the combined rdcost of its
+  // subblocks estimated from previous partitions is much higher than the best
+  // rd so far.
+  *verta_partition_allowed = vertab_partition_allowed;
+  *vertb_partition_allowed = vertab_partition_allowed;
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+    const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
+    const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
+    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+      case 1:
+        *verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdcost);
+        *vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdcost);
+        break;
+      case 2:
+      default:
+        *verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdcost);
+        *vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdcost);
+        break;
+    }
+  }
+
+  // Pruning: pruning out some ab partitions using a DNN taking rd costs of
+  // sub-blocks from previous basic partition types.
+  if (cpi->sf.part_sf.ml_prune_ab_partition && ext_partition_allowed &&
+      partition_horz_allowed && partition_vert_allowed) {
+    // TODO(huisu@google.com): x->source_variance may not be the current
+    // block's variance. The correct one to use is pb_source_variance. Need to
+    // re-train the model to fix it.
+    av1_ml_prune_ab_partition(bsize, pc_tree->partitioning,
+                              get_unsigned_bits(x->source_variance),
+                              best_rdcost, horz_rd, vert_rd, split_rd,
+                              horza_partition_allowed, horzb_partition_allowed,
+                              verta_partition_allowed, vertb_partition_allowed);
+  }
+
+  // Disable ab partitions if they are disabled by the encoder parameter.
+  *horza_partition_allowed &= part_cfg->enable_ab_partitions;
+  *horzb_partition_allowed &= part_cfg->enable_ab_partitions;
+  *verta_partition_allowed &= part_cfg->enable_ab_partitions;
+  *vertb_partition_allowed &= part_cfg->enable_ab_partitions;
+
+  // Pruning: pruning AB partitions based on the number of horz/vert wins
+  // in the current block and sub-blocks in PARTITION_SPLIT.
+  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
+      *horza_partition_allowed) {
+    *horza_partition_allowed &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1);
+  }
+  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
+      *horzb_partition_allowed) {
+    *horzb_partition_allowed &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3);
+  }
+  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
+      *verta_partition_allowed) {
+    *verta_partition_allowed &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2);
+  }
+  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
+      *vertb_partition_allowed) {
+    *vertb_partition_allowed &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
+  }
+}
+
 #endif  // !CONFIG_REALTIME_ONLY
diff --git a/av1/encoder/partition_strategy.h b/av1/encoder/partition_strategy.h
index f9b4d8b..0527a94 100644
--- a/av1/encoder/partition_strategy.h
+++ b/av1/encoder/partition_strategy.h
@@ -33,6 +33,38 @@
 #define FEATURE_SMS_SPLIT_MODEL_FLAG \
   (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
 
+// Number of sub-partitions in rectangular partition types.
+#define SUB_PARTITIONS_RECT 2
+
+// Number of sub-partitions in split partition type.
+#define SUB_PARTITIONS_SPLIT 4
+
+// Number of sub-partitions in AB partition types.
+#define SUB_PARTITIONS_AB 3
+
+// Number of sub-partitions in 4-way partition types.
+#define SUB_PARTITIONS_PART4 4
+
+// 4part parition types.
+enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES);
+
+// AB parition types.
+enum {
+  HORZ_A = 0,
+  HORZ_B,
+  VERT_A,
+  VERT_B,
+  NUM_AB_PARTS
+} UENUM1BYTE(AB_PART_TYPE);
+
+// Rectangular parition types.
+enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE);
+
+// Structure to keep win flags for HORZ and VERT partition evaluations.
+typedef struct {
+  int rect_part_win[NUM_RECT_PARTS];
+} RD_RECT_PART_WIN_INFO;
+
 void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
                                   int bsize, int label_idx,
                                   int *partition_none_allowed,
@@ -45,20 +77,18 @@
 // the variance of residues. Then use the features to determine whether we want
 // to go straight to splitting without trying PARTITION_NONE
 void av1_simple_motion_search_based_split(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
     int *partition_horz_allowed, int *partition_vert_allowed,
     int *do_rectangular_split, int *do_square_split);
 
 // Performs a simple_motion_search with two reference frames and extract
 // the variance of residues. Then use the features to determine whether we want
 // to prune some partitions.
-void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
-                                         PC_TREE *pc_tree, int mi_row,
-                                         int mi_col, BLOCK_SIZE bsize,
-                                         int *partition_horz_allowed,
-                                         int *partition_vert_allowed,
-                                         int *prune_horz, int *prune_vert);
+void av1_simple_motion_search_prune_rect(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, int partition_horz_allowed,
+    int partition_vert_allowed, int *prune_horz, int *prune_vert);
 
 #if !CONFIG_REALTIME_ONLY
 // Early terminates PARTITION_NONE using simple_motion_search features and the
@@ -67,12 +97,10 @@
 //  - The frame is not intra only
 //  - The current bsize is > BLOCK_8X8
 //  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
-void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
-                                              MACROBLOCK *x, PC_TREE *pc_tree,
-                                              int mi_row, int mi_col,
-                                              BLOCK_SIZE bsize,
-                                              const RD_STATS *none_rdc,
-                                              int *early_terminate);
+void av1_simple_motion_search_early_term_none(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
+    int *early_terminate);
 
 // Get the features for selecting the max and min partition size. Currently this
 // performs simple_motion_search on 16X16 subblocks of the current superblock,
@@ -82,14 +110,15 @@
                                         float *features);
 
 // Predict the maximum BLOCK_SIZE to be used to encoder the current superblock.
-BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
                                      const float *features);
 
 // Attempts an early termination after PARTITION_SPLIT.
 void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
-                                   PC_TREE *const pc_tree, BLOCK_SIZE bsize,
-                                   int64_t best_rd, int64_t part_none_rd,
-                                   int64_t part_split_rd,
+                                   SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                   BLOCK_SIZE bsize, int64_t best_rd,
+                                   int64_t part_none_rd, int64_t part_split_rd,
                                    int64_t *split_block_rd, int mi_row,
                                    int mi_col,
                                    int *const terminate_partition_search);
@@ -108,29 +137,60 @@
 
 // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
 // considered.
-void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
-                               int64_t best_rd, int64_t horz_rd[2],
-                               int64_t vert_rd[2], int64_t split_rd[4],
-                               int *const horza_partition_allowed,
-                               int *const horzb_partition_allowed,
-                               int *const verta_partition_allowed,
-                               int *const vertb_partition_allowed);
+void av1_ml_prune_ab_partition(
+    BLOCK_SIZE bsize, int part_ctx, int var_ctx, int64_t best_rd,
+    int64_t horz_rd[SUB_PARTITIONS_RECT], int64_t vert_rd[SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const horza_partition_allowed,
+    int *const horzb_partition_allowed, int *const verta_partition_allowed,
+    int *const vertb_partition_allowed);
 
 // Use a ML model to predict if horz4 and vert4 should be considered.
-void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              BLOCK_SIZE bsize, int part_ctx, int64_t best_rd,
-                              int64_t horz_rd[2], int64_t vert_rd[2],
-                              int64_t split_rd[4],
-                              int *const partition_horz4_allowed,
-                              int *const partition_vert4_allowed,
-                              unsigned int pb_source_variance, int mi_row,
-                              int mi_col);
+void av1_ml_prune_4_partition(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    int part_ctx, int64_t best_rd,
+    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+    int *const partition_vert4_allowed, unsigned int pb_source_variance,
+    int mi_row, int mi_col);
 
-// ML-based partition search breakout after PARTITION_NONE
+// ML-based partition search breakout after PARTITION_NONE.
 int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                             const MACROBLOCK *const x,
                             const RD_STATS *const rd_stats,
-                            unsigned int pb_source_variance);
+                            unsigned int pb_source_variance, int bit_depth);
+
+// The first round of partition pruning determined before any partition
+// has been tested. The decisions will be updated and passed back
+// to the partition search function.
+void av1_prune_partitions_before_search(
+    AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col,
+    BLOCK_SIZE bsize, SIMPLE_MOTION_DATA_TREE *const sms_tree,
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split, int *prune_horz, int *prune_vert);
+
+// Prune out partitions that lead to coding block sizes outside the min and max
+// bsizes set by the encoder. Max and min square partition levels are defined as
+// the partition nodes that the recursive function rd_pick_partition() can
+// reach. To implement this: only PARTITION_NONE is allowed if the current node
+// equals max_partition_size, only PARTITION_SPLIT is allowed if the current
+// node exceeds max_partition_size.
+void av1_prune_partitions_by_max_min_bsize(
+    SuperBlockEnc *sb_enc, BLOCK_SIZE bsize, int is_not_edge_block,
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_square_split);
+
+// Prune out AB partitions based on rd decisions made from testing the
+// basic partitions.
+void av1_prune_ab_partitions(
+    const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree,
+    BLOCK_SIZE bsize, int pb_source_variance, int64_t best_rdcost,
+    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT],
+    const RD_RECT_PART_WIN_INFO *rect_part_win_info, int ext_partition_allowed,
+    int partition_horz_allowed, int partition_vert_allowed,
+    int *horza_partition_allowed, int *horzb_partition_allowed,
+    int *verta_partition_allowed, int *vertb_partition_allowed);
 #endif  // !CONFIG_REALTIME_ONLY
 
 // A simplified version of set_offsets meant to be used for
@@ -176,19 +236,19 @@
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
 }
 
-static INLINE void init_simple_motion_search_mvs(PC_TREE *pc_tree) {
-  av1_zero(pc_tree->start_mvs);
+static INLINE void init_simple_motion_search_mvs(
+    SIMPLE_MOTION_DATA_TREE *sms_tree) {
+  av1_zero(sms_tree->start_mvs);
+  av1_zero(sms_tree->sms_none_feat);
+  av1_zero(sms_tree->sms_rect_feat);
+  av1_zero(sms_tree->sms_none_valid);
+  av1_zero(sms_tree->sms_rect_valid);
 
-  av1_zero(pc_tree->sms_none_feat);
-  av1_zero(pc_tree->sms_rect_feat);
-  av1_zero(pc_tree->sms_none_valid);
-  av1_zero(pc_tree->sms_rect_valid);
-
-  if (pc_tree->block_size >= BLOCK_8X8) {
-    init_simple_motion_search_mvs(pc_tree->split[0]);
-    init_simple_motion_search_mvs(pc_tree->split[1]);
-    init_simple_motion_search_mvs(pc_tree->split[2]);
-    init_simple_motion_search_mvs(pc_tree->split[3]);
+  if (sms_tree->block_size >= BLOCK_8X8) {
+    init_simple_motion_search_mvs(sms_tree->split[0]);
+    init_simple_motion_search_mvs(sms_tree->split[1]);
+    init_simple_motion_search_mvs(sms_tree->split[2]);
+    init_simple_motion_search_mvs(sms_tree->split[3]);
   }
 }
 
@@ -204,13 +264,13 @@
 // Do not use this criteria for screen content videos.
 // Since screen content videos could often find good predictors and the largest
 // block size is likely to be used.
-static INLINE int use_auto_max_partition(AV1_COMP *const cpi,
+static INLINE int use_auto_max_partition(const AV1_COMP *const cpi,
                                          BLOCK_SIZE sb_size, int mi_row,
                                          int mi_col) {
   assert(IMPLIES(cpi->gf_group.size > 0,
                  cpi->gf_group.index < cpi->gf_group.size));
-  AV1_COMMON *const cm = &cpi->common;
-  return !frame_is_intra_only(cm) && !cpi->is_screen_content_type &&
+  const AV1_COMMON *const cm = &cpi->common;
+  return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools &&
          cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
              NOT_IN_USE &&
          sb_size == BLOCK_128X128 &&
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 6adc1fb..f16b0d2 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -9,6 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\defgroup gf_group_algo Golden Frame Group
+ * \ingroup high_level_algo
+ * Algorithms regarding determining the length of GF groups and defining GF
+ * group structures.
+ * @{
+ */
+/*! @} - end defgroup gf_group_algo */
+
 #include <stdint.h>
 
 #include "config/aom_config.h"
@@ -26,6 +34,8 @@
 #include "av1/encoder/gop_structure.h"
 #include "av1/encoder/pass2_strategy.h"
 #include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/temporal_filter.h"
 #include "av1/encoder/tpl_model.h"
 #include "av1/encoder/use_flat_gop_model_params.h"
 #include "av1/encoder/encode_strategy.h"
@@ -64,7 +74,7 @@
   double modified_error =
       av_err * pow(this_frame->coded_error * this_frame->weight /
                        DOUBLE_DIVIDE_CHECK(av_err),
-                   oxcf->two_pass_vbrbias / 100.0);
+                   oxcf->rc_cfg.vbrbias / 100.0);
 
   // Correction for active area. Frames with a reduced active area
   // (eg due to formatting bars) have a higher error per mb for the
@@ -145,7 +155,7 @@
 static int frame_max_bits(const RATE_CONTROL *rc,
                           const AV1EncoderConfig *oxcf) {
   int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
-                      (int64_t)oxcf->two_pass_vbrmax_section) /
+                      (int64_t)oxcf->rc_cfg.vbrmax_section) /
                      100;
   if (max_bits < 0)
     max_bits = 0;
@@ -181,7 +191,7 @@
 }
 
 static int qbpm_enumerator(int rate_err_tol) {
-  return 1350000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75);
+  return 1200000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75);
 }
 
 // Similar to find_qindex_by_rate() function in ratectrl.c, but includes
@@ -211,28 +221,50 @@
   return low;
 }
 
-static int get_twopass_worst_quality(AV1_COMP *cpi, const double section_err,
+/*!\brief Choose a target maximum Q for a group of frames
+ *
+ * \ingroup rate_control
+ *
+ * This function is used to estimate a suitable maximum Q for a
+ * group of frames. Inititally it is called to get a crude estimate
+ * for the whole clip. It is then called for each ARF/GF group to get
+ * a revised estimate for that group.
+ *
+ * \param[in]    cpi                 Top-level encoder structure
+ * \param[in]    av_frame_err        The average per frame coded error score
+ *                                   for frames making up this section/group.
+ * \param[in]    inactive_zone       Used to mask off /ignore part of the
+ *                                   frame. The most common use case is where
+ *                                   a wide format video (e.g. 16:9) is
+ *                                   letter-boxed into a more square format.
+ *                                   Here we want to ignore the bands at the
+ *                                   top and bottom.
+ * \param[in]    av_target_bandwidth The target bits per frame
+ * \param[in]    group_weight_factor A correction factor allowing the algorithm
+ *                                   to correct for errors over time.
+ *
+ * \return The maximum Q for frames in the group.
+ */
+static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err,
                                      double inactive_zone,
-                                     int section_target_bandwidth,
+                                     int av_target_bandwidth,
                                      double group_weight_factor) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
   inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
 
-  if (section_target_bandwidth <= 0) {
+  if (av_target_bandwidth <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+    const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
                             ? cpi->initial_mbs
                             : cpi->common.mi_params.MBs;
     const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
-    const double av_err_per_mb = section_err / active_mbs;
+    const double av_err_per_mb = av_frame_err / active_mbs;
     const int target_norm_bits_per_mb =
-        (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
-        active_mbs;
-    int rate_err_tol =
-        AOMMIN(cpi->oxcf.under_shoot_pct, cpi->oxcf.over_shoot_pct);
+        (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs;
+    int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
 
     twopass_update_bpm_factor(&cpi->twopass);
     // Try and pick a max Q that will be high enough to encode the
@@ -243,7 +275,7 @@
         rc->worst_quality);
 
     // Restriction on active max q for constrained quality mode.
-    if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
+    if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level);
     return q;
   }
 }
@@ -384,11 +416,12 @@
   gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows;
 }
 
-static void accumulate_next_frame_stats(
-    const FIRSTPASS_STATS *stats, const FRAME_INFO *frame_info,
-    TWO_PASS *const twopass, const int flash_detected,
-    const int frames_since_key, const int cur_idx, const int can_disable_arf,
-    const int min_gf_interval, GF_GROUP_STATS *gf_stats) {
+static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats,
+                                        const FRAME_INFO *frame_info,
+                                        const int flash_detected,
+                                        const int frames_since_key,
+                                        const int cur_idx,
+                                        GF_GROUP_STATS *gf_stats) {
   accumulate_frame_motion_stats(stats, gf_stats);
   // sum up the metric values of current gf group
   gf_stats->avg_sr_coded_error += stats->sr_coded_error;
@@ -416,15 +449,6 @@
           AOMMIN(gf_stats->zero_motion_accumulator,
                  get_zero_motion_factor(frame_info, stats));
     }
-
-    // Break clause to detect very still sections after motion. For example,
-    // a static image after a fade or other transition.
-    if (can_disable_arf &&
-        detect_transition_to_still(twopass, min_gf_interval, cur_idx, 5,
-                                   gf_stats->loop_decay_rate,
-                                   gf_stats->last_loop_decay_rate)) {
-      gf_stats->allow_alt_ref = 0;
-    }
   }
 }
 
@@ -696,7 +720,18 @@
   return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
 }
 
-// Calculate the total bits to allocate in this GF/ARF group.
+/*!\brief Calculates the bit target for this GF/ARF group
+ *
+ * \ingroup rate_control
+ *
+ * Calculates the total bits to allocate in this GF/ARF group.
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    gf_group_err     Cumulative coded error score for the
+ *                                frames making up this group.
+ *
+ * \return The target total number of bits for this GF/ARF group.
+ */
 static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
                                              double gf_group_err) {
   const RATE_CONTROL *const rc = &cpi->rc;
@@ -813,9 +848,7 @@
   return bits_assigned;
 }
 
-// Compile time switch on alternate algorithm to allocate bits in ARF groups
-// #define ALT_ARF_ALLOCATION
-#ifdef ALT_ARF_ALLOCATION
+// Allocate bits to each frame in a GF / ARF group
 double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0,  0.70, 0.55, 0.60,
                                               0.60, 1.0,  1.0 };
 static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
@@ -826,26 +859,17 @@
   const int gf_group_size = gf_group->size;
   int layer_frames[MAX_ARF_LAYERS + 1] = { 0 };
 
-  // Subtract the extra bits set aside for ARF frames from the Group Total
-  if (use_arf || !key_frame) total_group_bits -= gf_arf_bits;
-
-  if (rc->baseline_gf_interval)
-    base_frame_bits = (int)(total_group_bits / rc->baseline_gf_interval);
-  else
-    base_frame_bits = (int)1;
-
   // For key frames the frame target rate is already set and it
   // is also the golden frame.
   // === [frame_index == 0] ===
-  int frame_index = 0;
-  if (!key_frame) {
-    if (rc->source_alt_ref_active)
-      gf_group->bit_allocation[frame_index] = 0;
-    else
-      gf_group->bit_allocation[frame_index] =
-          base_frame_bits + (int)(gf_arf_bits * layer_fraction[1]);
-  }
-  frame_index++;
+  int frame_index = !!key_frame;
+
+  // Subtract the extra bits set aside for ARF frames from the Group Total
+  if (use_arf) total_group_bits -= gf_arf_bits;
+
+  int num_frames =
+      AOMMAX(1, rc->baseline_gf_interval - (rc->frames_since_key == 0));
+  base_frame_bits = (int)(total_group_bits / num_frames);
 
   // Check the number of frames in each layer in case we have a
   // non standard group length.
@@ -853,7 +877,6 @@
   for (int idx = frame_index; idx < gf_group_size; ++idx) {
     if ((gf_group->update_type[idx] == ARF_UPDATE) ||
         (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
-      // max_arf_layer = AOMMAX(max_arf_layer, gf_group->layer_depth[idx]);
       layer_frames[gf_group->layer_depth[idx]]++;
     }
   }
@@ -890,90 +913,20 @@
   // simplify logics in reference frame management.
   gf_group->bit_allocation[gf_group_size] = 0;
 }
-#else
-static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
-                                   int64_t gf_group_bits, int gf_arf_bits,
-                                   int key_frame, int use_arf) {
-  int64_t total_group_bits = gf_group_bits;
-
-  // For key frames the frame target rate is already set and it
-  // is also the golden frame.
-  // === [frame_index == 0] ===
-  int frame_index = 0;
-  if (!key_frame) {
-    if (rc->source_alt_ref_active)
-      gf_group->bit_allocation[frame_index] = 0;
-    else
-      gf_group->bit_allocation[frame_index] = gf_arf_bits;
-  }
-
-  // Deduct the boost bits for arf (or gf if it is not a key frame)
-  // from the group total.
-  if (use_arf || !key_frame) total_group_bits -= gf_arf_bits;
-
-  frame_index++;
-
-  // Store the bits to spend on the ARF if there is one.
-  // === [frame_index == 1] ===
-  if (use_arf) {
-    gf_group->bit_allocation[frame_index] = gf_arf_bits;
-    ++frame_index;
-  }
-
-  const int gf_group_size = gf_group->size;
-  int arf_depth_bits[MAX_ARF_LAYERS + 1] = { 0 };
-  int arf_depth_count[MAX_ARF_LAYERS + 1] = { 0 };
-  int arf_depth_boost[MAX_ARF_LAYERS + 1] = { 0 };
-  int total_arfs = 0;
-  int total_overlays = rc->source_alt_ref_active;
-
-  for (int idx = 0; idx < gf_group_size; ++idx) {
-    if (gf_group->update_type[idx] == ARF_UPDATE ||
-        gf_group->update_type[idx] == INTNL_ARF_UPDATE ||
-        gf_group->update_type[idx] == LF_UPDATE) {
-      arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->arf_boost[idx];
-      ++arf_depth_count[gf_group->layer_depth[idx]];
-    }
-  }
-
-  for (int idx = 2; idx <= MAX_ARF_LAYERS; ++idx) {
-    arf_depth_bits[idx] =
-        calculate_boost_bits(rc->baseline_gf_interval - total_arfs -
-                                 total_overlays - arf_depth_count[idx],
-                             arf_depth_boost[idx], total_group_bits);
-    total_group_bits -= arf_depth_bits[idx];
-    total_arfs += arf_depth_count[idx];
-  }
-
-  for (int idx = frame_index; idx < gf_group_size; ++idx) {
-    switch (gf_group->update_type[idx]) {
-      case ARF_UPDATE:
-      case INTNL_ARF_UPDATE:
-      case LF_UPDATE:
-        gf_group->bit_allocation[idx] =
-            (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] *
-                   gf_group->arf_boost[idx]) /
-                  arf_depth_boost[gf_group->layer_depth[idx]]);
-        break;
-      case INTNL_OVERLAY_UPDATE:
-      case OVERLAY_UPDATE:
-      default: gf_group->bit_allocation[idx] = 0; break;
-    }
-  }
-
-  // Set the frame following the current GOP to 0 bit allocation. For ARF
-  // groups, this next frame will be overlay frame, which is the first frame
-  // in the next GOP. For GF group, next GOP will overwrite the rate allocation.
-  // Setting this frame to use 0 bit (of out the current GOP budget) will
-  // simplify logics in reference frame management.
-  gf_group->bit_allocation[gf_group_size] = 0;
-}
-#endif
 
 // Returns true if KF group and GF group both are almost completely static.
-static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
-  return (gf_zero_motion >= 0.995) &&
-         (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion,
+                                   int is_lap_enabled) {
+  if (is_lap_enabled) {
+    /*
+     * when LAP enabled kf_zero_motion is not reliable, so use strict
+     * constraint on gf_zero_motion.
+     */
+    return (gf_zero_motion >= 0.999);
+  } else {
+    return (gf_zero_motion >= 0.995) &&
+           (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+  }
 }
 
 #define ARF_ABS_ZOOM_THRESH 4.4
@@ -983,9 +936,10 @@
                                 GF_GROUP_STATS *gf_stats) {
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
   // Motion breakout threshold for loop below depends on image size.
   const double mv_ratio_accumulator_thresh =
-      (cpi->initial_height + cpi->initial_width) / 4.0;
+      (initial_dimensions->height + initial_dimensions->width) / 4.0;
 
   if (!flash_detected) {
     // Break clause to detect very still sections after motion. For example,
@@ -1011,252 +965,928 @@
   // so we can continue for more frames.
   if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
       !is_almost_static(gf_stats->zero_motion_accumulator,
-                        twopass->kf_zeromotion_pct)) {
+                        twopass->kf_zeromotion_pct, cpi->lap_enabled)) {
     return 1;
   }
   return 0;
 }
 
-#define MAX_PAD_GF_CHECK 6  // padding length to check for gf length
-#define AVG_SI_THRES 0.6    // thres for average silouette
-#define GF_SHRINK_OUTPUT 0  // print output for gf length decision
-int determine_high_err_gf(double *errs, int *is_high, double *si, int len,
-                          double *ratio, int gf_start, int gf_end,
-                          int before_pad) {
-  (void)gf_start;
-  (void)gf_end;
-  (void)before_pad;
-  // alpha and beta controls the threshold placement
-  // e.g. a smaller alpha makes the lower group more rigid
-  const double alpha = 0.5;
-  const double beta = 1 - alpha;
-  double mean = 0;
-  double mean_low = 0;
-  double mean_high = 0;
-  double prev_mean_low = 0;
-  double prev_mean_high = 0;
-  int count_low = 0;
-  int count_high = 0;
-  // calculate mean of errs
-  for (int i = 0; i < len; i++) {
-    mean += errs[i];
-  }
-  mean /= len;
-  // separate into two initial groups with greater / lower than mean
-  for (int i = 0; i < len; i++) {
-    if (errs[i] <= mean) {
-      is_high[i] = 0;
-      count_low++;
-      prev_mean_low += errs[i];
-    } else {
-      is_high[i] = 1;
-      count_high++;
-      prev_mean_high += errs[i];
-    }
-  }
-  prev_mean_low /= count_low;
-  prev_mean_high /= count_high;
-  // kmeans to refine
-  int count = 0;
-  while (count < 10) {
-    // re-group
-    mean_low = 0;
-    mean_high = 0;
-    count_low = 0;
-    count_high = 0;
-    double thres = prev_mean_low * alpha + prev_mean_high * beta;
-    for (int i = 0; i < len; i++) {
-      if (errs[i] <= thres) {
-        is_high[i] = 0;
-        count_low++;
-        mean_low += errs[i];
-      } else {
-        is_high[i] = 1;
-        count_high++;
-        mean_high += errs[i];
-      }
-    }
-    mean_low /= count_low;
-    mean_high /= count_high;
-
-    // break if not changed much
-    if (fabs((mean_low - prev_mean_low) / (prev_mean_low + 0.00001)) <
-            0.00001 &&
-        fabs((mean_high - prev_mean_high) / (prev_mean_high + 0.00001)) <
-            0.00001)
-      break;
-
-    // update means
-    prev_mean_high = mean_high;
-    prev_mean_low = mean_low;
-
-    count++;
-  }
-
-  // count how many jumps of group changes
-  int num_change = 0;
-  for (int i = 0; i < len - 1; i++) {
-    if (is_high[i] != is_high[i + 1]) num_change++;
-  }
-
-  // get silhouette as a measure of the classification quality
-  double avg_si = 0;
-  // ai: avg dist of its own class, bi: avg dist to the other class
-  double ai, bi;
-  if (count_low > 1 && count_high > 1) {
-    for (int i = 0; i < len; i++) {
-      ai = 0;
-      bi = 0;
-      // calculate average distance to everyone in the same group
-      // and in the other group
-      for (int j = 0; j < len; j++) {
-        if (i == j) continue;
-        if (is_high[i] == is_high[j]) {
-          ai += fabs(errs[i] - errs[j]);
-        } else {
-          bi += fabs(errs[i] - errs[j]);
-        }
-      }
-      if (is_high[i] == 0) {
-        ai = ai / (count_low - 1);
-        bi = bi / count_high;
-      } else {
-        ai = ai / (count_high - 1);
-        bi = bi / count_low;
-      }
-      if (ai <= bi) {
-        si[i] = 1 - ai / (bi + 0.00001);
-      } else {
-        si[i] = bi / (ai + 0.00001) - 1;
-      }
-      avg_si += si[i];
-    }
-    avg_si /= len;
-  }
-
-  int reset = 0;
-  *ratio = mean_high / (mean_low + 0.00001);
-  // if the two groups too similar, or
-  // if too many numbers of changes, or
-  // silhouette is too small, not confident
-  // reset everything to 0 later so we fallback to the original decision
-  if (*ratio < 1.3 || num_change > AOMMAX(len / 3, 6) ||
-      avg_si < AVG_SI_THRES) {
-    reset = 1;
-  }
-
-#if GF_SHRINK_OUTPUT
-  printf("\n");
-  for (int i = 0; i < len; i++) {
-    printf("%d: err %.1f, ishigh %d, si %.2f, (i=%d)\n",
-           gf_start + i - before_pad, errs[i], is_high[i], si[i], gf_end);
-  }
-  printf(
-      "count: %d, mean_high: %.1f, mean_low: %.1f, avg_si: %.2f, num_change: "
-      "%d, ratio %.2f, reset: %d\n",
-      count, mean_high, mean_low, avg_si, num_change,
-      mean_high / (mean_low + 0.000001), reset);
-#endif
-
-  if (reset) {
-    memset(is_high, 0, sizeof(is_high[0]) * len);
-    memset(si, 0, sizeof(si[0]) * len);
-  }
-  return reset;
-}
-
 #if GROUP_ADAPTIVE_MAXQ
 #define RC_FACTOR_MIN 0.75
 #define RC_FACTOR_MAX 1.25
 #endif  // GROUP_ADAPTIVE_MAXQ
+
 #define MIN_FWD_KF_INTERVAL 8
-#define MIN_SHRINK_LEN 6      // the minimum length of gf if we are shrinking
-#define SI_HIGH AVG_SI_THRES  // high quality classification
-#define SI_LOW 0.3            // very unsure classification
-// this function finds an low error frame previously to the current last frame
-// in the gf group, and set the last frame to it.
-// The resulting last frame is then returned by *cur_last_ptr
-// *cur_start_ptr and cut_pos[n] could also change due to shrinking
-// previous gf groups
-void set_last_prev_low_err(int *cur_start_ptr, int *cur_last_ptr, int *cut_pos,
-                           int count_cuts, int before_pad, double ratio,
-                           int *is_high, double *si, int prev_lows) {
-  int n;
-  int cur_start = *cur_start_ptr;
-  int cur_last = *cur_last_ptr;
-  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
-    // try to find a point that is very probable to be good
-    if (is_high[n - cur_start + before_pad] == 0 &&
-        si[n - cur_start + before_pad] > SI_HIGH) {
-      *cur_last_ptr = n;
-      return;
-    }
-  }
-  // could not find a low-err point, then let's try find an "unsure"
-  // point at least
-  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
-    if ((is_high[n - cur_start + before_pad] == 0) ||
-        (is_high[n - cur_start + before_pad] &&
-         si[n - cur_start + before_pad] < SI_LOW)) {
-      *cur_last_ptr = n;
-      return;
-    }
-  }
-  if (prev_lows) {
-    // try with shrinking previous all_zero interval
-    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
-      if (is_high[n - cur_start + before_pad] == 0 &&
-          si[n - cur_start + before_pad] > SI_HIGH) {
-        int tentative_start = n - MIN_SHRINK_LEN;
-        // check if the previous interval can shrink this much
-        int available =
-            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
-            cur_start - tentative_start < prev_lows;
-        // shrinking too agressively may worsen performance
-        // set stricter thres for shorter length
-        double ratio_thres =
-            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
-            1.0;
+#define MIN_SHRINK_LEN 6  // the minimum length of gf if we are shrinking
+#define SMOOTH_FILT_LEN 7
+#define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2)
+#define WINDOW_SIZE 7
+#define HALF_WIN (WINDOW_SIZE / 2)
+// A 7-tap gaussian smooth filter
+const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383,
+                                              0.242, 0.061, 0.006 };
 
-        if (available && (ratio > ratio_thres)) {
-          cut_pos[count_cuts - 1] = tentative_start;
-          *cur_start_ptr = tentative_start;
-          *cur_last_ptr = n;
-          return;
-        }
-      }
+// Smooth filter intra_error and coded_error in firstpass stats.
+// If ignore[i]==1, the ith element should not be used in the filtering.
+static void smooth_filter_stats(const FIRSTPASS_STATS *stats, const int *ignore,
+                                int start_idx, int last_idx,
+                                double *filt_intra_err,
+                                double *filt_coded_err) {
+  int i, j;
+  for (i = start_idx; i <= last_idx; i++) {
+    double total_wt = 0;
+    for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+      int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+      if (ignore[idx]) continue;
+
+      filt_intra_err[i] +=
+          smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error;
+      total_wt += smooth_filt[j + HALF_FILT_LEN];
+    }
+    if (total_wt > 0.01) {
+      filt_intra_err[i] /= total_wt;
+    } else {
+      filt_intra_err[i] = stats[i].intra_error;
     }
   }
-  if (prev_lows) {
-    // try with shrinking previous all_zero interval with unsure points
-    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
-      if ((is_high[n - cur_start + before_pad] == 0) ||
-          (is_high[n - cur_start + before_pad] &&
-           si[n - cur_start + before_pad] < SI_LOW)) {
-        int tentative_start = n - MIN_SHRINK_LEN;
-        // check if the previous interval can shrink this much
-        int available =
-            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
-            cur_start - tentative_start < prev_lows;
-        // shrinking too agressively may worsen performance
-        double ratio_thres =
-            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
-            1.0;
+  for (i = start_idx; i <= last_idx; i++) {
+    double total_wt = 0;
+    for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+      int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+      // Coded error involves idx and idx - 1.
+      if (ignore[idx] || (idx > 0 && ignore[idx - 1])) continue;
 
-        if (available && (ratio > ratio_thres)) {
-          cut_pos[count_cuts - 1] = tentative_start;
-          *cur_start_ptr = tentative_start;
-          *cur_last_ptr = n;
-          return;
-        }
-      }
+      filt_coded_err[i] +=
+          smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error;
+      total_wt += smooth_filt[j + HALF_FILT_LEN];
     }
-  }  // prev_lows
-  return;
+    if (total_wt > 0.01) {
+      filt_coded_err[i] /= total_wt;
+    } else {
+      filt_coded_err[i] = stats[i].coded_error;
+    }
+  }
 }
 
-// This function decides the gf group length of future frames in batch
-// rc->gf_intervals is modified to store the group lengths
+// Calculate gradient
+static void get_gradient(const double *values, int start, int last,
+                         double *grad) {
+  if (start == last) {
+    grad[start] = 0;
+    return;
+  }
+  for (int i = start; i <= last; i++) {
+    int prev = AOMMAX(i - 1, start);
+    int next = AOMMIN(i + 1, last);
+    grad[i] = (values[next] - values[prev]) / (next - prev);
+  }
+}
+
+static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start,
+                              int first, int last, int *ignore) {
+  // Identify unstable areas caused by scenecuts.
+  // Find the max and 2nd max coded error, and the average of the rest frames.
+  // If there is only one frame that yields a huge coded error, it is likely a
+  // scenecut.
+  double this_ratio, max_prev_ratio, max_next_ratio, max_prev_coded,
+      max_next_coded;
+
+  if (last - first == 0) return -1;
+
+  for (int i = first; i <= last; i++) {
+    if (ignore[i] || (i > 0 && ignore[i - 1])) continue;
+    double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01);
+    this_ratio = stats_start[i].coded_error / temp_intra;
+    // find the avg ratio in the preceding neighborhood
+    max_prev_ratio = 0;
+    max_prev_coded = 0;
+    for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) {
+      if (ignore[j] || (j > 0 && ignore[j - 1])) continue;
+      temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+      double temp_ratio = stats_start[j].coded_error / temp_intra;
+      if (temp_ratio > max_prev_ratio) {
+        max_prev_ratio = temp_ratio;
+      }
+      if (stats_start[j].coded_error > max_prev_coded) {
+        max_prev_coded = stats_start[j].coded_error;
+      }
+    }
+    // find the avg ratio in the following neighborhood
+    max_next_ratio = 0;
+    max_next_coded = 0;
+    for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) {
+      if (ignore[j] || (j > 0 && ignore[j - 1])) continue;
+      temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+      double temp_ratio = stats_start[j].coded_error / temp_intra;
+      if (temp_ratio > max_next_ratio) {
+        max_next_ratio = temp_ratio;
+      }
+      if (stats_start[j].coded_error > max_next_coded) {
+        max_next_coded = stats_start[j].coded_error;
+      }
+    }
+
+    if (max_prev_ratio < 0.001 && max_next_ratio < 0.001) {
+      // the ratios are very small, only check a small fixed threshold
+      if (this_ratio < 0.02) continue;
+    } else {
+      // check if this frame has a larger ratio than the neighborhood
+      double max_sr = stats_start[i].sr_coded_error;
+      if (i < last) max_sr = AOMMAX(max_sr, stats_start[i + 1].sr_coded_error);
+      double max_sr_fr_ratio =
+          max_sr / AOMMAX(stats_start[i].coded_error, 0.01);
+
+      if (max_sr_fr_ratio > 1.2) continue;
+      if (this_ratio < 2 * AOMMAX(max_prev_ratio, max_next_ratio) &&
+          stats_start[i].coded_error <
+              2 * AOMMAX(max_prev_coded, max_next_coded)) {
+        continue;
+      }
+    }
+    return i;
+  }
+  return -1;
+}
+
+static void mark_flashes(const FIRSTPASS_STATS *stats, int start_idx,
+                         int last_idx, int *is_flash) {
+  int i;
+  for (i = start_idx; i < last_idx; i++) {
+    if (stats[i + 1].pcnt_second_ref > stats[i + 1].pcnt_inter &&
+        stats[i + 1].pcnt_second_ref >= 0.5) {
+      // this is a new flash frame
+      is_flash[i] = 1;
+      continue;
+    }
+  }
+}
+
+// Remove the region with index next_region.
+// parameter merge: 0: merge with previous; 1: merge with next; 2:
+// merge with both, take type from previous if possible
+// After removing, next_region will be the index of the next region.
+static void remove_region(int merge, REGIONS *regions, int *num_regions,
+                          int *next_region) {
+  int k = *next_region;
+  assert(k < *num_regions);
+  if (*num_regions == 1) {
+    *num_regions = 0;
+    return;
+  }
+  if (k == 0) {
+    merge = 1;
+  } else if (k == *num_regions - 1) {
+    merge = 0;
+  }
+  int num_merge = (merge == 2) ? 2 : 1;
+  switch (merge) {
+    case 0:
+      regions[k - 1].last = regions[k].last;
+      *next_region = k;
+      break;
+    case 1:
+      regions[k + 1].start = regions[k].start;
+      *next_region = k + 1;
+      break;
+    case 2:
+      regions[k - 1].last = regions[k + 1].last;
+      *next_region = k;
+      break;
+    default: assert(0);
+  }
+  *num_regions -= num_merge;
+  for (k = *next_region - (merge == 1); k < *num_regions; k++) {
+    regions[k] = regions[k + num_merge];
+  }
+}
+
+// Insert a region in the cur_region_idx. The start and last should both be in
+// the current region. After insertion, the cur_region_idx will point to the
+// last region that was splitted from the original region.
+static void insert_region(int start, int last, REGION_TYPES type,
+                          REGIONS *regions, int *num_regions,
+                          int *cur_region_idx) {
+  int k = *cur_region_idx;
+  REGION_TYPES this_region_type = regions[k].type;
+  int this_region_last = regions[k].last;
+  int num_add = (start != regions[k].start) + (last != regions[k].last);
+  // move the following regions further to the back
+  for (int r = *num_regions - 1; r > k; r--) {
+    regions[r + num_add] = regions[r];
+  }
+  *num_regions += num_add;
+  if (start > regions[k].start) {
+    regions[k].last = start - 1;
+    k++;
+    regions[k].start = start;
+  }
+  regions[k].type = type;
+  if (last < this_region_last) {
+    regions[k].last = last;
+    k++;
+    regions[k].start = last + 1;
+    regions[k].last = this_region_last;
+    regions[k].type = this_region_type;
+  } else {
+    regions[k].last = this_region_last;
+  }
+  *cur_region_idx = k;
+}
+
+// Estimate the noise variance of each frame from the first pass stats
+static void estimate_region_noise(const FIRSTPASS_STATS *stats,
+                                  const int *is_flash, REGIONS *region) {
+  double C1, C2, C3, noise;
+  int count = 0;
+  region->avg_noise_var = -1;
+  for (int i = region->start + 2; i <= region->last; i++) {
+    if (is_flash[i] || is_flash[i - 1] || is_flash[i - 2]) continue;
+
+    C1 = stats[i - 1].intra_error *
+         (stats[i].intra_error - stats[i].coded_error);
+    C2 = stats[i - 2].intra_error *
+         (stats[i - 1].intra_error - stats[i - 1].coded_error);
+    C3 = stats[i - 2].intra_error *
+         (stats[i].intra_error - stats[i].sr_coded_error);
+    if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue;
+    C1 = sqrt(C1);
+    C2 = sqrt(C2);
+    C3 = sqrt(C3);
+
+    noise = stats[i - 1].intra_error - C1 * C2 / C3;
+    noise = AOMMAX(noise, 0.01);
+    region->avg_noise_var = (region->avg_noise_var == -1)
+                                ? noise
+                                : AOMMIN(noise, region->avg_noise_var);
+    count++;
+  }
+  if (count == 0) {
+    region->avg_noise_var = 0;
+  }
+}
+
+// Analyze the corrrelation coefficient of each frame with its previous frame in
+// a region. Also get the average of stats inside a region.
+// Before calling this function, the region's noise variance is needed.
+static void analyze_region(const FIRSTPASS_STATS *stats, int region_idx,
+                           REGIONS *regions, double *coeff) {
+  double cor_coeff;
+
+  int i, k = region_idx;
+  regions[k].avg_cor_coeff = 0;
+  regions[k].avg_sr_fr_ratio = 0;
+  regions[k].avg_intra_err = 0;
+  regions[k].avg_coded_err = 0;
+
+  int check_first_sr = (k != 0);
+
+  for (i = regions[k].start; i <= regions[k].last; i++) {
+    double C = sqrt(AOMMAX(stats[i - 1].intra_error *
+                               (stats[i].intra_error - stats[i].coded_error),
+                           0.001));
+    cor_coeff =
+        C / AOMMAX(stats[i - 1].intra_error - regions[k].avg_noise_var, 0.001);
+
+    if (i > regions[k].start || check_first_sr) {
+      double num_frames =
+          (double)(regions[k].last - regions[k].start + check_first_sr);
+      double max_coded_error =
+          AOMMAX(stats[i].coded_error, stats[i - 1].coded_error);
+      double this_ratio =
+          stats[i].sr_coded_error / AOMMAX(max_coded_error, 0.001);
+      regions[k].avg_sr_fr_ratio += this_ratio / num_frames;
+    }
+
+    regions[k].avg_intra_err +=
+        stats[i].intra_error / (double)(regions[k].last - regions[k].start + 1);
+    regions[k].avg_coded_err +=
+        stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1);
+
+    coeff[i] =
+        cor_coeff *
+        sqrt(
+            AOMMAX(stats[i - 1].intra_error - regions[k].avg_noise_var, 0.001) /
+            AOMMAX(stats[i].intra_error - regions[k].avg_noise_var, 0.001));
+    // clip correlation coefficient.
+    coeff[i] = AOMMIN(AOMMAX(coeff[i], 0), 1);
+
+    regions[k].avg_cor_coeff +=
+        coeff[i] / (double)(regions[k].last - regions[k].start + 1);
+  }
+}
+
+// Calculate the regions stats of every region. Uses the stable regions to
+// estimate noise variance of other regions. Then call analyze_region for each.
+static void get_region_stats(const FIRSTPASS_STATS *stats, const int *is_flash,
+                             REGIONS *regions, double *coeff, int num_regions) {
+  int k, count_stable = 0;
+  // Analyze stable regions.
+  for (k = 0; k < num_regions; k++) {
+    if (regions[k].type == STABLE_REGION) {
+      estimate_region_noise(stats, is_flash, regions + k);
+      analyze_region(stats, k, regions, coeff);
+      count_stable++;
+    }
+  }
+
+  if (count_stable == 0) {
+    // no stable region, just use the lowest noise variance estimated.
+    double lowest_noise = -1;
+    for (k = 0; k < num_regions; k++) {
+      if (regions[k].type == SCENECUT_REGION) continue;
+      estimate_region_noise(stats, is_flash, regions + k);
+      if (regions[k].avg_noise_var < 0.01) continue;
+      if (lowest_noise < 0 || lowest_noise > regions[k].avg_noise_var) {
+        lowest_noise = regions[k].avg_noise_var;
+      }
+    }
+    lowest_noise = AOMMAX(lowest_noise, 0);
+    for (k = 0; k < num_regions; k++) {
+      regions[k].avg_noise_var = lowest_noise;
+      analyze_region(stats, k, regions, coeff);
+    }
+    return;
+  }
+
+  // Analyze other regions
+  for (k = 0; k < num_regions; k++) {
+    if (regions[k].type != STABLE_REGION) {
+      // use the average of the nearest previous and next stable regions
+      int count = 0;
+      regions[k].avg_noise_var = 0;
+      for (int r = k - 1; r >= 0; r--) {
+        if (regions[r].type == STABLE_REGION) {
+          count++;
+          regions[k].avg_noise_var += regions[r].avg_noise_var;
+          break;
+        }
+      }
+      for (int r = k + 1; r < num_regions; r++) {
+        if (regions[r].type == STABLE_REGION) {
+          count++;
+          regions[k].avg_noise_var += regions[r].avg_noise_var;
+          break;
+        }
+      }
+      if (count) {
+        regions[k].avg_noise_var /= (double)count;
+      }
+      analyze_region(stats, k, regions, coeff);
+    }
+  }
+}
+
+// Find tentative stable regions
+static int find_stable_regions(const FIRSTPASS_STATS *stats,
+                               const double *grad_coded, const int *ignore,
+                               int this_start, int this_last,
+                               REGIONS *regions) {
+  int i, j, k = 0;
+  regions[k].start = this_start;
+  for (i = this_start; i <= this_last; i++) {
+    // Check mean and variance of stats in a window
+    double mean_intra = 0.001, var_intra = 0.001;
+    double mean_coded = 0.001, var_coded = 0.001;
+    int count = 0;
+    for (j = -HALF_WIN; j <= HALF_WIN; j++) {
+      int idx = AOMMIN(AOMMAX(i + j, this_start), this_last);
+      if (ignore[idx] || (idx > 0 && ignore[idx - 1])) continue;
+      mean_intra += stats[idx].intra_error;
+      var_intra += stats[idx].intra_error * stats[idx].intra_error;
+      mean_coded += stats[idx].coded_error;
+      var_coded += stats[idx].coded_error * stats[idx].coded_error;
+      count++;
+    }
+
+    REGION_TYPES cur_type;
+    if (count > 0) {
+      mean_intra /= (double)count;
+      var_intra /= (double)count;
+      mean_coded /= (double)count;
+      var_coded /= (double)count;
+      int is_intra_stable = (var_intra / (mean_intra * mean_intra) < 1.03);
+      int is_coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 &&
+                             fabs(grad_coded[i]) / mean_coded < 0.05) ||
+                            mean_coded / mean_intra < 0.05;
+      int is_coded_small = mean_coded < 0.5 * mean_intra;
+      cur_type = (is_intra_stable && is_coded_stable && is_coded_small)
+                     ? STABLE_REGION
+                     : HIGH_VAR_REGION;
+    } else {
+      cur_type = HIGH_VAR_REGION;
+    }
+
+    // mark a new region if type changes
+    if (i == regions[k].start) {
+      // first frame in the region
+      regions[k].type = cur_type;
+    } else if (cur_type != regions[k].type) {
+      // Append a new region
+      regions[k].last = i - 1;
+      regions[k + 1].start = i;
+      regions[k + 1].type = cur_type;
+      k++;
+    }
+  }
+  regions[k].last = this_last;
+  return k + 1;
+}
+
+// Clean up regions that should be removed or merged.
+static void cleanup_regions(REGIONS *regions, int *num_regions) {
+  int k = 0;
+  while (k < *num_regions) {
+    if ((k > 0 && regions[k - 1].type == regions[k].type &&
+         regions[k].type != SCENECUT_REGION) ||
+        regions[k].last < regions[k].start) {
+      remove_region(0, regions, num_regions, &k);
+    } else {
+      k++;
+    }
+  }
+}
+
+// Remove regions that are of type and shorter than length.
+// Merge it with its neighboring regions.
+static void remove_short_regions(REGIONS *regions, int *num_regions,
+                                 REGION_TYPES type, int length) {
+  int k = 0;
+  while (k < *num_regions && (*num_regions) > 1) {
+    if ((regions[k].last - regions[k].start + 1 < length &&
+         regions[k].type == type)) {
+      // merge current region with the previous and next regions
+      remove_region(2, regions, num_regions, &k);
+    } else {
+      k++;
+    }
+  }
+  cleanup_regions(regions, num_regions);
+}
+
+static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats,
+                                          const int *is_flash,
+                                          const double *grad, REGIONS *regions,
+                                          double *coeff, int *num_regions) {
+  int i, j, k;
+  // Remove regions that are too short. Likely noise.
+  remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN);
+  remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+
+  get_region_stats(stats, is_flash, regions, coeff, *num_regions);
+
+  // Adjust region boundaries. The thresholds are empirically obtained, but
+  // overall the performance is not very sensitive to small changes to them.
+  for (k = 0; k < *num_regions; k++) {
+    if (regions[k].type == STABLE_REGION) continue;
+    if (k > 0) {
+      // Adjust previous boundary.
+      // First find the average intra/coded error in the previous
+      // neighborhood.
+      double avg_intra_err = 0, avg_coded_err = 0, avg_coeff = 0;
+      int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1,
+                          regions[k - 1].start + 1);
+      int lasti = regions[k - 1].last;
+      int counti = 0;
+      for (i = starti; i <= lasti; i++) {
+        avg_intra_err += stats[i].intra_error;
+        avg_coded_err += stats[i].coded_error;
+        avg_coeff += coeff[i];
+        counti++;
+      }
+      if (counti > 0) {
+        avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+        avg_coded_err /= AOMMAX(avg_coded_err / (double)counti, 0.001);
+        avg_coeff /= AOMMIN(avg_intra_err / (double)counti, 0.99999);
+        int count_coded = 0, count_grad = 0;
+        for (j = lasti + 1; j <= regions[k].last; j++) {
+          int intra_close =
+              fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+          int coded_close =
+              fabs(stats[j].coded_error - avg_coded_err) / avg_coded_err < 0.15;
+          int grad_small = fabs(grad[j]) / avg_coded_err < 0.05;
+          int coded_small = stats[j].coded_error / avg_intra_err < 0.03;
+          int coeff_close =
+              (1 - coeff[j]) / (1 - avg_coeff) < 1.5 || coeff[j] > 0.995;
+          if (!coeff_close || (!coded_close && !coded_small)) count_coded--;
+          if (!grad_small && !coded_small) count_grad--;
+
+          if (intra_close && count_coded >= 0 && count_grad >= 0) {
+            // this frame probably belongs to the previous stable region
+            regions[k - 1].last = j;
+            regions[k].start = j + 1;
+          } else {
+            break;
+          }
+        }
+      }
+    }  // if k > 0
+    if (k < *num_regions - 1) {
+      // Adjust next boundary.
+      // First find the average intra/coded error in the next neighborhood.
+      double avg_intra_err = 0, avg_coded_err = 0, avg_coeff = 0;
+      int starti = regions[k + 1].start;
+      int lasti = AOMMIN(regions[k + 1].last - 1,
+                         regions[k + 1].start + WINDOW_SIZE - 1);
+      int counti = 0;
+      for (i = starti; i <= lasti; i++) {
+        avg_intra_err += stats[i].intra_error;
+        avg_coded_err += stats[i + 1].coded_error;
+        avg_coeff += coeff[i];
+        counti++;
+      }
+      if (counti > 0) {
+        avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+        avg_coded_err /= AOMMAX(avg_coded_err / (double)counti, 0.001);
+        avg_coeff /= AOMMIN(avg_intra_err / (double)counti, 0.99999);
+        // At the boundary, coded error is large, but still the frame is stable
+        int count_coded = 1, count_grad = 1;
+        for (j = starti - 1; j >= regions[k].start; j--) {
+          int intra_close =
+              fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+          int coded_close =
+              fabs(stats[j + 1].coded_error - avg_coded_err) / avg_coded_err <
+              0.15;
+          int grad_small = fabs(grad[j + 1]) / avg_coded_err < 0.05;
+          int coded_small = stats[j + 1].coded_error / avg_intra_err < 0.03;
+          int coeff_close =
+              (1 - coeff[j + 1]) / (1 - avg_coeff) < 1.5 || coeff[j] > 0.995;
+          if (!coeff_close || (!coded_close && !coded_small)) count_coded--;
+          if (!grad_small && !coded_small) count_grad--;
+          if (intra_close && count_coded >= 0 && count_grad >= 0) {
+            // this frame probably belongs to the next stable region
+            regions[k + 1].start = j;
+            regions[k].last = j - 1;
+          } else {
+            break;
+          }
+        }
+      }
+    }  // if k < *num_regions - 1
+  }    // end of loop over all regions
+
+  cleanup_regions(regions, num_regions);
+  remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+  get_region_stats(stats, is_flash, regions, coeff, *num_regions);
+
+  // If a stable regions has higher error than neighboring high var regions,
+  // or if the stable region has a lower average correlation,
+  // then it should be merged with them
+  k = 0;
+  while (k < *num_regions && (*num_regions) > 1) {
+    if (regions[k].type == STABLE_REGION &&
+        ((k > 0 &&  // previous regions
+          (regions[k].avg_coded_err > regions[k - 1].avg_coded_err ||
+           regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff)) &&
+         (k < *num_regions - 1 &&  // next region
+          (regions[k].avg_coded_err > regions[k + 1].avg_coded_err ||
+           regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff)))) {
+      // merge current region with the previous and next regions
+      remove_region(2, regions, num_regions, &k);
+      analyze_region(stats, k - 1, regions, coeff);
+    } else if (regions[k].type == HIGH_VAR_REGION &&
+               ((k > 0 &&  // previous regions
+                 (regions[k].avg_coded_err < regions[k - 1].avg_coded_err ||
+                  regions[k].avg_cor_coeff > regions[k - 1].avg_cor_coeff)) &&
+                (k < *num_regions - 1 &&  // next region
+                 (regions[k].avg_coded_err < regions[k + 1].avg_coded_err ||
+                  regions[k].avg_cor_coeff > regions[k + 1].avg_cor_coeff)))) {
+      // merge current region with the previous and next regions
+      remove_region(2, regions, num_regions, &k);
+      analyze_region(stats, k - 1, regions, coeff);
+    } else {
+      k++;
+    }
+  }
+
+  remove_short_regions(regions, num_regions, STABLE_REGION, WINDOW_SIZE);
+  remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+}
+
+// Identify blending regions.
+static void find_blending_regions(const FIRSTPASS_STATS *stats,
+                                  const int *is_flash, REGIONS *regions,
+                                  int *num_regions, double *coeff) {
+  int i, k = 0;
+  // Blending regions will have large content change, therefore will have a
+  // large consistent change in intra error.
+  int count_stable = 0;
+  while (k < *num_regions) {
+    if (regions[k].type == STABLE_REGION) {
+      k++;
+      count_stable++;
+      continue;
+    }
+    int dir = 0;
+    int start = 0, last;
+    for (i = regions[k].start; i <= regions[k].last; i++) {
+      // First mark the regions that has consistent large change of intra error.
+      if (is_flash[i] || (i > 0 && is_flash[i - 1])) continue;
+      double grad = stats[i].intra_error - stats[i - 1].intra_error;
+      int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05;
+      int this_dir = 0;
+      if (large_change) {
+        this_dir = (grad > 0) ? 1 : -1;
+      }
+      // the current trend continues
+      if (dir == this_dir) continue;
+      if (dir != 0) {
+        // Mark the end of a new large change group and add it
+        last = i - 1;
+        insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+      }
+      dir = this_dir;
+      start = i;
+    }
+    if (dir != 0) {
+      last = regions[k].last;
+      insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+    }
+    k++;
+  }
+
+  // If the blending region has very low correlation, mark it as high variance
+  // since we probably cannot benefit from it anyways.
+  get_region_stats(stats, is_flash, regions, coeff, *num_regions);
+  for (k = 0; k < *num_regions; k++) {
+    if (regions[k].type != BLENDING_REGION) continue;
+    if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 ||
+        count_stable == 0)
+      regions[k].type = HIGH_VAR_REGION;
+  }
+  get_region_stats(stats, is_flash, regions, coeff, *num_regions);
+
+  // It is possible for blending to result in a "dip" in intra error (first
+  // decrease then increase). Therefore we need to find the dip and combine the
+  // two regions.
+  k = 1;
+  while (k < *num_regions) {
+    if (k < *num_regions - 1 && regions[k].type == HIGH_VAR_REGION) {
+      // Check if this short high variance regions is actually in the middle of
+      // a blending region.
+      if (regions[k - 1].type == BLENDING_REGION &&
+          regions[k + 1].type == BLENDING_REGION &&
+          regions[k].last - regions[k].start < 3) {
+        int prev_dir = (stats[regions[k - 1].last].intra_error -
+                        stats[regions[k - 1].last - 1].intra_error) > 0
+                           ? 1
+                           : -1;
+        int next_dir = (stats[regions[k + 1].last].intra_error -
+                        stats[regions[k + 1].last - 1].intra_error) > 0
+                           ? 1
+                           : -1;
+        if (prev_dir < 0 && next_dir > 0) {
+          // This is possibly a mid region of blending. Check the ratios
+          double ratio_thres = AOMMIN(regions[k - 1].avg_sr_fr_ratio,
+                                      regions[k + 1].avg_sr_fr_ratio) *
+                               0.95;
+          if (regions[k].avg_sr_fr_ratio > ratio_thres) {
+            regions[k].type = BLENDING_REGION;
+            remove_region(2, regions, num_regions, &k);
+            analyze_region(stats, k - 1, regions, coeff);
+            continue;
+          }
+        }
+      }
+    }
+    // Check if we have a pair of consecutive blending regions.
+    if (regions[k - 1].type == BLENDING_REGION &&
+        regions[k].type == BLENDING_REGION) {
+      int prev_dir = (stats[regions[k - 1].last].intra_error -
+                      stats[regions[k - 1].last - 1].intra_error) > 0
+                         ? 1
+                         : -1;
+      int next_dir = (stats[regions[k].last].intra_error -
+                      stats[regions[k].last - 1].intra_error) > 0
+                         ? 1
+                         : -1;
+
+      // if both are too short, no need to check
+      int total_length = regions[k].last - regions[k - 1].start + 1;
+      if (total_length < 4) {
+        regions[k - 1].type = HIGH_VAR_REGION;
+        k++;
+        continue;
+      }
+
+      int to_merge = 0;
+      if (prev_dir < 0 && next_dir > 0) {
+        // In this case we check the last frame in the previous region.
+        double prev_length =
+            (double)(regions[k - 1].last - regions[k - 1].start + 1);
+        double last_ratio, ratio_thres;
+        if (prev_length < 2.01) {
+          // if the previous region is very short
+          double max_coded_error =
+              AOMMAX(stats[regions[k - 1].last].coded_error,
+                     stats[regions[k - 1].last - 1].coded_error);
+          last_ratio = stats[regions[k - 1].last].sr_coded_error /
+                       AOMMAX(max_coded_error, 0.001);
+          ratio_thres = regions[k].avg_sr_fr_ratio * 0.95;
+        } else {
+          double max_coded_error =
+              AOMMAX(stats[regions[k - 1].last].coded_error,
+                     stats[regions[k - 1].last - 1].coded_error);
+          last_ratio = stats[regions[k - 1].last].sr_coded_error /
+                       AOMMAX(max_coded_error, 0.001);
+          double prev_ratio =
+              (regions[k - 1].avg_sr_fr_ratio * prev_length - last_ratio) /
+              (prev_length - 1.0);
+          ratio_thres = AOMMIN(prev_ratio, regions[k].avg_sr_fr_ratio) * 0.95;
+        }
+        if (last_ratio > ratio_thres) {
+          to_merge = 1;
+        }
+      }
+
+      if (to_merge) {
+        remove_region(0, regions, num_regions, &k);
+        analyze_region(stats, k - 1, regions, coeff);
+        continue;
+      } else {
+        // These are possibly two separate blending regions. Mark the boundary
+        // frame as HIGH_VAR_REGION to separate the two.
+        int prev_k = k - 1;
+        insert_region(regions[prev_k].last, regions[prev_k].last,
+                      HIGH_VAR_REGION, regions, num_regions, &prev_k);
+        analyze_region(stats, prev_k, regions, coeff);
+        k = prev_k + 1;
+        analyze_region(stats, k, regions, coeff);
+      }
+    }
+    k++;
+  }
+  cleanup_regions(regions, num_regions);
+}
+
+// Clean up decision for blendings. Remove blending regions that are too short.
+// Also if a very short high var region is between a blending and a stable
+// region, just merge it with one of them.
+static void cleanup_blendings(REGIONS *regions, int *num_regions) {
+  int k = 0;
+  while (k<*num_regions && * num_regions> 1) {
+    int is_short_blending = regions[k].type == BLENDING_REGION &&
+                            regions[k].last - regions[k].start + 1 < 5;
+    int is_short_hv = regions[k].type == HIGH_VAR_REGION &&
+                      regions[k].last - regions[k].start + 1 < 5;
+    int has_stable_neighbor =
+        ((k > 0 && regions[k - 1].type == STABLE_REGION) ||
+         (k < *num_regions - 1 && regions[k + 1].type == STABLE_REGION));
+    int has_blend_neighbor =
+        ((k > 0 && regions[k - 1].type == BLENDING_REGION) ||
+         (k < *num_regions - 1 && regions[k + 1].type == BLENDING_REGION));
+    int total_neighbors = (k > 0) + (k < *num_regions - 1);
+
+    if (is_short_blending ||
+        (is_short_hv &&
+         has_stable_neighbor + has_blend_neighbor >= total_neighbors)) {
+      // Remove this region.Try to determine whether to combine it with the
+      // previous or next region.
+      int merge;
+      double prev_diff =
+          (k > 0)
+              ? fabs(regions[k].avg_cor_coeff - regions[k - 1].avg_cor_coeff)
+              : 1;
+      double next_diff =
+          (k < *num_regions - 1)
+              ? fabs(regions[k].avg_cor_coeff - regions[k + 1].avg_cor_coeff)
+              : 1;
+      // merge == 0 means to merge with previous, 1 means to merge with next
+      merge = prev_diff > next_diff;
+      remove_region(merge, regions, num_regions, &k);
+    } else {
+      k++;
+    }
+  }
+  cleanup_regions(regions, num_regions);
+}
+
+// Identify stable and unstable regions from first pass stats.
+// Stats_start points to the first frame to analyze.
+// Offset is the offset from the current frame to the frame stats_start is
+// pointing to.
+static void identify_regions(const FIRSTPASS_STATS *const stats_start,
+                             int total_frames, int offset, REGIONS *regions,
+                             int *total_regions, double *cor_coeff) {
+  int k;
+  if (total_frames <= 1) return;
+
+  double *coeff = cor_coeff + offset;
+
+  // store the initial decisions
+  REGIONS temp_regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+  av1_zero_array(temp_regions, MAX_FIRSTPASS_ANALYSIS_FRAMES);
+  int is_flash[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
+  // buffers for filtered stats
+  double filt_intra_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
+  double filt_coded_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
+  double grad_coded[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
+
+  int cur_region = 0, this_start = 0, this_last;
+
+  // find possible flash frames
+  mark_flashes(stats_start, 0, total_frames - 1, is_flash);
+
+  // first get the obvious scenecuts
+  int next_scenecut = -1;
+
+  do {
+    next_scenecut =
+        find_next_scenecut(stats_start, this_start, total_frames - 1, is_flash);
+    this_last = (next_scenecut >= 0) ? (next_scenecut - 1) : total_frames - 1;
+    // low-pass filter the needed stats
+    smooth_filter_stats(stats_start, is_flash, this_start, this_last,
+                        filt_intra_err, filt_coded_err);
+    get_gradient(filt_coded_err, this_start, this_last, grad_coded);
+
+    // find tentative stable regions and unstable regions
+    int num_regions = find_stable_regions(stats_start, grad_coded, is_flash,
+                                          this_start, this_last, temp_regions);
+    adjust_unstable_region_bounds(stats_start, is_flash, grad_coded,
+                                  temp_regions, coeff, &num_regions);
+
+    get_region_stats(stats_start, is_flash, temp_regions, coeff, num_regions);
+
+    // Try to identify blending regions in the unstable regions
+    find_blending_regions(stats_start, is_flash, temp_regions, &num_regions,
+                          coeff);
+    cleanup_blendings(temp_regions, &num_regions);
+
+    // The flash points should all be considered high variance points
+    k = 0;
+    while (k < num_regions) {
+      if (temp_regions[k].type != STABLE_REGION) {
+        k++;
+        continue;
+      }
+      int start = temp_regions[k].start;
+      int last = temp_regions[k].last;
+      for (int i = start; i <= last; i++) {
+        if (is_flash[i]) {
+          insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k);
+        }
+      }
+      k++;
+    }
+    cleanup_regions(temp_regions, &num_regions);
+
+    // copy the regions in the scenecut group
+    for (k = 0; k < num_regions; k++) {
+      regions[k + cur_region] = temp_regions[k];
+    }
+    cur_region += num_regions;
+
+    // add the scenecut region
+    if (next_scenecut > -1) {
+      // add the scenecut region, and find the next scenecut
+      regions[cur_region].type = SCENECUT_REGION;
+      regions[cur_region].start = next_scenecut;
+      regions[cur_region].last = next_scenecut;
+      cur_region++;
+      this_start = next_scenecut + 1;
+    }
+  } while (next_scenecut >= 0);
+
+  *total_regions = cur_region;
+  get_region_stats(stats_start, is_flash, regions, coeff, *total_regions);
+
+  for (k = 0; k < *total_regions; k++) {
+    // If scenecuts are very minor, mark them as high variance.
+    if (regions[k].type != SCENECUT_REGION || regions[k].avg_cor_coeff < 0.8) {
+      continue;
+    }
+    regions[k].type = HIGH_VAR_REGION;
+  }
+  cleanup_regions(regions, total_regions);
+  get_region_stats(stats_start, is_flash, regions, coeff, *total_regions);
+
+  for (k = 0; k < *total_regions; k++) {
+    regions[k].start += offset;
+    regions[k].last += offset;
+  }
+}
+
+static int find_regions_index(const REGIONS *regions, int num_regions,
+                              int frame_idx) {
+  for (int k = 0; k < num_regions; k++) {
+    if (regions[k].start <= frame_idx && regions[k].last >= frame_idx) {
+      return k;
+    }
+  }
+  return -1;
+}
+
+/*!\brief Determine the length of future GF groups.
+ *
+ * \ingroup gf_group_algo
+ * This function decides the gf group length of future frames in batch
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    max_gop_length   Maximum length of the GF group
+ * \param[in]    max_intervals    Maximum number of intervals to decide
+ *
+ * \return Nothing is returned. Instead, cpi->rc.gf_intervals is
+ * changed to store the decided GF group lengths.
+ */
 static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
                                 int max_intervals) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -1284,45 +1914,36 @@
   const int active_min_gf_interval = rc->min_gf_interval;
   const int active_max_gf_interval =
       AOMMIN(rc->max_gf_interval, max_gop_length);
+  const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval);
 
-  i = 0;
+  i = (rc->frames_since_key == 0);
   max_intervals = cpi->lap_enabled ? 1 : max_intervals;
-  int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { 0 };
   int count_cuts = 1;
-  int cur_start = 0, cur_last;
+  // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF.
+  int cur_start = -1 + !cpi->gf_state.arf_gf_boost_lst, cur_last;
+  int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 };
   int cut_here;
-  int prev_lows = 0;
   GF_GROUP_STATS gf_stats;
   init_gf_stats(&gf_stats);
   while (count_cuts < max_intervals + 1) {
-    ++i;
-
     // reaches next key frame, break here
-    if (i >= rc->frames_to_key) {
-      cut_pos[count_cuts] = i - 1;
-      count_cuts++;
-      break;
-    }
-
-    // reached maximum len, but nothing special yet (almost static)
-    // let's look at the next interval
-    if (i - cur_start >= rc->static_scene_max_gf_interval) {
+    if (i >= rc->frames_to_key + rc->next_is_fwd_key) {
+      cut_here = 2;
+    } else if (i - cur_start >= rc->static_scene_max_gf_interval) {
+      // reached maximum len, but nothing special yet (almost static)
+      // let's look at the next interval
       cut_here = 1;
-    } else {
+    } else if (EOF == input_stats(twopass, &next_frame)) {
       // reaches last frame, break
-      if (EOF == input_stats(twopass, &next_frame)) {
-        cut_pos[count_cuts] = i - 1;
-        count_cuts++;
-        break;
-      }
+      cut_here = 2;
+    } else {
       // Test for the case where there is a brief flash but the prediction
       // quality back to an earlier frame is then restored.
       flash_detected = detect_flash(twopass, 0);
       // TODO(bohanli): remove redundant accumulations here, or unify
       // this and the ones in define_gf_group
-      accumulate_next_frame_stats(&next_frame, frame_info, twopass,
-                                  flash_detected, rc->frames_since_key, i, 0,
-                                  rc->min_gf_interval, &gf_stats);
+      accumulate_next_frame_stats(&next_frame, frame_info, flash_detected,
+                                  rc->frames_since_key, i, &gf_stats);
 
       cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
                                active_max_gf_interval, active_min_gf_interval,
@@ -1330,54 +1951,102 @@
     }
     if (cut_here) {
       cur_last = i - 1;  // the current last frame in the gf group
-      // only try shrinking if interval smaller than active_max_gf_interval
-      if (cur_last - cur_start <= active_max_gf_interval) {
-        // determine in the current decided gop the higher and lower errs
-        int n;
-        double ratio;
-
-        // load neighboring coded errs
-        int is_high[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
-        double errs[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
-        double si[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
-        int before_pad =
-            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_since_key - 1 + cur_start);
-        int after_pad =
-            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_to_key - cur_last - 1);
-        for (n = cur_start - before_pad; n <= cur_last + after_pad; n++) {
-          if (start_pos + n - 1 > twopass->stats_buf_ctx->stats_in_end) {
-            after_pad = n - cur_last - 1;
-            assert(after_pad >= 0);
-            break;
-          } else if (start_pos + n - 1 <
-                     twopass->stats_buf_ctx->stats_in_start) {
-            before_pad = cur_start - n - 1;
-            continue;
-          }
-          errs[n + before_pad - cur_start] = (start_pos + n - 1)->coded_error;
+      int ori_last = cur_last;
+      // The region frame idx does not start from the same frame as cur_start
+      // and cur_last. Need to offset them.
+      int offset = rc->frames_since_key - rc->regions_offset;
+      REGIONS *regions = rc->regions;
+      int num_regions = rc->num_regions;
+      if (cpi->oxcf.kf_cfg.fwd_kf_enabled && rc->next_is_fwd_key) {
+        const int frames_left = rc->frames_to_key - i;
+        const int min_int = AOMMIN(MIN_FWD_KF_INTERVAL, active_min_gf_interval);
+        if (frames_left < min_int && frames_left > 0) {
+          cur_last = rc->frames_to_key - min_int - 1;
         }
-        const int len = before_pad + after_pad + cur_last - cur_start + 1;
-        const int reset = determine_high_err_gf(
-            errs, is_high, si, len, &ratio, cur_start, cur_last, before_pad);
+      }
 
-        // if the current frame may have high error, try shrinking
-        if (is_high[cur_last - cur_start + before_pad] == 1 ||
-            (!reset && si[cur_last - cur_start + before_pad] < SI_LOW)) {
-          // try not to cut in high err area
-          set_last_prev_low_err(&cur_start, &cur_last, cut_pos, count_cuts,
-                                before_pad, ratio, is_high, si, prev_lows);
-        }  // if current frame high error
-        // count how many trailing lower error frames we have in this decided
-        // gf group
-        prev_lows = 0;
-        for (n = cur_last - 1; n > cur_start + MIN_SHRINK_LEN; n--) {
-          if (is_high[n - cur_start + before_pad] == 0 &&
-              (si[n - cur_start + before_pad] > SI_HIGH || reset)) {
-            prev_lows++;
-          } else {
+      int scenecut_idx = -1;
+      // only try shrinking if interval smaller than active_max_gf_interval
+      if (cur_last - cur_start <= active_max_gf_interval &&
+          cur_last > cur_start) {
+        // find the region indices of where the first and last frame belong.
+        int k_start =
+            find_regions_index(regions, num_regions, cur_start + offset);
+        int k_last =
+            find_regions_index(regions, num_regions, cur_last + offset);
+        if (cur_start + offset == 0) k_start = 0;
+
+        // See if we have a scenecut in between
+        for (int r = k_start + 1; r <= k_last; r++) {
+          if (regions[r].type == SCENECUT_REGION &&
+              regions[r].last - offset - cur_start > active_min_gf_interval) {
+            scenecut_idx = r;
             break;
           }
         }
+
+        // if the found scenecut is very close to the end, ignore it.
+        if (regions[num_regions - 1].last - regions[scenecut_idx].last < 4) {
+          scenecut_idx = -1;
+        }
+
+        if (scenecut_idx != -1) {
+          // If we have a scenecut, then stop at it.
+          // TODO(bohanli): add logic here to stop before the scenecut and for
+          // the next gop start from the scenecut with GF
+          int is_minor_sc = (regions[scenecut_idx].avg_cor_coeff > 0.6);
+          cur_last = regions[scenecut_idx].last - offset - !is_minor_sc;
+        } else {
+          int is_last_analysed = (k_last == num_regions - 1) &&
+                                 (cur_last + offset == regions[k_last].last);
+          int not_enough_regions =
+              k_last - k_start <=
+              1 + (regions[k_start].type == SCENECUT_REGION);
+          // if we are very close to the end, then do not shrink since it may
+          // introduce intervals that are too short
+          if (!(is_last_analysed && not_enough_regions)) {
+            int found = 0;
+            // first try to end at a stable area
+            for (int j = cur_last; j >= cur_start + min_shrink_int; j--) {
+              if (regions[find_regions_index(regions, num_regions, j + offset)]
+                      .type == STABLE_REGION) {
+                cur_last = j;
+                found = 1;
+                break;
+              }
+            }
+            if (!found) {
+              // Could not find stable point,
+              // try to find an OK point (high correlation, not blending)
+              for (int j = cur_last; j >= cur_start + min_shrink_int; j--) {
+                REGIONS *cur_region =
+                    regions +
+                    find_regions_index(regions, num_regions, j + offset);
+                double avg_coeff = cur_region->avg_cor_coeff;
+                if (rc->cor_coeff[j + offset] > avg_coeff &&
+                    cur_region->type != BLENDING_REGION) {
+                  cur_last = j;
+                  found = 1;
+                  break;
+                }
+              }
+            }
+            if (!found) {
+              // Could not find a better point,
+              // try not to cut in blending areas
+              for (int j = cur_last; j >= cur_start + min_shrink_int; j--) {
+                REGIONS *cur_region =
+                    regions +
+                    find_regions_index(regions, num_regions, j + offset);
+                if (cur_region->type != BLENDING_REGION) {
+                  cur_last = j;
+                  break;
+                }
+              }
+            }
+            // if cannot find anything, just cut at the original place.
+          }
+        }
       }
       cut_pos[count_cuts] = cur_last;
       count_cuts++;
@@ -1385,51 +2054,63 @@
       // reset pointers to the shrinked location
       twopass->stats_in = start_pos + cur_last;
       cur_start = cur_last;
+      if (regions[find_regions_index(regions, num_regions,
+                                     cur_start + 1 + offset)]
+              .type == SCENECUT_REGION) {
+        cur_start++;
+      }
       i = cur_last;
 
+      if (cut_here > 1 && cur_last == ori_last) break;
+
       // reset accumulators
       init_gf_stats(&gf_stats);
     }
+    ++i;
   }
 
   // save intervals
   rc->intervals_till_gf_calculate_due = count_cuts - 1;
   for (int n = 1; n < count_cuts; n++) {
-    rc->gf_intervals[n - 1] = cut_pos[n] + 1 - cut_pos[n - 1];
+    rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1];
   }
   rc->cur_gf_index = 0;
   twopass->stats_in = start_pos;
-
-#if GF_SHRINK_OUTPUT
-  printf("\nf_to_key: %d, count_cut: %d. ", rc->frames_to_key, count_cuts);
-  for (int n = 0; n < count_cuts; n++) {
-    printf("%d ", cut_pos[n]);
-  }
-  printf("\n");
-
-  for (int n = 0; n < rc->intervals_till_gf_calculate_due; n++) {
-    printf("%d ", rc->gf_intervals[n]);
-  }
-  printf("\n\n");
-#endif
 }
 
 static void correct_frames_to_key(AV1_COMP *cpi) {
   int lookahead_size =
-      (int)av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage) + 1;
+      (int)av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
   if (lookahead_size <
       av1_lookahead_pop_sz(cpi->lookahead, cpi->compressor_stage)) {
+    assert(IMPLIES(cpi->frames_left > 0, lookahead_size == cpi->frames_left));
     cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size);
+  } else if (cpi->frames_left > 0) {
+    // Correct frames to key based on limit
+    cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, cpi->frames_left);
   }
 }
 
-static void define_gf_group_pass0(AV1_COMP *cpi,
-                                  const EncodeFrameParams *const frame_params) {
+/*!\brief Define a GF group in one pass mode when no look ahead stats are
+ * available.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup in the special
+ * case of one pass encoding where no lookahead stats are avialable.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ *
+ * \return Nothing is returned. Instead, cpi->gf_group is changed.
+ */
+static void define_gf_group_pass0(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   GF_GROUP *const gf_group = &cpi->gf_group;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const GFConfig *const gf_cfg = &oxcf->gf_cfg;
   int target;
 
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+  if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
     av1_cyclic_refresh_set_golden_update(cpi);
   } else {
     rc->baseline_gf_interval = rc->gf_intervals[rc->cur_gf_index];
@@ -1447,29 +2128,30 @@
   rc->constrained_gf_group =
       (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
 
-  gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height;
+  gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height;
 
   // Rare case when the look-ahead is less than the target GOP length, can't
   // generate ARF frame.
-  if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames ||
-      !is_altref_enabled(cpi) || rc->baseline_gf_interval < rc->min_gf_interval)
+  if (rc->baseline_gf_interval > gf_cfg->lag_in_frames ||
+      !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) ||
+      rc->baseline_gf_interval < rc->min_gf_interval)
     gf_group->max_layer_depth_allowed = 0;
 
   // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
-  av1_gop_setup_structure(cpi, frame_params);
+  av1_gop_setup_structure(cpi);
 
   // Allocate bits to each of the frames in the GF group.
   // TODO(sarahparker) Extend this to work with pyramid structure.
   for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) {
     const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index];
-    if (cpi->oxcf.rc_mode == AOM_CBR) {
-      if (cur_update_type == KEY_FRAME) {
+    if (oxcf->rc_cfg.mode == AOM_CBR) {
+      if (cur_update_type == KF_UPDATE) {
         target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
       } else {
         target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type);
       }
     } else {
-      if (cur_update_type == KEY_FRAME) {
+      if (cur_update_type == KF_UPDATE) {
         target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
       } else {
         target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type);
@@ -1488,31 +2170,32 @@
   // Set the interval until the next gf.
   // If forward keyframes are enabled, ensure the final gf group obeys the
   // MIN_FWD_KF_INTERVAL.
-  if (cpi->oxcf.fwd_kf_enabled && use_alt_ref &&
-      ((twopass->stats_in - arf_position + rc->frames_to_key) <
-       twopass->stats_buf_ctx->stats_in_end) &&
+  const int is_last_kf =
+      (twopass->stats_in - arf_position + rc->frames_to_key) >=
+      twopass->stats_buf_ctx->stats_in_end;
+
+  if (cpi->oxcf.kf_cfg.fwd_kf_enabled && use_alt_ref && !is_last_kf &&
       cpi->rc.next_is_fwd_key) {
-    if (arf_position == rc->frames_to_key) {
+    if (arf_position == rc->frames_to_key + 1) {
       rc->baseline_gf_interval = arf_position;
       // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
-    } else if ((rc->frames_to_key - arf_position <
-                AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
-               (rc->frames_to_key != arf_position)) {
+    } else if (rc->frames_to_key + 1 - arf_position <
+               AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) {
       // if possible, merge the last two gf groups
-      if (rc->frames_to_key <= active_max_gf_interval) {
-        rc->baseline_gf_interval = rc->frames_to_key;
+      if (rc->frames_to_key + 1 <= active_max_gf_interval) {
+        rc->baseline_gf_interval = rc->frames_to_key + 1;
         if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
         // if merging the last two gf groups creates a group that is too long,
         // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
       } else {
-        rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+        rc->baseline_gf_interval = rc->frames_to_key + 1 - MIN_FWD_KF_INTERVAL;
         if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
       }
     } else {
-      rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending;
+      rc->baseline_gf_interval = arf_position;
     }
   } else {
-    rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending;
+    rc->baseline_gf_interval = arf_position;
   }
 }
 
@@ -1541,32 +2224,44 @@
   gf_stats->avg_wavelet_energy = 0.0;
   gf_stats->avg_raw_err_stdev = 0.0;
   gf_stats->non_zero_stdev_count = 0;
-
-  gf_stats->allow_alt_ref = 0;
 }
 
 // Analyse and define a gf/arf group.
 #define MAX_GF_BOOST 5400
+/*!\brief Define a GF group.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    this_frame      First pass statistics structure
+ * \param[in]    frame_params    Structure with frame parameters
+ * \param[in]    max_gop_length  Maximum length of the GF group
+ * \param[in]    is_final_pass   Whether this is the final pass for the
+ *                               GF group, or a trial (non-zero)
+ *
+ * \return Nothing is returned. Instead, cpi->gf_group is changed.
+ */
 static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
-                            const EncodeFrameParams *const frame_params,
-                            int max_gop_length, int is_final_pass) {
+                            EncodeFrameParams *frame_params, int max_gop_length,
+                            int is_final_pass) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
-  AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   TWO_PASS *const twopass = &cpi->twopass;
   FIRSTPASS_STATS next_frame;
   const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
   GF_GROUP *gf_group = &cpi->gf_group;
   FRAME_INFO *frame_info = &cpi->frame_info;
+  const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
   int i;
-
   int flash_detected;
   int64_t gf_group_bits;
-  const int is_intra_only = frame_params->frame_type == KEY_FRAME ||
-                            frame_params->frame_type == INTRA_ONLY_FRAME;
-  const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active;
+  const int is_intra_only = rc->frames_since_key == 0;
 
-  cpi->internal_altref_allowed = (oxcf->gf_max_pyr_height > 1);
+  cpi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
@@ -1578,7 +2273,7 @@
   av1_zero(next_frame);
 
   if (has_no_stats_stage(cpi)) {
-    define_gf_group_pass0(cpi, frame_params);
+    define_gf_group_pass0(cpi);
     return;
   }
 
@@ -1591,8 +2286,7 @@
   init_gf_stats(&gf_stats);
   GF_FRAME_STATS first_frame_stats, last_frame_stats;
 
-  gf_stats.allow_alt_ref = is_altref_enabled(cpi);
-  const int can_disable_arf = (oxcf->gf_min_pyr_height == MIN_PYRAMID_LVL);
+  const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
 
   // Load stats for the current frame.
   double mod_frame_err =
@@ -1607,30 +2301,38 @@
 
   // If this is a key frame or the overlay from a previous arf then
   // the error score / cost of this frame has already been accounted for.
-  if (arf_active_or_kf) {
-    gf_stats.gf_group_err -= first_frame_stats.frame_err;
-#if GROUP_ADAPTIVE_MAXQ
-    gf_stats.gf_group_raw_error -= this_frame->coded_error;
-#endif
-    gf_stats.gf_group_skip_pct -= this_frame->intra_skip_pct;
-    gf_stats.gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
-  }
 
   // TODO(urvang): Try logic to vary min and max interval based on q.
   const int active_min_gf_interval = rc->min_gf_interval;
   const int active_max_gf_interval =
       AOMMIN(rc->max_gf_interval, max_gop_length);
 
-  i = 0;
+  i = is_intra_only;
   // get the determined gf group length from rc->gf_intervals
   while (i < rc->gf_intervals[rc->cur_gf_index]) {
-    ++i;
+    // read in the next frame
+    if (EOF == input_stats(twopass, &next_frame)) break;
     // Accumulate error score of frames in this gf group.
     mod_frame_err =
-        calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+        calculate_modified_err(frame_info, twopass, oxcf, &next_frame);
     // accumulate stats for this frame
-    accumulate_this_frame_stats(this_frame, mod_frame_err, &gf_stats);
+    accumulate_this_frame_stats(&next_frame, mod_frame_err, &gf_stats);
 
+    if (i == 0) {
+      first_frame_stats.frame_err = mod_frame_err;
+      first_frame_stats.frame_coded_error = next_frame.coded_error;
+      first_frame_stats.frame_sr_coded_error = next_frame.sr_coded_error;
+      first_frame_stats.frame_tr_coded_error = next_frame.tr_coded_error;
+    }
+
+    ++i;
+  }
+
+  reset_fpf_position(twopass, start_pos);
+
+  i = is_intra_only;
+  input_stats(twopass, &next_frame);
+  while (i < rc->gf_intervals[rc->cur_gf_index]) {
     // read in the next frame
     if (EOF == input_stats(twopass, &next_frame)) break;
 
@@ -1639,12 +2341,14 @@
     flash_detected = detect_flash(twopass, 0);
 
     // accumulate stats for next frame
-    accumulate_next_frame_stats(
-        &next_frame, frame_info, twopass, flash_detected, rc->frames_since_key,
-        i, can_disable_arf, rc->min_gf_interval, &gf_stats);
+    accumulate_next_frame_stats(&next_frame, frame_info, flash_detected,
+                                rc->frames_since_key, i, &gf_stats);
 
-    *this_frame = next_frame;
+    ++i;
   }
+
+  i = rc->gf_intervals[rc->cur_gf_index];
+
   // save the errs for the last frame
   last_frame_stats.frame_coded_error = next_frame.coded_error;
   last_frame_stats.frame_sr_coded_error = next_frame.sr_coded_error;
@@ -1658,7 +2362,7 @@
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+  const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
                           ? cpi->initial_mbs
                           : cm->mi_params.MBs;
   assert(num_mbs > 0);
@@ -1670,8 +2374,7 @@
   //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
   //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
   //                            motion error per block of each frame.
-  const int can_disable_internal_arfs =
-      (oxcf->gf_min_pyr_height <= MIN_PYRAMID_LVL + 1);
+  const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1;
   if (can_disable_internal_arfs &&
       gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
       gf_stats.avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
@@ -1681,15 +2384,14 @@
 
   int use_alt_ref;
   if (can_disable_arf) {
-    use_alt_ref = !is_almost_static(gf_stats.zero_motion_accumulator,
-                                    twopass->kf_zeromotion_pct) &&
-                  gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
-                  (i >= MIN_GF_INTERVAL) &&
-                  (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+    use_alt_ref =
+        !is_almost_static(gf_stats.zero_motion_accumulator,
+                          twopass->kf_zeromotion_pct, cpi->lap_enabled) &&
+        rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) &&
+        (i >= MIN_GF_INTERVAL);
 
     // TODO(urvang): Improve and use model for VBR, CQ etc as well.
-    if (use_alt_ref && cpi->oxcf.rc_mode == AOM_Q &&
-        cpi->oxcf.cq_level <= 200) {
+    if (use_alt_ref && rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 200) {
       aom_clear_system_state();
       float features[21];
       get_features_from_gf_stats(
@@ -1701,9 +2403,8 @@
       use_alt_ref = (score <= 0.0);
     }
   } else {
-    assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
     use_alt_ref =
-        gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i > 2);
+        rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && (i > 2);
   }
 
 #define REDUCE_GF_LENGTH_THRESH 4
@@ -1713,9 +2414,9 @@
   // The length reduction strategy is tweaked for certain cases, and doesn't
   // work well for certain other cases.
   const int allow_gf_length_reduction =
-      ((cpi->oxcf.rc_mode == AOM_Q && cpi->oxcf.cq_level <= 128) ||
+      ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) ||
        !cpi->internal_altref_allowed) &&
-      !is_lossless_requested(&cpi->oxcf);
+      !is_lossless_requested(rc_cfg);
 
   if (allow_gf_length_reduction && use_alt_ref) {
     // adjust length of this gf group if one of the following condition met
@@ -1745,24 +2446,23 @@
   }
 
   // Should we use the alternate reference frame.
+  int ext_len = i - is_intra_only;
   if (use_alt_ref) {
-    rc->source_alt_ref_pending = 1;
-    gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height;
+    gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
     set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
                              is_final_pass);
 
-    const int forward_frames = (rc->frames_to_key - i >= i - 1)
-                                   ? i - 1
+    const int forward_frames = (rc->frames_to_key - i >= ext_len)
+                                   ? ext_len
                                    : AOMMAX(0, rc->frames_to_key - i);
 
     // Calculate the boost for alt ref.
     rc->gfu_boost = av1_calc_arf_boost(
-        twopass, rc, frame_info, alt_offset, forward_frames, (i - 1),
+        twopass, rc, frame_info, alt_offset, forward_frames, ext_len,
         cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
         cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL);
   } else {
     reset_fpf_position(twopass, start_pos);
-    rc->source_alt_ref_pending = 0;
     gf_group->max_layer_depth_allowed = 0;
     set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
                              is_final_pass);
@@ -1770,27 +2470,17 @@
     rc->gfu_boost = AOMMIN(
         MAX_GF_BOOST,
         av1_calc_arf_boost(
-            twopass, rc, frame_info, alt_offset, (i - 1), 0,
+            twopass, rc, frame_info, alt_offset, ext_len, 0,
             cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
             cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL));
   }
 
-  // rc->gf_intervals assumes the usage of alt_ref, therefore adding one overlay
-  // frame to the next gf. If no alt_ref is used, should substract 1 frame from
-  // the next gf group.
-  // TODO(bohanli): should incorporate the usage of alt_ref into
-  // calculate_gf_length
-  if (is_final_pass && rc->source_alt_ref_pending == 0 &&
-      rc->intervals_till_gf_calculate_due > 0) {
-    rc->gf_intervals[rc->cur_gf_index]--;
-  }
-
 #define LAST_ALR_BOOST_FACTOR 0.2f
   rc->arf_boost_factor = 1.0;
-  if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+  if (use_alt_ref && !is_lossless_requested(rc_cfg)) {
     // Reduce the boost of altref in the last gf group
-    if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
-        rc->frames_to_key - i == 0) {
+    if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY ||
+        rc->frames_to_key - ext_len == 0) {
       rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
     }
   }
@@ -1800,6 +2490,12 @@
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
+  if (cpi->lap_enabled) {
+    // Since we don't have enough stats to know the actual error of the
+    // gf group, we assume error of each frame to be equal to 1 and set
+    // the error of the group as baseline_gf_interval.
+    gf_stats.gf_group_err = rc->baseline_gf_interval;
+  }
   // Calculate the bits to be allocated to the gf/arf group as a whole
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats.gf_group_err);
   rc->gf_group_bits = gf_group_bits;
@@ -1810,7 +2506,8 @@
   // where there could be significant overshoot than for easier
   // sections where we do not wish to risk creating an overshoot
   // of the allocated bit budget.
-  if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+  if ((rc_cfg->mode != AOM_Q) && (rc->baseline_gf_interval > 1) &&
+      is_final_pass) {
     const int vbr_group_bits_per_frame =
         (int)(gf_group_bits / rc->baseline_gf_interval);
     const double group_av_err =
@@ -1824,7 +2521,7 @@
     int tmp_q;
     // rc factor is a weight factor that corrects for local rate control drift.
     double rc_factor = 1.0;
-    int64_t bits = cpi->oxcf.target_bandwidth;
+    int64_t bits = rc_cfg->target_bandwidth;
 
     if (bits > 0) {
       int rate_error;
@@ -1850,25 +2547,35 @@
     twopass->kf_group_error_left -= (int64_t)gf_stats.gf_group_err;
 
   // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
-  av1_gop_setup_structure(cpi, frame_params);
+  av1_gop_setup_structure(cpi);
 
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
   // Calculate a section intra ratio used in setting max loop filter.
-  if (frame_params->frame_type != KEY_FRAME) {
+  if (rc->frames_since_key != 0) {
     twopass->section_intra_rating = calculate_section_intra_ratio(
         start_pos, twopass->stats_buf_ctx->stats_in_end,
         rc->baseline_gf_interval);
   }
 
-  // Reset rolling actual and target bits counters for ARF groups.
-  twopass->rolling_arf_group_target_bits = 1;
-  twopass->rolling_arf_group_actual_bits = 1;
+  av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0,
+                         use_alt_ref, gf_group_bits);
 
-  av1_gop_bit_allocation(cpi, rc, gf_group,
-                         frame_params->frame_type == KEY_FRAME, use_alt_ref,
-                         gf_group_bits);
+  frame_params->frame_type =
+      rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME;
+  frame_params->show_frame =
+      !(gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+        gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE);
+
+  // TODO(jingning): Generalize this condition.
+  if (is_final_pass) {
+    cpi->gf_state.arf_gf_boost_lst = use_alt_ref;
+
+    // Reset rolling actual and target bits counters for ARF groups.
+    twopass->rolling_arf_group_target_bits = 1;
+    twopass->rolling_arf_group_actual_bits = 1;
+  }
 }
 
 // #define FIXED_ARF_BITS
@@ -1882,8 +2589,9 @@
 #ifdef FIXED_ARF_BITS
   int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
 #else
-  int gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
-                                         rc->gfu_boost, gf_group_bits);
+  int gf_arf_bits = calculate_boost_bits(
+      rc->baseline_gf_interval - (rc->frames_since_key == 0), rc->gfu_boost,
+      gf_group_bits);
 #endif
 
   gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits,
@@ -1937,13 +2645,33 @@
                              const FIRSTPASS_STATS *last_frame,
                              const FIRSTPASS_STATS *this_frame,
                              const FIRSTPASS_STATS *next_frame,
-                             int frame_count_so_far, enum aom_rc_mode rc_mode) {
+                             int frame_count_so_far, enum aom_rc_mode rc_mode,
+                             int scenecut_mode) {
   int is_viable_kf = 0;
   double pcnt_intra = 1.0 - this_frame->pcnt_inter;
   double modified_pcnt_inter =
       this_frame->pcnt_inter - this_frame->pcnt_neutral;
   const double second_ref_usage_thresh =
       get_second_ref_usage_thresh(frame_count_so_far);
+  int total_frames_to_test = SCENE_CUT_KEY_TEST_INTERVAL;
+  int count_for_tolerable_prediction = 3;
+  int num_future_frames = 0;
+  FIRSTPASS_STATS curr_frame;
+
+  if (scenecut_mode == ENABLE_SCENECUT_MODE_1) {
+    curr_frame = *this_frame;
+    const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+    for (num_future_frames = 0; num_future_frames < SCENE_CUT_KEY_TEST_INTERVAL;
+         num_future_frames++)
+      if (EOF == input_stats(twopass, &curr_frame)) break;
+    reset_fpf_position(twopass, start_position);
+    if (num_future_frames < 3) {
+      return 0;
+    } else {
+      total_frames_to_test = 3;
+      count_for_tolerable_prediction = 1;
+    }
+  }
 
   // Does the frame satisfy the primary criteria of a key frame?
   // See above for an explanation of the test criteria.
@@ -1968,13 +2696,15 @@
           II_IMPROVEMENT_THRESHOLD))))) {
     int i;
     const FIRSTPASS_STATS *start_pos = twopass->stats_in;
-    FIRSTPASS_STATS local_next_frame = *next_frame;
     double boost_score = 0.0;
     double old_boost_score = 0.0;
     double decay_accumulator = 1.0;
 
     // Examine how well the key frame predicts subsequent frames.
-    for (i = 0; i < SCENE_CUT_KEY_TEST_INTERVAL; ++i) {
+    for (i = 0; i < total_frames_to_test; ++i) {
+      // Get the next frame details
+      FIRSTPASS_STATS local_next_frame;
+      if (EOF == input_stats(twopass, &local_next_frame)) break;
       double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
                              DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
 
@@ -2000,23 +2730,19 @@
       }
 
       old_boost_score = boost_score;
-
-      // Get the next frame details
-      if (EOF == input_stats(twopass, &local_next_frame)) break;
     }
 
     // If there is tolerable prediction for at least the next 3 frames then
     // break out else discard this potential key frame and move on
-    if (boost_score > 30.0 && (i > 3)) {
+    if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) {
       is_viable_kf = 1;
     } else {
-      // Reset the file position
-      reset_fpf_position(twopass, start_pos);
-
       is_viable_kf = 0;
     }
-  }
 
+    // Reset the file position
+    reset_fpf_position(twopass, start_pos);
+  }
   return is_viable_kf;
 }
 
@@ -2028,7 +2754,7 @@
 #define MIN_STATIC_KF_BOOST 5400  // Minimum boost for static KF interval
 
 static int detect_app_forced_key(AV1_COMP *cpi) {
-  if (cpi->oxcf.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1;
+  if (cpi->oxcf.kf_cfg.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1;
   int num_frames_to_app_forced_key = is_forced_keyframe_pending(
       cpi->lookahead, cpi->lookahead->max_sz, cpi->compressor_stage);
   if (num_frames_to_app_forced_key != -1) cpi->rc.next_is_fwd_key = 0;
@@ -2054,12 +2780,26 @@
   return projected_kf_boost;
 }
 
+/*!\brief Determine the location of the next key frame
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame when a
+ * scenecut is detected or the maximum key frame distance is reached.
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    this_frame       Pointer to first pass stats
+ * \param[out]   kf_group_err     The total error in the KF group
+ * \param[in]    num_frames_to_detect_scenecut Maximum lookahead frames.
+ *
+ * \return       Number of frames to the next key.
+ */
 static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
                               double *kf_group_err,
                               int num_frames_to_detect_scenecut) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
   double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
   FIRSTPASS_STATS last_frame;
   double decay_accumulator = 1.0;
@@ -2102,13 +2842,15 @@
     input_stats(twopass, this_frame);
 
     // Provided that we are not at the end of the file...
-    if (cpi->rc.enable_scenecut_detection && cpi->oxcf.auto_key &&
+    if ((cpi->rc.enable_scenecut_detection > 0) && kf_cfg->auto_key &&
         twopass->stats_in < twopass->stats_buf_ctx->stats_in_end) {
       double loop_decay_rate;
 
       // Check for a scene cut.
-      if (test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in,
-                            frames_since_key, oxcf->rc_mode)) {
+      if (frames_since_key >= kf_cfg->key_freq_min &&
+          test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in,
+                            frames_since_key, oxcf->rc_cfg.mode,
+                            cpi->rc.enable_scenecut_detection)) {
         scenecut_detected = 1;
         break;
       }
@@ -2127,10 +2869,15 @@
 
       // Special check for transition or high motion followed by a
       // static scene.
-      if (detect_transition_to_still(twopass, rc->min_gf_interval, i,
-                                     cpi->oxcf.key_freq - i, loop_decay_rate,
+      if (frames_since_key >= kf_cfg->key_freq_min &&
+          detect_transition_to_still(twopass, rc->min_gf_interval, i,
+                                     kf_cfg->key_freq_max - i, loop_decay_rate,
                                      decay_accumulator)) {
         scenecut_detected = 1;
+        // In the case of transition followed by a static scene, the key frame
+        // could be a good predictor for the following frames, therefore we
+        // do not use an arf.
+        rc->use_arf_in_this_kf_group = 0;
         break;
       }
 
@@ -2139,8 +2886,8 @@
       ++frames_since_key;
 
       // If we don't have a real key frame within the next two
-      // key_freq intervals then break out of the loop.
-      if (frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+      // key_freq_max intervals then break out of the loop.
+      if (frames_to_key >= 2 * kf_cfg->key_freq_max) break;
     } else {
       ++frames_to_key;
       ++frames_since_key;
@@ -2154,9 +2901,173 @@
   if (cpi->lap_enabled && !scenecut_detected)
     frames_to_key = num_frames_to_next_key;
 
+  if (!kf_cfg->fwd_kf_enabled || scenecut_detected ||
+      twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end)
+    rc->next_is_fwd_key = 0;
+
   return frames_to_key;
 }
 
+static double get_kf_group_avg_error(TWO_PASS *twopass,
+                                     const FIRSTPASS_STATS *first_frame,
+                                     const FIRSTPASS_STATS *start_position,
+                                     int frames_to_key) {
+  FIRSTPASS_STATS cur_frame = *first_frame;
+  int num_frames, i;
+  double kf_group_avg_error = 0.0;
+
+  reset_fpf_position(twopass, start_position);
+
+  for (i = 0; i < frames_to_key; ++i) {
+    kf_group_avg_error += cur_frame.coded_error;
+    if (EOF == input_stats(twopass, &cur_frame)) break;
+  }
+  num_frames = i + 1;
+  num_frames = AOMMIN(num_frames, frames_to_key);
+  kf_group_avg_error = kf_group_avg_error / num_frames;
+
+  return (kf_group_avg_error);
+}
+
+static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err,
+                                 double kf_group_avg_error) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  int64_t kf_group_bits;
+  if (cpi->lap_enabled) {
+    kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth;
+    if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) {
+      const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                              ? cpi->initial_mbs
+                              : cpi->common.mi_params.MBs;
+
+      double vbr_corpus_complexity_lap =
+          cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0;
+      /* Get the average corpus complexity of the frame */
+      vbr_corpus_complexity_lap = vbr_corpus_complexity_lap * num_mbs;
+      kf_group_bits = (int64_t)(
+          kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap));
+    }
+  } else {
+    kf_group_bits = (int64_t)(twopass->bits_left *
+                              (kf_group_err / twopass->modified_error_left));
+  }
+
+  return kf_group_bits;
+}
+
+static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS cur_frame;
+  av1_zero(cur_frame);
+  int num_frames = 0;
+  // Accumulate total stat using available number of stats.
+  for (num_frames = 0; num_frames < (rc->frames_to_key - 1); ++num_frames) {
+    if (EOF == input_stats(twopass, &cur_frame)) break;
+    av1_accumulate_stats(avg_frame_stat, &cur_frame);
+  }
+
+  if (num_frames < 2) {
+    return num_frames;
+  }
+  // Average the total stat
+  avg_frame_stat->weight = avg_frame_stat->weight / num_frames;
+  avg_frame_stat->intra_error = avg_frame_stat->intra_error / num_frames;
+  avg_frame_stat->frame_avg_wavelet_energy =
+      avg_frame_stat->frame_avg_wavelet_energy / num_frames;
+  avg_frame_stat->coded_error = avg_frame_stat->coded_error / num_frames;
+  avg_frame_stat->sr_coded_error = avg_frame_stat->sr_coded_error / num_frames;
+  avg_frame_stat->pcnt_inter = avg_frame_stat->pcnt_inter / num_frames;
+  avg_frame_stat->pcnt_motion = avg_frame_stat->pcnt_motion / num_frames;
+  avg_frame_stat->pcnt_second_ref =
+      avg_frame_stat->pcnt_second_ref / num_frames;
+  avg_frame_stat->pcnt_neutral = avg_frame_stat->pcnt_neutral / num_frames;
+  avg_frame_stat->intra_skip_pct = avg_frame_stat->intra_skip_pct / num_frames;
+  avg_frame_stat->inactive_zone_rows =
+      avg_frame_stat->inactive_zone_rows / num_frames;
+  avg_frame_stat->inactive_zone_cols =
+      avg_frame_stat->inactive_zone_cols / num_frames;
+  avg_frame_stat->MVr = avg_frame_stat->MVr / num_frames;
+  avg_frame_stat->mvr_abs = avg_frame_stat->mvr_abs / num_frames;
+  avg_frame_stat->MVc = avg_frame_stat->MVc / num_frames;
+  avg_frame_stat->mvc_abs = avg_frame_stat->mvc_abs / num_frames;
+  avg_frame_stat->MVrv = avg_frame_stat->MVrv / num_frames;
+  avg_frame_stat->MVcv = avg_frame_stat->MVcv / num_frames;
+  avg_frame_stat->mv_in_out_count =
+      avg_frame_stat->mv_in_out_count / num_frames;
+  avg_frame_stat->new_mv_count = avg_frame_stat->new_mv_count / num_frames;
+  avg_frame_stat->count = avg_frame_stat->count / num_frames;
+  avg_frame_stat->duration = avg_frame_stat->duration / num_frames;
+
+  return num_frames;
+}
+
+static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err,
+                                 double *zero_motion_accumulator,
+                                 double *sr_accumulator, int use_avg_stat) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  FIRSTPASS_STATS frame_stat;
+  av1_zero(frame_stat);
+  int i = 0, num_stat_used = 0;
+  double boost_score = 0.0;
+  const double kf_max_boost =
+      cpi->oxcf.rc_cfg.mode == AOM_Q
+          ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+                   KF_MAX_FRAME_BOOST)
+          : KF_MAX_FRAME_BOOST;
+
+  // Calculate the average using available number of stats.
+  if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat);
+
+  for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) {
+    if (!use_avg_stat && EOF == input_stats(twopass, &frame_stat)) break;
+
+    // Monitor for static sections.
+    // For the first frame in kf group, the second ref indicator is invalid.
+    if (i > 0) {
+      *zero_motion_accumulator =
+          AOMMIN(*zero_motion_accumulator,
+                 get_zero_motion_factor(frame_info, &frame_stat));
+    } else {
+      *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion;
+    }
+
+    // Not all frames in the group are necessarily used in calculating boost.
+    if ((*sr_accumulator < (kf_raw_err * 1.50)) &&
+        (i <= rc->max_gf_interval * 2)) {
+      double frame_boost;
+      double zm_factor;
+
+      // Factor 0.75-1.25 based on how much of frame is static.
+      zm_factor = (0.75 + (*zero_motion_accumulator / 2.0));
+
+      if (i < 2) *sr_accumulator = 0.0;
+      frame_boost = calc_kf_frame_boost(rc, frame_info, &frame_stat,
+                                        sr_accumulator, kf_max_boost);
+      boost_score += frame_boost * zm_factor;
+    }
+  }
+  return boost_score;
+}
+
+/*!\brief Interval(in seconds) to clip key-frame distance to in LAP.
+ */
+#define MAX_KF_BITS_INTERVAL_SINGLE_PASS 5
+
+/*!\brief Determine the next key frame group
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame, and
+ * calculates the bit allocation of the KF group and the keyframe itself.
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    this_frame       Pointer to first pass stats
+ *
+ * \return Nothing is returned.
+ */
 static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
@@ -2165,19 +3076,19 @@
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
   const FIRSTPASS_STATS first_frame = *this_frame;
   FIRSTPASS_STATS next_frame;
   av1_zero(next_frame);
 
   rc->frames_since_key = 0;
+  // Use arfs if possible.
+  rc->use_arf_in_this_kf_group = is_altref_enabled(
+      oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf);
 
   // Reset the GF group data structures.
   av1_zero(*gf_group);
 
-  // Clear the alt ref active flag and last group multi arf flags as they
-  // can never be set for a key frame.
-  rc->source_alt_ref_active = 0;
-
   // KF is always a GF so clear frames till next gf counter.
   rc->frames_till_gf_update_due = 0;
 
@@ -2190,10 +3101,9 @@
     if (num_frames_to_app_forced_key != -1)
       rc->frames_to_key = num_frames_to_app_forced_key;
     else
-      rc->frames_to_key = AOMMAX(1, cpi->oxcf.key_freq);
+      rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max);
     correct_frames_to_key(cpi);
     rc->kf_boost = DEFAULT_KF_BOOST;
-    rc->source_alt_ref_active = 0;
     gf_group->update_type[0] = KF_UPDATE;
     return;
   }
@@ -2206,7 +3116,10 @@
   double kf_mod_err = 0.0;
   double kf_group_err = 0.0;
   double sr_accumulator = 0.0;
-  int frames_to_key;
+  double kf_group_avg_error = 0.0;
+  int frames_to_key, frames_to_key_clipped = INT_MAX;
+  int64_t kf_group_bits_clipped = INT64_MAX;
+
   // Is this a forced key frame by interval.
   rc->this_key_frame_forced = rc->next_key_frame_forced;
 
@@ -2217,12 +3130,12 @@
   kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
 
   frames_to_key =
-      define_kf_interval(cpi, this_frame, &kf_group_err, oxcf->key_freq);
+      define_kf_interval(cpi, this_frame, &kf_group_err, kf_cfg->key_freq_max);
 
   if (frames_to_key != -1)
-    rc->frames_to_key = AOMMIN(oxcf->key_freq, frames_to_key);
+    rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key);
   else
-    rc->frames_to_key = oxcf->key_freq;
+    rc->frames_to_key = kf_cfg->key_freq_max;
 
   if (cpi->lap_enabled) correct_frames_to_key(cpi);
 
@@ -2230,7 +3143,7 @@
   // We already breakout of the loop above at 2x max.
   // This code centers the extra kf if the actual natural interval
   // is between 1x and 2x.
-  if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+  if (kf_cfg->auto_key && rc->frames_to_key > kf_cfg->key_freq_max) {
     FIRSTPASS_STATS tmp_frame = first_frame;
 
     rc->frames_to_key /= 2;
@@ -2249,32 +3162,40 @@
     rc->next_key_frame_forced = 1;
   } else if ((twopass->stats_in == twopass->stats_buf_ctx->stats_in_end &&
               is_stat_consumption_stage_twopass(cpi)) ||
-             rc->frames_to_key >= cpi->oxcf.key_freq) {
+             rc->frames_to_key >= kf_cfg->key_freq_max) {
     rc->next_key_frame_forced = 1;
   } else {
     rc->next_key_frame_forced = 0;
   }
 
+  if (kf_cfg->fwd_kf_enabled) rc->next_is_fwd_key |= rc->next_key_frame_forced;
+
   // Special case for the last key frame of the file.
   if (twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) {
     // Accumulate kf group error.
     kf_group_err +=
         calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+    rc->next_is_fwd_key = 0;
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
-  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+  if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) ||
+      (cpi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) {
     // Maximum number of bits for a single normal frame (not key frame).
-    const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+    const int max_bits = frame_max_bits(rc, oxcf);
 
     // Maximum number of bits allocated to the key frame group.
     int64_t max_grp_bits;
 
+    if (oxcf->rc_cfg.vbr_corpus_complexity_lap) {
+      kf_group_avg_error = get_kf_group_avg_error(
+          twopass, &first_frame, start_position, rc->frames_to_key);
+    }
+
     // Default allocation based on bits left and relative
     // complexity of the section.
-    twopass->kf_group_bits = (int64_t)(
-        twopass->bits_left * (kf_group_err / twopass->modified_error_left));
-
+    twopass->kf_group_bits =
+        get_kf_group_bits(cpi, kf_group_err, kf_group_avg_error);
     // Clip based on maximum per frame rate defined by the user.
     max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
     if (twopass->kf_group_bits > max_grp_bits)
@@ -2284,48 +3205,30 @@
   }
   twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
 
+  if (cpi->lap_enabled) {
+    // In the case of single pass based on LAP, frames to  key may have an
+    // inaccurate value, and hence should be clipped to an appropriate
+    // interval.
+    frames_to_key_clipped =
+        (int)(MAX_KF_BITS_INTERVAL_SINGLE_PASS * cpi->framerate);
+
+    // This variable calculates the bits allocated to kf_group with a clipped
+    // frames_to_key.
+    if (rc->frames_to_key > frames_to_key_clipped) {
+      kf_group_bits_clipped =
+          (int64_t)((double)twopass->kf_group_bits * frames_to_key_clipped /
+                    rc->frames_to_key);
+    }
+  }
+
   // Reset the first pass file position.
   reset_fpf_position(twopass, start_position);
 
   // Scan through the kf group collating various stats used to determine
   // how many bits to spend on it.
-  boost_score = 0.0;
-  const double kf_max_boost =
-      cpi->oxcf.rc_mode == AOM_Q
-          ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
-                   KF_MAX_FRAME_BOOST)
-          : KF_MAX_FRAME_BOOST;
-  for (i = 0; i < (rc->frames_to_key - 1); ++i) {
-    if (EOF == input_stats(twopass, &next_frame)) break;
-
-    // Monitor for static sections.
-    // For the first frame in kf group, the second ref indicator is invalid.
-    if (i > 0) {
-      zero_motion_accumulator =
-          AOMMIN(zero_motion_accumulator,
-                 get_zero_motion_factor(frame_info, &next_frame));
-    } else {
-      zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion;
-    }
-
-    // Not all frames in the group are necessarily used in calculating boost.
-    if ((sr_accumulator < (kf_raw_err * 1.50)) &&
-        (i <= rc->max_gf_interval * 2)) {
-      double frame_boost;
-      double zm_factor;
-
-      // Factor 0.75-1.25 based on how much of frame is static.
-      zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
-
-      if (i < 2) sr_accumulator = 0.0;
-      frame_boost = calc_kf_frame_boost(rc, frame_info, &next_frame,
-                                        &sr_accumulator, kf_max_boost);
-      boost_score += frame_boost * zm_factor;
-    }
-  }
-
+  boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator,
+                                   &sr_accumulator, 0);
   reset_fpf_position(twopass, start_position);
-
   // Store the zero motion percentage
   twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
@@ -2336,7 +3239,15 @@
   rc->kf_boost = (int)boost_score;
 
   if (cpi->lap_enabled) {
-    rc->kf_boost = get_projected_kf_boost(cpi);
+    if (oxcf->rc_cfg.mode == AOM_Q) {
+      rc->kf_boost = get_projected_kf_boost(cpi);
+    } else {
+      // TODO(any): Explore using average frame stats for AOM_Q as well.
+      boost_score = get_kf_boost_score(
+          cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1);
+      reset_fpf_position(twopass, start_position);
+      rc->kf_boost += (int)boost_score;
+    }
   }
 
   // Special case for static / slide show content but don't apply
@@ -2354,8 +3265,12 @@
   }
 
   // Work out how many bits to allocate for the key frame itself.
-  kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
-                                 twopass->kf_group_bits);
+  // In case of LAP enabled for VBR, if the frames_to_key value is
+  // very high, we calculate the bits based on a clipped value of
+  // frames_to_key.
+  kf_bits = calculate_boost_bits(
+      AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, rc->kf_boost,
+      AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped));
   // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
   //        kf_bits, twopass->kf_zeromotion_pct);
   kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
@@ -2368,7 +3283,13 @@
   gf_group->update_type[0] = KF_UPDATE;
 
   // Note the total error score of the kf group minus the key frame itself.
-  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+  if (cpi->lap_enabled)
+    // As we don't have enough stats to know the actual error of the group,
+    // we assume the complexity of each frame to be equal to 1, and set the
+    // error as the number of frames in the group(minus the keyframe).
+    twopass->kf_group_error_left = (int)(rc->frames_to_key - 1);
+  else
+    twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
 
   // Adjust the count of total modified error left.
   // The count of bits left is adjusted elsewhere based on real coded frame
@@ -2402,6 +3323,21 @@
 #endif
 #define DEFAULT_GRP_WEIGHT 1.0
 
+static int get_section_target_bandwidth(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  int section_target_bandwidth;
+  const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
+                                current_frame->frame_number);
+  if (cpi->lap_enabled)
+    section_target_bandwidth = (int)rc->avg_frame_bandwidth;
+  else
+    section_target_bandwidth = (int)(twopass->bits_left / frames_left);
+  return section_target_bandwidth;
+}
+
 static void process_first_pass_stats(AV1_COMP *cpi,
                                      FIRSTPASS_STATS *this_frame) {
   AV1_COMMON *const cm = &cpi->common;
@@ -2409,8 +3345,8 @@
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
 
-  if (cpi->oxcf.rc_mode != AOM_Q && current_frame->frame_number == 0 &&
-      cpi->twopass.stats_buf_ctx->total_stats &&
+  if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 &&
+      cpi->gf_group.index == 0 && cpi->twopass.stats_buf_ctx->total_stats &&
       cpi->twopass.stats_buf_ctx->total_left_stats) {
     if (cpi->lap_enabled) {
       /*
@@ -2420,12 +3356,8 @@
       *cpi->twopass.stats_buf_ctx->total_left_stats =
           *cpi->twopass.stats_buf_ctx->total_stats;
     }
-    const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
-                                  current_frame->frame_number);
-
     // Special case code for first frame.
-    const int section_target_bandwidth =
-        (int)(twopass->bits_left / frames_left);
+    const int section_target_bandwidth = get_section_target_bandwidth(cpi);
     const double section_length =
         twopass->stats_buf_ctx->total_left_stats->count;
     const double section_error =
@@ -2445,7 +3377,7 @@
     rc->last_q[INTER_FRAME] = tmp_q;
     rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
     rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
-    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2;
     rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
   }
 
@@ -2458,7 +3390,7 @@
   if (err == EOF) return;
 
   {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+    const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
                             ? cpi->initial_mbs
                             : cm->mi_params.MBs;
     // The multiplication by 256 reverses a scaling factor of (>> 8)
@@ -2468,10 +3400,6 @@
         log((this_frame->frame_avg_wavelet_energy / num_mbs) + 1.0);
   }
 
-  // Update the total stats remaining structure.
-  if (twopass->stats_buf_ctx->total_left_stats)
-    subtract_stats(twopass->stats_buf_ctx->total_left_stats, this_frame);
-
   // Set the frame content type flag.
   if (this_frame->intra_skip_pct >= FC_ANIMATION_THRESH)
     twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
@@ -2500,71 +3428,72 @@
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &cpi->gf_group;
-  AV1_COMMON *cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
-  if (frame_is_intra_only(cm)) {
-    FeatureFlags *const features = &cm->features;
-    av1_set_screen_content_options(cpi, features);
-    cpi->is_screen_content_type = features->allow_screen_content_tools;
-  }
+  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
 
   if (is_stat_consumption_stage(cpi) && !twopass->stats_in) return;
 
-  if (rc->frames_till_gf_update_due > 0 && !(frame_flags & FRAMEFLAGS_KEY)) {
+  const int update_type = gf_group->update_type[gf_group->index];
+  frame_params->frame_type = gf_group->frame_type[gf_group->index];
+
+  if (gf_group->index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) {
     assert(gf_group->index < gf_group->size);
-    const int update_type = gf_group->update_type[gf_group->index];
 
     setup_target_rate(cpi);
 
     // If this is an arf frame then we dont want to read the stats file or
     // advance the input pointer as we already have what we need.
     if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
-      if (cpi->no_show_kf) {
-        assert(update_type == ARF_UPDATE);
-        frame_params->frame_type = KEY_FRAME;
-      } else {
-        frame_params->frame_type = INTER_FRAME;
-      }
-
       // Do the firstpass stats indicate that this frame is skippable for the
       // partition search?
-      if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+      if (cpi->sf.part_sf.allow_partition_search_skip && oxcf->pass == 2) {
         cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
       }
-
       return;
     }
   }
 
   aom_clear_system_state();
 
-  if (cpi->oxcf.rc_mode == AOM_Q) rc->active_worst_quality = cpi->oxcf.cq_level;
+  if (oxcf->rc_cfg.mode == AOM_Q)
+    rc->active_worst_quality = oxcf->rc_cfg.cq_level;
   FIRSTPASS_STATS this_frame;
   av1_zero(this_frame);
   // call above fn
   if (is_stat_consumption_stage(cpi)) {
-    process_first_pass_stats(cpi, &this_frame);
+    if (gf_group->index < gf_group->size || rc->frames_to_key == 0)
+      process_first_pass_stats(cpi, &this_frame);
   } else {
-    rc->active_worst_quality = cpi->oxcf.cq_level;
+    rc->active_worst_quality = oxcf->rc_cfg.cq_level;
   }
 
   // Keyframe and section processing.
-  if (rc->frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) {
-    FIRSTPASS_STATS this_frame_copy;
-    this_frame_copy = this_frame;
-    frame_params->frame_type = KEY_FRAME;
+  FIRSTPASS_STATS this_frame_copy;
+  this_frame_copy = this_frame;
+  int is_overlay_forward_kf =
+      rc->frames_to_key == 0 &&
+      gf_group->update_type[gf_group->index] == OVERLAY_UPDATE;
+  if (rc->frames_to_key <= 0 && !is_overlay_forward_kf) {
+    assert(rc->frames_to_key >= -1);
     // Define next KF group and assign bits to it.
+    int kf_offset = rc->frames_to_key;
+    if (rc->frames_to_key < 0) {
+      this_frame = *(twopass->stats_in - 1);
+    } else {
+      frame_params->frame_type = KEY_FRAME;
+    }
     find_next_key_frame(cpi, &this_frame);
+    rc->frames_since_key -= kf_offset;
+    rc->frames_to_key += kf_offset;
     this_frame = this_frame_copy;
   } else {
-    frame_params->frame_type = INTER_FRAME;
-    const int altref_enabled = is_altref_enabled(cpi);
-    const int sframe_dist = cpi->oxcf.sframe_dist;
-    const int sframe_mode = cpi->oxcf.sframe_mode;
-    const int sframe_enabled = cpi->oxcf.sframe_enabled;
-    const int update_type = gf_group->update_type[gf_group->index];
+    const int altref_enabled = is_altref_enabled(oxcf->gf_cfg.lag_in_frames,
+                                                 oxcf->gf_cfg.enable_auto_arf);
+    const int sframe_dist = oxcf->kf_cfg.sframe_dist;
+    const int sframe_mode = oxcf->kf_cfg.sframe_mode;
     CurrentFrame *const current_frame = &cpi->common.current_frame;
-    if (sframe_enabled) {
+    if (sframe_dist != 0) {
       if (altref_enabled) {
         if (sframe_mode == 1) {
           // sframe_mode == 1: insert sframe if it matches altref frame.
@@ -2594,52 +3523,125 @@
   }
 
   // Define a new GF/ARF group. (Should always enter here for key frames).
-  if (rc->frames_till_gf_update_due == 0) {
+  if (gf_group->index == gf_group->size) {
     assert(cpi->common.current_frame.frame_number == 0 ||
            gf_group->index == gf_group->size);
     const FIRSTPASS_STATS *const start_position = twopass->stats_in;
-    int num_frames_to_detect_scenecut, frames_to_key;
-    if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection)
-      num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
-    else
-      num_frames_to_detect_scenecut = 0;
-    frames_to_key = define_kf_interval(cpi, &this_frame, NULL,
-                                       num_frames_to_detect_scenecut);
-    reset_fpf_position(twopass, start_position);
-    if (frames_to_key != -1)
-      rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
 
-    int max_gop_length = (cpi->oxcf.lag_in_frames >= 32 &&
-                          is_stat_consumption_stage_twopass(cpi))
-                             ? MAX_GF_INTERVAL
-                             : MAX_GF_LENGTH_LAP;
-    if (rc->intervals_till_gf_calculate_due == 0) {
+    if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection) {
+      int num_frames_to_detect_scenecut, frames_to_key;
+      num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+      frames_to_key = define_kf_interval(cpi, &this_frame, NULL,
+                                         num_frames_to_detect_scenecut);
+      if (frames_to_key != -1)
+        rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+    }
+
+    reset_fpf_position(twopass, start_position);
+
+    int max_gop_length =
+        (oxcf->gf_cfg.lag_in_frames >= 32 &&
+         is_stat_consumption_stage_twopass(cpi))
+            ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames -
+                                          oxcf->algo_cfg.arnr_max_frames / 2)
+            : MAX_GF_LENGTH_LAP;
+
+    // Identify regions if needed.
+    if (rc->frames_since_key == 0 || rc->frames_since_key == 1 ||
+        (rc->frames_till_regions_update - rc->frames_since_key <
+             rc->frames_to_key &&
+         rc->frames_till_regions_update - rc->frames_since_key <
+             max_gop_length + 1)) {
+      int is_first_stat =
+          twopass->stats_in == twopass->stats_buf_ctx->stats_in_start;
+      const FIRSTPASS_STATS *stats_start = twopass->stats_in + is_first_stat;
+      // offset of stats_start from the current frame
+      int offset = is_first_stat || (rc->frames_since_key == 0);
+      // offset of the region indices from the previous key frame
+      rc->regions_offset = rc->frames_since_key;
+      // how many frames we can analyze from this frame
+      int rest_frames = AOMMIN(rc->frames_to_key + rc->next_is_fwd_key,
+                               MAX_FIRSTPASS_ANALYSIS_FRAMES);
+      rest_frames =
+          AOMMIN(rest_frames,
+                 (int)(twopass->stats_buf_ctx->stats_in_end - stats_start + 1) +
+                     offset);
+
+      rc->frames_till_regions_update = rest_frames;
+
+      identify_regions(stats_start, rest_frames - offset, offset, rc->regions,
+                       &rc->num_regions, rc->cor_coeff);
+    }
+
+    int cur_region_idx =
+        find_regions_index(rc->regions, rc->num_regions,
+                           rc->frames_since_key - rc->regions_offset);
+    if ((cur_region_idx >= 0 &&
+         rc->regions[cur_region_idx].type == SCENECUT_REGION) ||
+        rc->frames_since_key == 0) {
+      // If we start from a scenecut, then the last GOP's arf boost is not
+      // needed for this GOP.
+      cpi->gf_state.arf_gf_boost_lst = 0;
+    }
+
+    // TODO(jingning): Resoleve the redundant calls here.
+    if (rc->intervals_till_gf_calculate_due == 0 || 1) {
       calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
     }
 
-    if (max_gop_length > 16) {
-      if (rc->gf_intervals[rc->cur_gf_index] - 1 > 16) {
+    if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
+        !cpi->sf.tpl_sf.disable_gop_length_decision) {
+      int this_idx = rc->frames_since_key + rc->gf_intervals[rc->cur_gf_index] -
+                     rc->regions_offset - 1;
+      int this_region =
+          find_regions_index(rc->regions, rc->num_regions, this_idx);
+      int next_region =
+          find_regions_index(rc->regions, rc->num_regions, this_idx + 1);
+      int is_last_scenecut =
+          (rc->gf_intervals[rc->cur_gf_index] >= rc->frames_to_key ||
+           rc->regions[this_region].type == SCENECUT_REGION ||
+           rc->regions[next_region].type == SCENECUT_REGION);
+      int ori_gf_int = rc->gf_intervals[rc->cur_gf_index];
+
+      if (rc->gf_intervals[rc->cur_gf_index] > 16) {
         // The calculate_gf_length function is previously used with
         // max_gop_length = 32 with look-ahead gf intervals.
         define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
+        this_frame = this_frame_copy;
+        int is_temporal_filter_enabled =
+            (rc->frames_since_key > 0 && gf_group->arf_index > -1);
+        if (is_temporal_filter_enabled) {
+          int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index];
+          av1_temporal_filter(cpi, arf_src_index, NULL);
+          aom_extend_frame_borders(&cpi->alt_ref_buffer,
+                                   av1_num_planes(&cpi->common));
+        }
         if (!av1_tpl_setup_stats(cpi, 1, frame_params, frame_input)) {
           // Tpl decides that a shorter gf interval is better.
           // TODO(jingning): Remove redundant computations here.
           max_gop_length = 16;
           calculate_gf_length(cpi, max_gop_length, 1);
+          if (is_last_scenecut &&
+              (ori_gf_int - rc->gf_intervals[rc->cur_gf_index] < 4)) {
+            rc->gf_intervals[rc->cur_gf_index] = ori_gf_int;
+          }
+        } else {
+          // Tpl stats is reused only when the ARF frame is temporally filtered
+          if (is_temporal_filter_enabled)
+            cpi->tpl_data.skip_tpl_setup_stats = 1;
         }
-      } else {
-        // Even based on 32 we still decide to use a short gf interval.
-        // Better to re-decide based on 16 then
-        max_gop_length = 16;
-        calculate_gf_length(cpi, max_gop_length, 1);
       }
     }
-    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1);
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    cpi->num_gf_group_show_frames = 0;
-    assert(gf_group->index == 0);
+    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
 
+    if (gf_group->update_type[gf_group->index] != ARF_UPDATE &&
+        rc->frames_since_key > 0)
+      process_first_pass_stats(cpi, &this_frame);
+
+    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1);
+
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    assert(gf_group->index == 0);
 #if ARF_STATS_OUTPUT
     {
       FILE *fpfile;
@@ -2656,9 +3658,21 @@
   }
   assert(gf_group->index < gf_group->size);
 
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+    reset_fpf_position(twopass, start_pos);
+  } else {
+    // Update the total stats remaining structure.
+    if (twopass->stats_buf_ctx->total_left_stats)
+      subtract_stats(twopass->stats_buf_ctx->total_left_stats,
+                     &this_frame_copy);
+  }
+
+  frame_params->frame_type = gf_group->frame_type[gf_group->index];
+
   // Do the firstpass stats indicate that this frame is skippable for the
   // partition search?
-  if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+  if (cpi->sf.part_sf.allow_partition_search_skip && oxcf->pass == 2) {
     cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
   }
 
@@ -2687,7 +3701,7 @@
   // first pass.
   av1_new_framerate(cpi, frame_rate);
   twopass->bits_left =
-      (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+      (int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0);
 
   // This variable monitors how far behind the second ref update is lagging.
   twopass->sr_update_lag = 1;
@@ -2700,9 +3714,9 @@
     const FIRSTPASS_STATS *s = twopass->stats_in;
     double modified_error_total = 0.0;
     twopass->modified_error_min =
-        (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+        (avg_error * oxcf->rc_cfg.vbrmin_section) / 100;
     twopass->modified_error_max =
-        (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+        (avg_error * oxcf->rc_cfg.vbrmax_section) / 100;
     while (s < twopass->stats_buf_ctx->stats_in_end) {
       modified_error_total +=
           calculate_modified_err(frame_info, twopass, oxcf, s);
@@ -2767,6 +3781,7 @@
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
   const int bits_used = rc->base_frame_target;
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
 
   // VBR correction is done through rc->vbr_bits_off_target. Based on the
   // sign of this value, a limited % adjustment is made to the target rate
@@ -2795,11 +3810,13 @@
     int i;
     for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
       rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
-      // if (pyramid_level >= 2) {
-      //   rc->active_best_quality[pyramid_level] =
-      //     AOMMAX(rc->active_best_quality[pyramid_level],
-      //            cpi->common.base_qindex);
-      // }
+#if CONFIG_TUNE_VMAF
+      if (cpi->vmaf_info.original_qindex != -1 &&
+          (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+           cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+        rc->active_best_quality[i] = cpi->vmaf_info.original_qindex;
+      }
+#endif
     }
   }
 
@@ -2819,7 +3836,7 @@
             (double)twopass->rolling_arf_group_actual_bits /
                 (double)twopass->rolling_arf_group_target_bits,
             twopass->bpm_factor,
-            av1_convert_qindex_to_q(quant_params->base_qindex,
+            av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex,
                                     cm->seq_params.bit_depth),
             av1_convert_qindex_to_q(rc->active_worst_quality,
                                     cm->seq_params.bit_depth));
@@ -2834,18 +3851,18 @@
   twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
 
   // If the rate control is drifting consider adjustment to min or maxq.
-  if ((cpi->oxcf.rc_mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+  if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
     const int maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
     const int minq_adj_limit =
-        (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+        (rc_cfg->mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
 
     // Undershoot.
-    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+    if (rc->rate_error_estimate > rc_cfg->under_shoot_pct) {
       --twopass->extend_maxq;
       if (rc->rolling_target_bits >= rc->rolling_actual_bits)
         ++twopass->extend_minq;
       // Overshoot.
-    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+    } else if (rc->rate_error_estimate < -rc_cfg->over_shoot_pct) {
       --twopass->extend_minq;
       if (rc->rolling_target_bits < rc->rolling_actual_bits)
         ++twopass->extend_maxq;
diff --git a/av1/encoder/pass2_strategy.h b/av1/encoder/pass2_strategy.h
index 437fb8f..9f6ce22 100644
--- a/av1/encoder/pass2_strategy.h
+++ b/av1/encoder/pass2_strategy.h
@@ -18,8 +18,15 @@
 
 struct AV1_COMP;
 struct EncodeFrameParams;
-// structure of accumulated stats and features in a gf group
+
+#include "av1/encoder/encoder.h"
+
+/*!\endcond */
+/*!
+ * \brief accumulated stats and features in a gf group
+ */
 typedef struct {
+  /*!\cond */
   double gf_group_err;
   double gf_group_raw_error;
   double gf_group_skip_pct;
@@ -43,28 +50,86 @@
   double avg_wavelet_energy;
   double avg_raw_err_stdev;
   int non_zero_stdev_count;
-
-  unsigned int allow_alt_ref;
+  /*!\endcond */
 } GF_GROUP_STATS;
 
+/*!
+ * \brief accumulated stats and features for a frame
+ */
 typedef struct {
+  /*!\cond */
   double frame_err;
   double frame_coded_error;
   double frame_sr_coded_error;
   double frame_tr_coded_error;
+  /*!\endcond */
 } GF_FRAME_STATS;
+/*!cond */
 
 void av1_init_second_pass(struct AV1_COMP *cpi);
 
 void av1_init_single_pass_lap(AV1_COMP *cpi);
 
+/*!\endcond */
+/*!\brief Main per frame entry point for second pass of two pass encode
+ *
+ *\ingroup rate_control
+ *
+ * This function is called for each frame in the second pass of a two pass
+ * encode. It checks the frame type and if a new KF or GF/ARF is due.
+ * When a KF is due it calls find_next_key_frame() to work out how long
+ * this key frame group will be and assign bits to the key frame.
+ * At the start of a new GF/ARF group it calls calculate_gf_length()
+ * and define_gf_group() which are the main functions responsible for
+ * defining the size and structure of the new GF/ARF group.
+ *
+ * \param[in]    cpi           Top - level encoder instance structure
+ * \param[in]    frame_params  Per frame encoding parameters
+ * \param[in]    frame_input   Current and last input frame buffers
+ * \param[in]    frame_flags   Frame type and coding flags
+ *
+ * \return No return but analyses first pass stats and assigns a target
+ *         number of bits to the current frame and a target Q range.
+ */
 void av1_get_second_pass_params(struct AV1_COMP *cpi,
                                 struct EncodeFrameParams *const frame_params,
                                 const EncodeFrameInput *const frame_input,
                                 unsigned int frame_flags);
 
+/*!\brief Adjustments to two pass and rate control after each frame.
+ *
+ *\ingroup rate_control
+ *
+ * This function is called after each frame to make adjustments to
+ * heuristics and data structures that relate to rate control.
+ *
+ * \param[in]    cpi       Top - level encoder instance structure
+ *
+ * \return No return value but this function updates various rate control
+ *         related data structures that for example track overshoot and
+ *         undershoot.
+ */
 void av1_twopass_postencode_update(struct AV1_COMP *cpi);
 
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]   cpi           Top - level encoder instance structure
+ * \param[in]   rc            Rate control data
+ * \param[in]   gf_group      GF/ARF group data structure
+ * \param[in]   is_key_frame  Indicates if the first frame in the group is
+ *                            also a key frame.
+ * \param[in]   use_arf       Are ARF frames enabled or is this a GF only
+ *                            uni-directional group.
+ * \param[in]   gf_group_bits Bits available to be allocated.
+ *
+ * \return No return but updates the rate control and group data structures
+ *         to reflect the allocation of bits.
+ */
 void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
                             GF_GROUP *gf_group, int is_key_frame, int use_arf,
                             int64_t gf_group_bits);
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index a1092fd..c71ef31 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -12,42 +12,53 @@
 #include <math.h>
 #include <string.h>
 
+#include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/system_state.h"
 #include "av1/common/av1_common_int.h"
-#include "av1/common/cdef.h"
 #include "av1/common/reconinter.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/pickcdef.h"
 
-#define REDUCED_PRI_STRENGTHS_LVL1 8
-#define REDUCED_PRI_STRENGTHS_LVL2 5
+// Get primary and secondary filter strength for the given strength index and
+// search method
+static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method,
+                                             int *pri_strength,
+                                             int *sec_strength,
+                                             int strength_idx) {
+  const int tot_sec_filter = (pick_method >= CDEF_FAST_SEARCH_LVL3)
+                                 ? REDUCED_SEC_STRENGTHS_LVL3
+                                 : CDEF_SEC_STRENGTHS;
+  const int pri_idx = strength_idx / tot_sec_filter;
+  const int sec_idx = strength_idx % tot_sec_filter;
+  *pri_strength = pri_idx;
+  *sec_strength = sec_idx;
+  if (pick_method == CDEF_FULL_SEARCH) return;
 
-#define REDUCED_TOTAL_STRENGTHS_LVL1 \
-  (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
-#define REDUCED_TOTAL_STRENGTHS_LVL2 \
-  (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
-#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
-
-static const int priconv_lvl1[REDUCED_TOTAL_STRENGTHS_LVL1] = { 0, 1, 2,  3,
-                                                                5, 7, 10, 13 };
-static const int priconv_lvl2[REDUCED_TOTAL_STRENGTHS_LVL2] = { 0, 2, 4, 8,
-                                                                14 };
-static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
-  TOTAL_STRENGTHS, REDUCED_TOTAL_STRENGTHS_LVL1, REDUCED_TOTAL_STRENGTHS_LVL2,
-  TOTAL_STRENGTHS
-};
-
-// Get primary strength value for the given index and search method
-static INLINE int get_pri_strength(CDEF_PICK_METHOD pick_method, int pri_idx) {
   switch (pick_method) {
-    case CDEF_FAST_SEARCH_LVL1: return priconv_lvl1[pri_idx];
-    case CDEF_FAST_SEARCH_LVL2: return priconv_lvl2[pri_idx];
-    default: assert(0 && "Invalid CDEF primary index"); return -1;
+    case CDEF_FAST_SEARCH_LVL1: *pri_strength = priconv_lvl1[pri_idx]; break;
+    case CDEF_FAST_SEARCH_LVL2: *pri_strength = priconv_lvl2[pri_idx]; break;
+    case CDEF_FAST_SEARCH_LVL3:
+      *pri_strength = priconv_lvl2[pri_idx];
+      *sec_strength = secconv_lvl3[sec_idx];
+      break;
+    case CDEF_FAST_SEARCH_LVL4:
+      *pri_strength = priconv_lvl4[pri_idx];
+      *sec_strength = secconv_lvl3[sec_idx];
+      break;
+    default: assert(0 && "Invalid CDEF search method");
   }
 }
 
+// Store CDEF filter strength calculated from strength index for given search
+// method
+#define STORE_CDEF_FILTER_STRENGTH(cdef_strength, pick_method, strength_idx) \
+  get_cdef_filter_strengths((pick_method), &pri_strength, &sec_strength,     \
+                            (strength_idx));                                 \
+  cdef_strength = pri_strength * CDEF_SEC_STRENGTHS + sec_strength;
+
 /* Search for the best strength to add as an option, knowing we
    already selected nb_strengths options. */
 static uint64_t search_one(int *lev, int nb_strengths,
@@ -141,8 +152,8 @@
                                       int sb_count,
                                       CDEF_PICK_METHOD pick_method) {
   uint64_t best_tot_mse;
-  int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 ||
-              pick_method == CDEF_FAST_SEARCH_LVL2);
+  int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+              pick_method <= CDEF_FAST_SEARCH_LVL4);
   int i;
   best_tot_mse = (uint64_t)1 << 63;
   /* Greedy search: add one strength options at a time. */
@@ -190,14 +201,7 @@
   return best_tot_mse;
 }
 
-typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const void *src,
-                          int src_voffset, int src_hoffset, int sstride,
-                          int vsize, int hsize);
-typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
-                                        cdef_list *dlist, int cdef_count,
-                                        BLOCK_SIZE bsize, int coeff_shift,
-                                        int row, int col);
-
+#if CONFIG_AV1_HIGHBITDEPTH
 static void copy_sb16_16_highbd(uint16_t *dst, int dstride, const void *src,
                                 int src_voffset, int src_hoffset, int sstride,
                                 int vsize, int hsize) {
@@ -207,6 +211,7 @@
   for (r = 0; r < vsize; r++)
     memcpy(dst + r * dstride, base + r * sstride, hsize * sizeof(*base));
 }
+#endif
 
 static void copy_sb16_16(uint16_t *dst, int dstride, const void *src,
                          int src_voffset, int src_hoffset, int sstride,
@@ -219,33 +224,6 @@
       dst[r * dstride + c] = (uint16_t)base[r * sstride + c];
 }
 
-static INLINE uint64_t mse_wxh_16bit_highbd(uint16_t *dst, int dstride,
-                                            uint16_t *src, int sstride, int w,
-                                            int h) {
-  uint64_t sum = 0;
-  int i, j;
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      int e = dst[i * dstride + j] - src[i * sstride + j];
-      sum += e * e;
-    }
-  }
-  return sum;
-}
-
-static INLINE uint64_t mse_wxh_16bit(uint8_t *dst, int dstride, uint16_t *src,
-                                     int sstride, int w, int h) {
-  uint64_t sum = 0;
-  int i, j;
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
-      sum += e * e;
-    }
-  }
-  return sum;
-}
-
 static INLINE void init_src_params(int *src_stride, int *width, int *height,
                                    int *width_log2, int *height_log2,
                                    BLOCK_SIZE bsize) {
@@ -255,7 +233,7 @@
   *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
   *height_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
 }
-
+#if CONFIG_AV1_HIGHBITDEPTH
 /* Compute MSE only on the blocks we filtered. */
 static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src,
                                          cdef_list *dlist, int cdef_count,
@@ -273,13 +251,13 @@
   for (bi = 0; bi < cdef_count; bi++) {
     by = dlist[bi].by;
     bx = dlist[bi].bx;
-    sum += mse_wxh_16bit_highbd(
+    sum += aom_mse_wxh_16bit_highbd(
         &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
         &src[bi << (height_log2 + width_log2)], src_stride, width, height);
   }
   return sum >> 2 * coeff_shift;
 }
-
+#endif
 static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
                                   cdef_list *dlist, int cdef_count,
                                   BLOCK_SIZE bsize, int coeff_shift, int row,
@@ -296,25 +274,221 @@
   for (bi = 0; bi < cdef_count; bi++) {
     by = dlist[bi].by;
     bx = dlist[bi].bx;
-    sum += mse_wxh_16bit(
+    sum += aom_mse_wxh_16bit(
         &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
         &src[bi << (height_log2 + width_log2)], src_stride, width, height);
   }
   return sum >> 2 * coeff_shift;
 }
 
-static int sb_all_skip(const CommonModeInfoParams *const mi_params, int mi_row,
-                       int mi_col) {
-  const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
-  const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64);
-  const int stride = mi_params->mi_stride;
-  MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col;
-  for (int r = 0; r < maxr; ++r, mbmi += stride) {
-    for (int c = 0; c < maxc; ++c) {
-      if (!mbmi[c]->skip) return 0;
+// Calculates MSE at block level.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters related to
+//   CDEF search context.
+//   fbr: Row index in units of 64x64 block
+//   fbc: Column index in units of 64x64 block
+// Returns:
+//   Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr,
+                                int fbc) {
+  const CommonModeInfoParams *const mi_params = &cdef_search_ctx->cm->mi_params;
+  const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref;
+  const int sb_count = cdef_search_ctx->sb_count;
+  const int coeff_shift = cdef_search_ctx->coeff_shift;
+  const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2;
+  const int *mi_high_l2 = cdef_search_ctx->mi_high_l2;
+
+  // Declare and initialize the temporary buffers.
+  DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+  DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
+  cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
+  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
+  int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+  int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+  int hb_step = 1, vb_step = 1;
+  BLOCK_SIZE bs;
+
+  const MB_MODE_INFO *const mbmi =
+      mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                              MI_SIZE_64X64 * fbc];
+
+  uint8_t *ref_buffer[MAX_MB_PLANE] = { ref->y_buffer, ref->u_buffer,
+                                        ref->v_buffer };
+  int ref_stride[MAX_MB_PLANE] = { ref->y_stride, ref->uv_stride,
+                                   ref->uv_stride };
+
+  if (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64 ||
+      mbmi->bsize == BLOCK_64X128) {
+    bs = mbmi->bsize;
+    if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+      nhb = AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+      hb_step = 2;
+    }
+    if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+      nvb = AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+      vb_step = 2;
+    }
+  } else {
+    bs = BLOCK_64X64;
+  }
+  // Get number of 8x8 blocks which are not skip. Cdef processing happens for
+  // 8x8 blocks which are not skip.
+  const int cdef_count = av1_cdef_compute_sb_list(
+      mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
+
+  const int yoff = CDEF_VBORDER * (fbr != 0);
+  const int xoff = CDEF_HBORDER * (fbc != 0);
+  int dirinit = 0;
+  for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) {
+    for (int i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
+    /* We avoid filtering the pixels for which some of the pixels to
+    average are outside the frame. We could change the filter instead,
+    but it would add special cases for any future vectorization. */
+    const int ysize = (nvb << mi_high_l2[pli]) +
+                      CDEF_VBORDER * (fbr + vb_step < cdef_search_ctx->nvfb) +
+                      yoff;
+    const int xsize = (nhb << mi_wide_l2[pli]) +
+                      CDEF_HBORDER * (fbc + hb_step < cdef_search_ctx->nhfb) +
+                      xoff;
+    const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
+    const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
+    struct macroblockd_plane pd = cdef_search_ctx->plane[pli];
+    cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+                             pd.dst.buf, row - yoff, col - xoff, pd.dst.stride,
+                             ysize, xsize);
+    for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) {
+      int pri_strength, sec_strength;
+      get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength,
+                                &sec_strength, gi);
+      av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in,
+                         cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli],
+                         dir, &dirinit, var, pli, dlist, cdef_count,
+                         pri_strength, sec_strength + (sec_strength == 3),
+                         cdef_search_ctx->damping, coeff_shift);
+      const uint64_t curr_mse = cdef_search_ctx->compute_cdef_dist_fn(
+          ref_buffer[pli], ref_stride[pli], tmp_dst, dlist, cdef_count,
+          cdef_search_ctx->bsize[pli], coeff_shift, row, col);
+      if (pli < 2)
+        cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse;
+      else
+        cdef_search_ctx->mse[1][sb_count][gi] += curr_mse;
     }
   }
-  return 1;
+  cdef_search_ctx->sb_index[sb_count] =
+      MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;
+}
+
+// MSE calculation at frame level.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters related to
+//   CDEF search context.
+// Returns:
+//   Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx) {
+  const CommonModeInfoParams *const mi_params = &cdef_search_ctx->cm->mi_params;
+  // Loop over each sb.
+  for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) {
+    for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) {
+      // Checks if cdef processing can be skipped for particular sb.
+      if (cdef_sb_skip(mi_params, fbr, fbc)) continue;
+      // Calculate mse for each sb and store the relevant sb index.
+      cdef_mse_calc_block(cdef_search_ctx, fbr, fbc);
+      cdef_search_ctx->sb_count++;
+    }
+  }
+}
+
+// Allocates memory for members of CdefSearchCtx.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters
+//   related to CDEF search context.
+// Returns:
+//   Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static AOM_INLINE void cdef_alloc_data(CdefSearchCtx *cdef_search_ctx) {
+  const int nvfb = cdef_search_ctx->nvfb;
+  const int nhfb = cdef_search_ctx->nhfb;
+  cdef_search_ctx->sb_index =
+      aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index));
+  cdef_search_ctx->sb_count = 0;
+  cdef_search_ctx->mse[0] =
+      aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb);
+  cdef_search_ctx->mse[1] =
+      aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb);
+}
+
+// Deallocates the memory allocated for members of CdefSearchCtx.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters
+//   related to CDEF search context.
+// Returns:
+//   Nothing will be returned.
+static AOM_INLINE void cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
+  aom_free(cdef_search_ctx->mse[0]);
+  aom_free(cdef_search_ctx->mse[1]);
+  aom_free(cdef_search_ctx->sb_index);
+}
+
+// Initialize the parameters related to CDEF search context.
+// Inputs:
+//   frame: Pointer to compressed frame buffer
+//   ref: Pointer to the frame buffer holding the source frame
+//   cm: Pointer to top level common structure
+//   xd: Pointer to common current coding block structure
+//   cdef_search_ctx: Pointer to the structure containing parameters related to
+//   CDEF search context.
+//   pick_method: Search method used to select CDEF parameters
+// Returns:
+//   Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame,
+                                        const YV12_BUFFER_CONFIG *ref,
+                                        AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        CdefSearchCtx *cdef_search_ctx,
+                                        CDEF_PICK_METHOD pick_method) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  cdef_search_ctx->cm = cm;
+  cdef_search_ctx->ref = ref;
+  cdef_search_ctx->nvfb =
+      (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  cdef_search_ctx->nhfb =
+      (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+  cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6);
+  cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method];
+  cdef_search_ctx->num_planes = num_planes;
+  cdef_search_ctx->pick_method = pick_method;
+  cdef_search_ctx->sb_count = 0;
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+                       num_planes);
+  // Initialize plane wise information.
+  for (int pli = 0; pli < num_planes; pli++) {
+    cdef_search_ctx->xdec[pli] = xd->plane[pli].subsampling_x;
+    cdef_search_ctx->ydec[pli] = xd->plane[pli].subsampling_y;
+    cdef_search_ctx->bsize[pli] =
+        cdef_search_ctx->ydec[pli]
+            ? (cdef_search_ctx->xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+            : (cdef_search_ctx->xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
+    cdef_search_ctx->mi_wide_l2[pli] =
+        MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+    cdef_search_ctx->mi_high_l2[pli] =
+        MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+    cdef_search_ctx->plane[pli] = xd->plane[pli];
+  }
+  // Function pointer initialization.
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (cm->seq_params.use_highbitdepth) {
+    cdef_search_ctx->copy_fn = copy_sb16_16_highbd;
+    cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd;
+  } else {
+    cdef_search_ctx->copy_fn = copy_sb16_16;
+    cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+  }
+#else
+  cdef_search_ctx->copy_fn = copy_sb16_16;
+  cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+#endif
 }
 
 static void pick_cdef_from_qp(AV1_COMMON *const cm) {
@@ -375,148 +549,35 @@
   }
 }
 
-void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
-                     AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
+void av1_cdef_search(const YV12_BUFFER_CONFIG *frame,
+                     const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
+                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method,
                      int rdmult) {
   if (pick_method == CDEF_PICK_FROM_Q) {
     pick_cdef_from_qp(cm);
     return;
   }
-
-  cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
-  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
-  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
   const int damping = 3 + (cm->quant_params.base_qindex >> 6);
-  const int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 ||
-                    pick_method == CDEF_FAST_SEARCH_LVL2);
-  const int total_strengths = nb_cdef_strengths[pick_method];
-  DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+  const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+                    pick_method <= CDEF_FAST_SEARCH_LVL4);
   const int num_planes = av1_num_planes(cm);
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
-                       num_planes);
-  uint64_t(*mse[2])[TOTAL_STRENGTHS];
-  mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
-  mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
+  CdefSearchCtx cdef_search_ctx;
+  // Initialize parameters related to CDEF search context.
+  cdef_params_init(frame, ref, cm, xd, &cdef_search_ctx, pick_method);
+  // Allocate CDEF search context buffers.
+  cdef_alloc_data(&cdef_search_ctx);
+  // Frame level mse calculation.
+  cdef_mse_calc_frame(&cdef_search_ctx);
 
-  int bsize[3];
-  int mi_wide_l2[3];
-  int mi_high_l2[3];
-  int xdec[3];
-  int ydec[3];
-  uint8_t *ref_buffer[3] = { ref->y_buffer, ref->u_buffer, ref->v_buffer };
-  int ref_stride[3] = { ref->y_stride, ref->uv_stride, ref->uv_stride };
-
-  for (int pli = 0; pli < num_planes; pli++) {
-    xdec[pli] = xd->plane[pli].subsampling_x;
-    ydec[pli] = xd->plane[pli].subsampling_y;
-    bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
-                           : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
-    mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
-    mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
-  }
-
-  copy_fn_t copy_fn;
-  compute_cdef_dist_t compute_cdef_dist_fn;
-
-  if (cm->seq_params.use_highbitdepth) {
-    copy_fn = copy_sb16_16_highbd;
-    compute_cdef_dist_fn = compute_cdef_dist_highbd;
-  } else {
-    copy_fn = copy_sb16_16;
-    compute_cdef_dist_fn = compute_cdef_dist;
-  }
-
-  DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
-  uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
-  const int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
-  int sb_count = 0;
-  for (int fbr = 0; fbr < nvfb; ++fbr) {
-    for (int fbc = 0; fbc < nhfb; ++fbc) {
-      // No filtering if the entire filter block is skipped
-      if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
-        continue;
-
-      const MB_MODE_INFO *const mbmi =
-          mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
-                                  MI_SIZE_64X64 * fbc];
-      if (((fbc & 1) &&
-           (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) ||
-          ((fbr & 1) &&
-           (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128)))
-        continue;
-
-      int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
-      int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
-      int hb_step = 1;
-      int vb_step = 1;
-      BLOCK_SIZE bs;
-      if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 ||
-          mbmi->sb_type == BLOCK_64X128) {
-        bs = mbmi->sb_type;
-        if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
-          nhb =
-              AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
-          hb_step = 2;
-        }
-        if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
-          nvb =
-              AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
-          vb_step = 2;
-        }
-      } else {
-        bs = BLOCK_64X64;
-      }
-
-      const int cdef_count = av1_cdef_compute_sb_list(
-          mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
-
-      const int yoff = CDEF_VBORDER * (fbr != 0);
-      const int xoff = CDEF_HBORDER * (fbc != 0);
-      int dirinit = 0;
-      for (int pli = 0; pli < num_planes; pli++) {
-        for (int i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
-        /* We avoid filtering the pixels for which some of the pixels to
-           average are outside the frame. We could change the filter instead,
-           but it would add special cases for any future vectorization. */
-        const int ysize = (nvb << mi_high_l2[pli]) +
-                          CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
-        const int xsize = (nhb << mi_wide_l2[pli]) +
-                          CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
-        const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
-        const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
-        for (int gi = 0; gi < total_strengths; gi++) {
-          int pri_strength = gi / CDEF_SEC_STRENGTHS;
-          if (fast) pri_strength = get_pri_strength(pick_method, pri_strength);
-          const int sec_strength = gi % CDEF_SEC_STRENGTHS;
-          copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
-                  xd->plane[pli].dst.buf, row - yoff, col - xoff,
-                  xd->plane[pli].dst.stride, ysize, xsize);
-          av1_cdef_filter_fb(
-              NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli], dir,
-              &dirinit, var, pli, dlist, cdef_count, pri_strength,
-              sec_strength + (sec_strength == 3), damping, coeff_shift);
-          const uint64_t curr_mse = compute_cdef_dist_fn(
-              ref_buffer[pli], ref_stride[pli], tmp_dst, dlist, cdef_count,
-              bsize[pli], coeff_shift, row, col);
-          if (pli < 2)
-            mse[pli][sb_count][gi] = curr_mse;
-          else
-            mse[1][sb_count][gi] += curr_mse;
-        }
-      }
-      sb_index[sb_count++] =
-          MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;
-    }
-  }
-
-  /* Search for different number of signalling bits. */
+  /* Search for different number of signaling bits. */
   int nb_strength_bits = 0;
   uint64_t best_rd = UINT64_MAX;
   CdefInfo *const cdef_info = &cm->cdef_info;
+  int sb_count = cdef_search_ctx.sb_count;
+  uint64_t(*mse[2])[TOTAL_STRENGTHS];
+  mse[0] = cdef_search_ctx.mse[0];
+  mse[1] = cdef_search_ctx.mse[1];
   for (int i = 0; i <= 3; i++) {
     int best_lev0[CDEF_MAX_STRENGTHS];
     int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
@@ -560,28 +621,24 @@
         best_mse = curr;
       }
     }
-    mi_params->mi_grid_base[sb_index[i]]->cdef_strength = best_gi;
+    mi_params->mi_grid_base[cdef_search_ctx.sb_index[i]]->cdef_strength =
+        best_gi;
   }
 
   if (fast) {
     for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
       const int luma_strength = cdef_info->cdef_strengths[j];
       const int chroma_strength = cdef_info->cdef_uv_strengths[j];
-      int pri_strength;
-      pri_strength =
-          get_pri_strength(pick_method, luma_strength / CDEF_SEC_STRENGTHS);
-      cdef_info->cdef_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
-                                     (luma_strength % CDEF_SEC_STRENGTHS);
-      pri_strength =
-          get_pri_strength(pick_method, chroma_strength / CDEF_SEC_STRENGTHS);
-      cdef_info->cdef_uv_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
-                                        (chroma_strength % CDEF_SEC_STRENGTHS);
+      int pri_strength, sec_strength;
+
+      STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_strengths[j], pick_method,
+                                 luma_strength);
+      STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_uv_strengths[j], pick_method,
+                                 chroma_strength);
     }
   }
 
   cdef_info->cdef_damping = damping;
-
-  aom_free(mse[0]);
-  aom_free(mse[1]);
-  aom_free(sb_index);
+  // Deallocate CDEF search context buffers.
+  cdef_dealloc_data(&cdef_search_ctx);
 }
diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
new file mode 100644
index 0000000..ef342dc
--- /dev/null
+++ b/av1/encoder/pickcdef.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKCDEF_H_
+#define AOM_AV1_ENCODER_PICKCDEF_H_
+
+#include "av1/common/cdef.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define REDUCED_PRI_STRENGTHS_LVL1 8
+#define REDUCED_PRI_STRENGTHS_LVL2 5
+#define REDUCED_SEC_STRENGTHS_LVL3 2
+#define REDUCED_PRI_STRENGTHS_LVL4 2
+
+#define REDUCED_TOTAL_STRENGTHS_LVL1 \
+  (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL2 \
+  (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL3 \
+  (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL4 \
+  (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2,  3,
+                                                              5, 7, 10, 13 };
+static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 };
+static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 };
+static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 };
+static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
+  TOTAL_STRENGTHS,
+  REDUCED_TOTAL_STRENGTHS_LVL1,
+  REDUCED_TOTAL_STRENGTHS_LVL2,
+  REDUCED_TOTAL_STRENGTHS_LVL3,
+  REDUCED_TOTAL_STRENGTHS_LVL4,
+  TOTAL_STRENGTHS
+};
+
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const void *src,
+                          int src_voffset, int src_hoffset, int sstride,
+                          int vsize, int hsize);
+typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
+                                        cdef_list *dlist, int cdef_count,
+                                        BLOCK_SIZE bsize, int coeff_shift,
+                                        int row, int col);
+
+/*! \brief CDEF search context.
+ */
+typedef struct {
+  /*!
+   * Pointer to the frame buffer holding the source frame
+   */
+  const YV12_BUFFER_CONFIG *ref;
+  /*!
+   * Pointer to top level common structure
+   */
+  AV1_COMMON *cm;
+  /*!
+   * Info specific to each plane
+   */
+  struct macroblockd_plane plane[MAX_MB_PLANE];
+  /*!
+   * Function pointer of copy_fn
+   */
+  copy_fn_t copy_fn;
+  /*!
+   * Function pointer of compute_cdef_dist_fn
+   */
+  compute_cdef_dist_t compute_cdef_dist_fn;
+  /*!
+   *  Number of strenghts evaluated in CDEF filter search
+   */
+  int total_strengths;
+  /*!
+   * Bit-depth dependent shift
+   */
+  int coeff_shift;
+  /*!
+   * CDEF damping factor
+   */
+  int damping;
+  /*!
+   * Search method used to select CDEF parameters
+   */
+  int pick_method;
+  /*!
+   * Number of planes
+   */
+  int num_planes;
+  /*!
+   * Log2 of width of the MI unit in pixels. mi_wide_l2[i]
+   * indicates the width of the MI unit in pixels for the ith plane
+   */
+  int mi_wide_l2[MAX_MB_PLANE];
+  /*!
+   * Log2 of height of the MI unit in pixels. mi_high_l2[i]
+   * indicates the height of the MI unit in pixels for the ith plane
+   */
+  int mi_high_l2[MAX_MB_PLANE];
+  /*!
+   * Subsampling in x direction. xdec[i] indicates the subsampling
+   * for the ith plane
+   */
+  int xdec[MAX_MB_PLANE];
+  /*!
+   * Subsampling in y direction. ydec[i] indicates the subsampling
+   * for the ith plane
+   */
+  int ydec[MAX_MB_PLANE];
+  /*!
+   * bsize[i] indicates the block size of ith plane
+   */
+  int bsize[MAX_MB_PLANE];
+  /*!
+   * Number of 64x64 blocks in vertical direction of a frame
+   */
+  int nvfb;
+  /*!
+   * Number of 64x64 blocks in horizontal direction of a frame
+   */
+  int nhfb;
+  /*!
+   * Pointer to the mean squared error between the CDEF filtered block and the
+   * source block. mse[i][j][k] stores the MSE of the ith plane (i=0 corresponds
+   * to Y-plane, i=1 corresponds to U and V planes), jth block and kth strength
+   * index
+   */
+  uint64_t (*mse[2])[TOTAL_STRENGTHS];
+  /*!
+   * Holds the position (in units of mi's) of the cdef filtered
+   * block in raster scan order
+   */
+  int *sb_index;
+  /*!
+   * Holds the count of cdef filtered blocks
+   */
+  int sb_count;
+} CdefSearchCtx;
+
+static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params,
+                              int mi_row, int mi_col) {
+  const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
+  const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64);
+  const int stride = mi_params->mi_stride;
+  MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col;
+  for (int r = 0; r < maxr; ++r, mbmi += stride) {
+    for (int c = 0; c < maxc; ++c) {
+      if (!mbmi[c]->skip_txfm) return 0;
+    }
+  }
+  return 1;
+}
+
+// Checks if cdef processing can be skipped for particular sb.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters related to
+//   CDEF search context.
+//   fbr: Row index in units of 64x64 block
+//   fbc: Column index in units of 64x64 block
+// Returns:
+//   1/0 will be returned to indicate skip/don't skip cdef processing of sb
+//   respectively.
+static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params,
+                               int fbr, int fbc) {
+  const MB_MODE_INFO *const mbmi =
+      mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                              MI_SIZE_64X64 * fbc];
+  // No filtering if the entire filter block is skipped.
+  if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
+    return 1;
+  // Skip odd numbered 64x64 block rows(cols) when bsize is BLOCK_128X128,
+  // BLOCK_64X128(BLOCK_128X128, BLOCK_128X64) as for such blocks CDEF filtering
+  // is done at the corresponding block sizes.
+  if (((fbc & 1) &&
+       (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) ||
+      ((fbr & 1) &&
+       (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128)))
+    return 1;
+  return 0;
+}
+/*!\endcond */
+
+/*!\brief AV1 CDEF parameter search
+ *
+ * \ingroup in_loop_cdef
+ *
+ * Searches for optimal CDEF parameters for frame
+ *
+ * \param[in]      frame        Compressed frame buffer
+ * \param[in]      ref          Source frame buffer
+ * \param[in,out]  cm           Pointer to top level common structure
+ * \param[in]      xd           Pointer to common current coding block structure
+ * \param[in]      pick_method  The method used to select params
+ * \param[in]      rdmult       rd multiplier to use in making param choices
+ *
+ * \return Nothing is returned. Instead, optimal CDEF parameters are stored
+ * in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
+ * \arg \c cdef_bits: Bits of strength parameters
+ * \arg \c nb_cdef_strengths: Number of strength parameters
+ * \arg \c cdef_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the luma plane.
+ * \arg \c uv_cdef_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the chroma planes.
+ * \arg \c damping_factor: CDEF damping factor.
+ *
+ */
+void av1_cdef_search(const YV12_BUFFER_CONFIG *frame,
+                     const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
+                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_ENCODER_PICKCDEF_H_
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index 17c9965..e6446c7 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -49,6 +49,8 @@
 static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
                                 AV1_COMP *const cpi, int filt_level,
                                 int partial_frame, int plane, int dir) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  int num_workers = mt_info->num_workers;
   AV1_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
@@ -69,13 +71,14 @@
 
   // TODO(any): please enable multi-thread and remove the flag when loop
   // filter mask is compatible with multi-thread.
-  if (cpi->num_workers > 1)
+  if (num_workers > 1)
     av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
                              plane + 1, partial_frame,
 #if CONFIG_LPF_MASK
                              0,
 #endif
-                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+                             mt_info->workers, num_workers,
+                             &mt_info->lf_row_sync);
   else
     av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd,
 #if CONFIG_LPF_MASK
@@ -128,6 +131,13 @@
   // Sum squared error at each filter level
   int64_t ss_err[MAX_LOOP_FILTER + 1];
 
+  const int use_coarse_search = cpi->sf.lpf_sf.use_coarse_filter_level_search;
+  assert(use_coarse_search <= 1);
+  static const int min_filter_step_lookup[2] = { 0, 2 };
+  // min_filter_step_thesh determines the stopping criteria for the search.
+  // The search is terminated when filter_step equals min_filter_step_thesh.
+  const int min_filter_step_thesh = min_filter_step_lookup[use_coarse_search];
+
   // Set each entry to -1
   memset(ss_err, 0xFF, sizeof(ss_err));
   yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
@@ -135,7 +145,7 @@
   filt_best = filt_mid;
   ss_err[filt_mid] = best_err;
 
-  while (filter_step > 0) {
+  while (filter_step > min_filter_step_thesh) {
     const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
     const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
 
@@ -193,7 +203,9 @@
   // Update best error
   best_err = ss_err[filt_best];
 
-  if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
+  if (best_cost_ret)
+    *best_cost_ret = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+        x->rdmult, 0, (best_err << 4), cm->seq_params.bit_depth);
   return filt_best;
 }
 
@@ -217,9 +229,13 @@
                                    cm->seq_params.bit_depth);
     // based on tests result for rtc test set
     // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point
-    const int strength_boost_q_treshold = 700;
-    const int inter_frame_multiplier =
-        q > strength_boost_q_treshold ? 12034 : 6017;
+    const int strength_boost_q_treshold = 0;
+    int inter_frame_multiplier =
+        (q > strength_boost_q_treshold ||
+         (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+          cpi->common.width * cpi->common.height > 352 * 288))
+            ? 12034
+            : 6017;
     // These values were determined by linear fitting the result of the
     // searched level for 8 bit depth:
     // Keyframes: filt_guess = q * 0.06699 - 1.60817
diff --git a/av1/encoder/picklpf.h b/av1/encoder/picklpf.h
index 357097a..7273355 100644
--- a/av1/encoder/picklpf.h
+++ b/av1/encoder/picklpf.h
@@ -21,6 +21,141 @@
 struct yv12_buffer_config;
 struct AV1_COMP;
 int av1_get_max_filter_level(const AV1_COMP *cpi);
+
+/*!\brief Algorithm for AV1 loop filter level selection.
+ *
+ * \ingroup in_loop_filter
+ * This function determines proper filter levels used for in-loop filter
+ * (deblock filter).
+ *
+ * \param[in]    sd             The pointer of frame buffer
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    method         The method used to select filter levels
+ *
+ * \par
+ * method includes:
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE:  Try the full image with different values.
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE_NON_DUAL: Try the full image filter search
+ * with non-dual filter only.
+ * \arg \c LPF_PICK_FROM_SUBIMAGE: Try a small portion of the image with
+ * different values.
+ * \arg \c LPF_PICK_FROM_Q: Estimate the level based on quantizer and frame type
+ * \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last
+ * frame
+ *
+ * \return Nothing is returned. Instead, filter levels below are stored in the
+ * "loopfilter" structure inside "cpi":
+ * \arg \c filter_level[0]: the vertical filter level for Y plane
+ * \arg \c filter_level[1]: the horizontal filter level for Y plane
+ * \arg \c filter_level_u: the filter level for U plane
+ * \arg \c filter_level_v: the filter level for V plane
+ *
+ * \n
+ * \b Overview
+ * \par
+ * The workflow of deblock filter is shown in Fig.1. \n
+ * Boundary pixels pass through a non-flatness check, followed by a step that
+ * determines smoothness and selects proper types of filters
+ * (4-, 6-, 8-, 14-tap filter). \n
+ * If non-flatness criteria is not satisfied, the encoder will not apply
+ * deblock filtering on these boundary pixels.
+ * \image html filter_flow.png "Fig.1. The workflow of deblock filter" width=70%
+ *
+ * \par
+ * The non-flatness is determined by the boundary pixels and thresholds as shown
+ * in Fig.2. \n
+ * Filtering is applied when \n
+ * \f$|p_0-p_1|<thr_1\f$   and   \f$|q_0-q_1|<thr_1\f$   and
+ * \f$2*|p_0-q_0|+|p_1-q_1|/2<thr_2\f$ \n
+ * \image html filter_thr.png "Fig.2. Non-flatness of pixel boundary" height=40%
+ *
+ * \par
+ * Thresholds ("thr_1" and "thr_2") are determined by the filter level. \n
+ * In AV1, for each frame, we employ the four filter levels, based on these
+ * observations: \n
+ * Luma and chroma planes have different characteristics, including subsampling
+ * (different plane size), coding quality (chroma planes are better coded). \n
+ * Therefore chroma planes need less deblocking filtering than luma plane. \n
+ * In addition, content texture has different spatial characteristics: vertical
+ * and horizontal direction may need different level of filtering. \n
+ * The selection of these filter levels is described in the following section.
+ *
+ * \par
+ * \b Algorithm
+ * \par
+ * The encoder selects filter levels given the current frame buffer, and the
+ * method. \n
+ * By default, "LPF_PICK_FROM_FULL_IMAGE" is used, which should provide
+ * the most appropriate filter levels. \n
+ * For video on demand (VOD) mode, if speed setting is larger than 5,
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" is used. \n
+ * For real-time mode, if speed setting is larger than 5, "LPF_PICK_FROM_Q" is
+ * used.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE" method: determine filter levels sequentially
+ * by a filter level search procedure (function "search_filter_level"). \n
+ * The order is: \n
+ * First search and determine the filter level for Y plane.
+ * Let vertical filter level (filter_level[0]) and the horizontal filter level
+ * (filter_level[1]) be equal to it. \n
+ * Keep the horizontal filter level the same and search and determine the
+ * vertical filter level. \n
+ * Search and determine the horizontal filter level. \n
+ * Search and determine filter level for U plane. \n
+ * Search and determine filter level for V plane.
+ *
+ * \par
+ * Search and determine filter level is fulfilled by function
+ * "search_filter_level". \n
+ * It starts with a base filter level ("filt_mid") initialized by the
+ * corresponding last frame's filter level. \n
+ * A filter step ("filter_step") is determined as:
+ * filter_step = filt_mid < 16 ? 4 : filt_mid / 4. \n
+ * Then a modified binary search strategy is employed to find a proper
+ * filter level. \n
+ * In each iteration, set filt_low = filt_mid - filter_step,
+ * filt_high = filt_mid + filter_step. \n
+ * We now have three candidate levels, "filt_mid", "filt_low" and "filt_high".
+ * \n
+ * Deblock filtering is applied on the current frame with candidate filter
+ * levels and the sum of squared error (SSE) between source and filtered frame
+ * is computed. \n
+ * Set "filt_best" to the filter level of the smallest SSE. If "filter_best"
+ * equals to "filt_mid", halve the filter_step. Otherwise, set filt_mid =
+ * filt_best. \n
+ * Go to the next iteration until "filter_step" is 0. \n
+ * Note that in the comparison of SSEs between SSE[filt_low] and SSE[filt_mid],
+ * a "bias" is introduced to slightly raise the filter level. \n
+ * It is based on the observation that low filter levels tend to yield a smaller
+ * SSE and produce a higher PSNR for the current frame, \n
+ * while oversmoothing it and degradating the quality for prediction for future
+ * frames and leanding to a suboptimal performance overall. \n
+ * Function "try_filter_frame" is the referrence for applying deblock filtering
+ * with a given filter level and computatition of SSE.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" method: almost the same as
+ * "LPF_PICK_FROM_FULL_IMAGE", \n
+ * just without separately searching for appropriate filter levels for vertical
+ * and horizontal filters.
+ *
+ * \par
+ * "LPF_PICK_FROM_Q" method: filter levels are determined by the
+ * quantization factor (q). \n
+ * For 8 bit: \n
+ *   Keyframes: filt_guess = q * 0.06699 - 1.60817 \n
+ *   Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 \n
+ *   inter_frame_multiplier = q > 700 ? 0.04590 : 0.02295 \n
+ * For 10 bit and 12 bit: \n
+ * filt_guess = q * 0.316206 + 3.87252 \n
+ * Then filter_level[0] = filter_level[1] = filter_level_u = filter_level_v =
+ * clamp(filt_guess, min_filter_level, max_filter_level) \n
+ * Where min_filter_level = 0, max_filter_level = 64 \n
+ * The equations were determined by linear fitting using filter levels
+ * generated by "LPF_PICK_FROM_FULL_IMAGE" method.
+ *
+ */
 void av1_pick_filter_level(const struct yv12_buffer_config *sd,
                            struct AV1_COMP *cpi, LPF_PICK_METHOD method);
 #ifdef __cplusplus
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index ccbe1cc..2196513 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -133,7 +133,7 @@
   RestUnitSearchInfo *rusi;
 
   // Speed features
-  const SPEED_FEATURES *sf;
+  const LOOP_FILTER_SPEED_FEATURES *lpf_sf;
 
   uint8_t *dgd_buffer;
   int dgd_stride;
@@ -166,8 +166,8 @@
 
 static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
                                 const AV1_COMMON *cm, const MACROBLOCK *x,
-                                const SPEED_FEATURES *sf, int plane,
-                                RestUnitSearchInfo *rusi,
+                                const LOOP_FILTER_SPEED_FEATURES *lpf_sf,
+                                int plane, RestUnitSearchInfo *rusi,
                                 YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) {
   rsc->src = src;
   rsc->dst = dst;
@@ -175,7 +175,7 @@
   rsc->x = x;
   rsc->plane = plane;
   rsc->rusi = rusi;
-  rsc->sf = sf;
+  rsc->lpf_sf = lpf_sf;
 
   const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
   const int is_uv = plane != AOM_PLANE_Y;
@@ -631,11 +631,13 @@
   }
 }
 
-static AOM_INLINE void av1_calc_proj_params_high_bd_c(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2],
-    const sgr_params_type *params) {
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height,
+                                    int src_stride, const uint8_t *dat8,
+                                    int dat_stride, int32_t *flt0,
+                                    int flt0_stride, int32_t *flt1,
+                                    int flt1_stride, int64_t H[2][2],
+                                    int64_t C[2],
+                                    const sgr_params_type *params) {
   if ((params->r[0] > 0) && (params->r[1] > 0)) {
     calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8,
                                      dat_stride, flt0, flt0_stride, flt1,
@@ -672,11 +674,20 @@
                              flt0, flt0_stride, flt1, flt1_stride, H, C,
                              params);
     }
-  } else {
-    av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+  }
+#if CONFIG_AV1_HIGHBITDEPTH
+  else {  // NOLINT
+    if ((width & 0x7) == 0) {
+      av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8,
                                    dat_stride, flt0, flt0_stride, flt1,
                                    flt1_stride, H, C, params);
+    } else {
+      av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, flt1,
+                                     flt1_stride, H, C, params);
+    }
   }
+#endif
 
   if (params->r[0] == 0) {
     // H matrix is now only the scalar H[1][1]
@@ -878,7 +889,7 @@
   const int highbd = cm->seq_params.use_highbitdepth;
   const int bit_depth = cm->seq_params.bit_depth;
 
-  const int64_t bits_none = x->sgrproj_restore_cost[0];
+  const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
   // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
   if (rusi->skip_sgr_eval) {
     rsc->bits += bits_none;
@@ -903,7 +914,7 @@
       dgd_start, limits->h_end - limits->h_start,
       limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
       rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
-      tmpbuf, rsc->sf->lpf_sf.enable_sgr_ep_pruning);
+      tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning);
 
   RestorationUnitInfo rui;
   rui.restoration_type = RESTORE_SGRPROJ;
@@ -911,17 +922,16 @@
 
   rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui);
 
-  const int64_t bits_sgr = x->sgrproj_restore_cost[1] +
+  const int64_t bits_sgr = x->mode_costs.sgrproj_restore_cost[1] +
                            (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj)
                             << AV1_PROB_COST_SHIFT);
-
-  double cost_none =
-      RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
-  double cost_sgr =
-      RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]);
+  double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE], bit_depth);
+  double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ], bit_depth);
   if (rusi->sgrproj.ep < 10)
     cost_sgr *=
-        (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level);
+        (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
 
   RestorationType rtype =
       (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
@@ -1191,8 +1201,8 @@
   }
 }
 
-static int wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H,
-                                    int32_t *a, int32_t *b) {
+static void wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H,
+                                     int32_t *a, int32_t *b) {
   static const int32_t init_filt[WIENER_WIN] = {
     WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV,
     WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV,
@@ -1221,7 +1231,6 @@
     update_b_sep_sym(wiener_win, Mc, Hc, a, b);
     iter++;
   }
-  return 1;
 }
 
 // Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares
@@ -1274,7 +1283,7 @@
   const int wiener_halfwin = (wiener_win >> 1);
 
   for (i = 0; i < wiener_halfwin; ++i) {
-    const int64_t dividend = f[i] * WIENER_FILT_STEP;
+    const int64_t dividend = (int64_t)f[i] * WIENER_FILT_STEP;
     const int64_t divisor = WIENER_TAP_SCALE_FACTOR;
     // Perform this division with proper rounding rather than truncation
     if (dividend < 0) {
@@ -1458,10 +1467,10 @@
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
   const MACROBLOCK *const x = rsc->x;
-  const int64_t bits_none = x->wiener_restore_cost[0];
+  const int64_t bits_none = x->mode_costs.wiener_restore_cost[0];
 
   // Skip Wiener search for low variance contents
-  if (rsc->sf->lpf_sf.prune_wiener_based_on_src_var) {
+  if (rsc->lpf_sf->prune_wiener_based_on_src_var) {
     const int scale[3] = { 0, 1, 2 };
     // Obtain the normalized Qscale
     const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0,
@@ -1469,7 +1478,7 @@
                    3;
     // Derive threshold as sqr(normalized Qscale) * scale / 16,
     const uint64_t thresh =
-        (qs * qs * scale[rsc->sf->lpf_sf.prune_wiener_based_on_src_var]) >> 4;
+        (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4;
     const int highbd = rsc->cm->seq_params.use_highbitdepth;
     const uint64_t src_var =
         var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
@@ -1481,8 +1490,7 @@
       rsc->sse += rusi->sse[RESTORE_NONE];
       rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
       rusi->sse[RESTORE_WIENER] = INT64_MAX;
-      if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2)
-        rusi->skip_sgr_eval = 1;
+      if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
       return;
     }
   }
@@ -1491,7 +1499,7 @@
       (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
 
   int reduced_wiener_win = wiener_win;
-  if (rsc->sf->lpf_sf.reduce_wiener_window_size) {
+  if (rsc->lpf_sf->reduce_wiener_window_size) {
     reduced_wiener_win =
         (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA;
   }
@@ -1518,14 +1526,7 @@
                     limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
 #endif
 
-  if (!wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter)) {
-    rsc->bits += bits_none;
-    rsc->sse += rusi->sse[RESTORE_NONE];
-    rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
-    rusi->sse[RESTORE_WIENER] = INT64_MAX;
-    if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
-    return;
-  }
+  wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter);
 
   RestorationUnitInfo rui;
   memset(&rui, 0, sizeof(rui));
@@ -1542,7 +1543,7 @@
     rsc->sse += rusi->sse[RESTORE_NONE];
     rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
     rusi->sse[RESTORE_WIENER] = INT64_MAX;
-    if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
+    if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
     return;
   }
 
@@ -1560,14 +1561,16 @@
   }
 
   const int64_t bits_wiener =
-      x->wiener_restore_cost[1] +
+      x->mode_costs.wiener_restore_cost[1] +
       (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener)
        << AV1_PROB_COST_SHIFT);
 
-  double cost_none =
-      RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
-  double cost_wiener =
-      RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]);
+  double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE],
+      rsc->cm->seq_params.bit_depth);
+  double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER],
+      rsc->cm->seq_params.bit_depth);
 
   RestorationType rtype =
       (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
@@ -1575,9 +1578,9 @@
 
   // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and
   // RESTORE_NONE or based on best_rtype
-  if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 1) {
+  if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) {
     rusi->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
-  } else if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) {
+  } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) {
     rusi->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
   }
 
@@ -1649,11 +1652,11 @@
       default: assert(0); break;
     }
     const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT;
-    const int64_t bits = x->switchable_restore_cost[r] + coeff_bits;
-    double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse);
+    const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits;
+    double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST(x->rdmult, bits >> 4, sse,
+                                                 rsc->cm->seq_params.bit_depth);
     if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
-      cost *=
-          (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level);
+      cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
     if (r == 0 || cost < best_cost) {
       best_cost = cost;
       best_bits = bits;
@@ -1690,7 +1693,8 @@
 
   av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc,
                                  &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
-  return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse);
+  return RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params.bit_depth);
 }
 
 static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
@@ -1700,9 +1704,12 @@
 
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
   const int num_planes = av1_num_planes(cm);
   assert(!cm->features.all_lossless);
 
+  av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+
   int ntiles[2];
   for (int is_uv = 0; is_uv < 2; ++is_uv)
     ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv);
@@ -1717,13 +1724,13 @@
   // problem, as these elements are ignored later, but in order to quiet
   // Valgrind's warnings we initialise the array below.
   memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
-  cpi->td.mb.rdmult = cpi->rd.RDMULT;
+  x->rdmult = cpi->rd.RDMULT;
 
   RestSearchCtxt rsc;
   const int plane_start = AOM_PLANE_Y;
   const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y;
   for (int plane = plane_start; plane <= plane_end; ++plane) {
-    init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi,
+    init_rsc(src, &cpi->common, x, &cpi->sf.lpf_sf, plane, rusi,
              &cpi->trial_frame_rst, &rsc);
 
     const int plane_ntiles = ntiles[plane > 0];
diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h
index eee3055..2463361 100644
--- a/av1/encoder/pickrst.h
+++ b/av1/encoder/pickrst.h
@@ -57,6 +57,39 @@
 }
 #endif
 
+/*!\brief Algorithm for AV1 loop restoration search and estimation.
+ *
+ * \ingroup in_loop_restoration
+ * This function determines proper restoration filter types and
+ * associated parameters for each restoration unit in a frame.
+ *
+ * \param[in]       sd           Source frame buffer
+ * \param[in,out]   cpi          Top-level encoder structure
+ *
+ * \return Nothing is returned. Instead, chosen restoration filter
+ * types and parameters are stored per plane in the \c rst_info structure
+ * of type \ref RestorationInfo inside \c cpi->common:
+ * \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane
+ * \arg \c rst_info[ \c 1 ]: Chosen parameters for U plane if it exists
+ * \arg \c rst_info[ \c 2 ]: Chosen parameters for V plane if it exists
+ * \par
+ * The following fields in each \c rst_info[ \c p], \c p = 0, 1, 2
+ * are populated:
+ * \arg \c rst_info[ \c p ].\c frame_restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * for each \c u in 0, 1, ..., \c n( \c p ) - 1,
+ * where \c n( \c p ) is the number of restoration units in plane \c p.
+ * \par
+ * The following fields in each \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * \c p = 0, 1, 2 and \c u = 0, 1, ..., \c n( \c p ) - 1, of type
+ * \ref RestorationUnitInfo are populated:
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c wiener_info OR
+ *      \c rst_info[ \c p ].\c unit_info[ \c u ].\c sgrproj_info OR
+ *      neither, depending on
+ *      \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ *
+ */
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
 
 #ifdef __cplusplus
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index 433163f..aaa9dfd 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -95,8 +95,9 @@
 
 // How many times less pixels there are to encode given the current scaling.
 // Temporary replacement for rcf_mult and rate_thresh_mult.
-static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) {
-  return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height);
+static double resize_rate_factor(const FrameDimensionCfg *const frm_dim_cfg,
+                                 int width, int height) {
+  return (double)(frm_dim_cfg->width * frm_dim_cfg->height) / (width * height);
 }
 
 // Functions to compute the active minq lookup table entries based on a
@@ -157,9 +158,13 @@
 }
 
 int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                       double correction_factor, aom_bit_depth_t bit_depth) {
+                       double correction_factor, aom_bit_depth_t bit_depth,
+                       const int is_screen_content_type) {
   const double q = av1_convert_qindex_to_q(qindex, bit_depth);
   int enumerator = frame_type == KEY_FRAME ? 2000000 : 1500000;
+  if (is_screen_content_type) {
+    enumerator = frame_type == KEY_FRAME ? 1000000 : 750000;
+  }
 
   assert(correction_factor <= MAX_BPB_FACTOR &&
          correction_factor >= MIN_BPB_FACTOR);
@@ -169,10 +174,10 @@
 }
 
 int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
-                           double correction_factor,
-                           aom_bit_depth_t bit_depth) {
-  const int bpm =
-      (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
+                           double correction_factor, aom_bit_depth_t bit_depth,
+                           const int is_screen_content_type) {
+  const int bpm = (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor,
+                                           bit_depth, is_screen_content_type));
   return AOMMAX(FRAME_OVERHEAD_BITS,
                 (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
 }
@@ -197,9 +202,9 @@
 
   // Clip the frame target to the maximum allowed value.
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
-  if (oxcf->rc_max_inter_bitrate_pct) {
+  if (oxcf->rc_cfg.max_inter_bitrate_pct) {
     const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+        rc->avg_frame_bandwidth * oxcf->rc_cfg.max_inter_bitrate_pct / 100;
     target = AOMMIN(target, max_rate);
   }
 
@@ -208,10 +213,10 @@
 
 int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) {
   const RATE_CONTROL *rc = &cpi->rc;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (oxcf->rc_max_intra_bitrate_pct) {
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+  if (rc_cfg->max_intra_bitrate_pct) {
     const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+        rc->avg_frame_bandwidth * rc_cfg->max_intra_bitrate_pct / 100;
     target = AOMMIN(target, max_rate);
   }
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
@@ -281,28 +286,27 @@
 }
 
 void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
   int i;
 
-  if (pass == 0 && oxcf->rc_mode == AOM_CBR) {
-    rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
-    rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+  if (pass == 0 && rc_cfg->mode == AOM_CBR) {
+    rc->avg_frame_qindex[KEY_FRAME] = rc_cfg->worst_allowed_q;
+    rc->avg_frame_qindex[INTER_FRAME] = rc_cfg->worst_allowed_q;
   } else {
     rc->avg_frame_qindex[KEY_FRAME] =
-        (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+        (rc_cfg->worst_allowed_q + rc_cfg->best_allowed_q) / 2;
     rc->avg_frame_qindex[INTER_FRAME] =
-        (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+        (rc_cfg->worst_allowed_q + rc_cfg->best_allowed_q) / 2;
   }
 
-  rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
-  rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+  rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q;
+  rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q;
 
   rc->buffer_level = rc->starting_buffer_level;
   rc->bits_off_target = rc->starting_buffer_level;
 
   rc->rolling_target_bits = rc->avg_frame_bandwidth;
   rc->rolling_actual_bits = rc->avg_frame_bandwidth;
-  rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
-  rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
 
   rc->total_actual_bits = 0;
   rc->total_target_bits = 0;
@@ -311,37 +315,43 @@
   rc->frames_since_key = 8;  // Sensible default for first frame.
   rc->this_key_frame_forced = 0;
   rc->next_key_frame_forced = 0;
-  rc->source_alt_ref_pending = 0;
-  rc->source_alt_ref_active = 0;
 
   rc->frames_till_gf_update_due = 0;
-  rc->ni_av_qi = oxcf->worst_allowed_q;
+  rc->ni_av_qi = rc_cfg->worst_allowed_q;
   rc->ni_tot_qi = 0;
   rc->ni_frames = 0;
 
   rc->tot_q = 0.0;
-  rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
+  rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q,
+                                      oxcf->tool_cfg.bit_depth);
 
   for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
     rc->rate_correction_factors[i] = 0.7;
   }
   rc->rate_correction_factors[KF_STD] = 1.0;
-  rc->min_gf_interval = oxcf->min_gf_interval;
-  rc->max_gf_interval = oxcf->max_gf_interval;
+  rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+  rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
   if (rc->min_gf_interval == 0)
     rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
-        oxcf->width, oxcf->height, oxcf->init_framerate);
+        oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+        oxcf->input_cfg.init_framerate);
   if (rc->max_gf_interval == 0)
     rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
-        oxcf->init_framerate, rc->min_gf_interval);
+        oxcf->input_cfg.init_framerate, rc->min_gf_interval);
   rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+  rc->avg_frame_low_motion = 0;
+
+  rc->resize_state = ORIG;
+  rc->resize_avg_qp = 0;
+  rc->resize_buffer_underflow = 0;
+  rc->resize_count = 0;
 }
 
 int av1_rc_drop_frame(AV1_COMP *cpi) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
 
-  if (!oxcf->drop_frames_water_mark) {
+  if (!oxcf->rc_cfg.drop_frames_water_mark) {
     return 0;
   } else {
     if (rc->buffer_level < 0) {
@@ -350,8 +360,8 @@
     } else {
       // If buffer is below drop_mark, for now just drop every other frame
       // (starting with the next frame) until it increases back over drop_mark.
-      int drop_mark =
-          (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100);
+      int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark *
+                            rc->optimal_buffer_level / 100);
       if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
         --rc->decimation_factor;
       } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
@@ -376,6 +386,7 @@
 static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1_COMMON *const cm = &cpi->common;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
   const int max_delta = 16;
   const int change_avg_frame_bandwidth =
       abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
@@ -389,14 +400,37 @@
   // Apply some control/clamp to QP under certain conditions.
   if (cm->current_frame.frame_type != KEY_FRAME && !cpi->use_svc &&
       rc->frames_since_key > 1 && !change_target_bits_mb &&
-      (!cpi->oxcf.gf_cbr_boost_pct ||
-       !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame))) {
+      (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct ||
+       !(refresh_frame_flags->alt_ref_frame ||
+         refresh_frame_flags->golden_frame))) {
     // Make sure q is between oscillating Qs to prevent resonance.
     if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
         rc->q_1_frame != rc->q_2_frame) {
       q = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
                 AOMMAX(rc->q_1_frame, rc->q_2_frame));
     }
+    // Adjust Q base on source content change from scene detection.
+    if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 &&
+        rc->frames_since_key > 10) {
+      const int bit_depth = cm->seq_params.bit_depth;
+      double delta =
+          (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0;
+      // Push Q downwards if content change is decreasing and buffer level
+      // is stable (at least 1/4-optimal level), so not overshooting. Do so
+      // only for high Q to avoid excess overshoot.
+      // Else reduce decrease in Q from previous frame if content change is
+      // increasing and buffer is below max (so not undershooting).
+      if (delta < 0.0 && rc->buffer_level > (rc->optimal_buffer_level >> 2) &&
+          q > (rc->worst_quality >> 1)) {
+        double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta);
+        double q_val = av1_convert_qindex_to_q(q, bit_depth);
+        q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+      } else if (rc->q_1_frame - q > 0 && delta > 0.1 &&
+                 rc->buffer_level < AOMMIN(rc->maximum_buffer_size,
+                                           rc->optimal_buffer_level << 1)) {
+        q = (3 * q + rc->q_1_frame) >> 2;
+      }
+    }
     // Limit the decrease in Q from previous frame.
     if (rc->q_1_frame - q > max_delta) q = rc->q_1_frame - max_delta;
   }
@@ -425,9 +459,23 @@
   return rate_factor_levels[update_type];
 }
 
+/*!\brief Gets a rate vs Q correction factor
+ *
+ * This function returns the current value of a correction factor used to
+ * dynamilcally adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return Returns a correction factor for the current frame
+ */
 static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
                                          int height) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
   double rcf;
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
@@ -436,23 +484,41 @@
     const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
     rcf = rc->rate_correction_factors[rf_lvl];
   } else {
-    if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+    if ((refresh_frame_flags->alt_ref_frame ||
+         refresh_frame_flags->golden_frame) &&
         !rc->is_src_frame_alt_ref && !cpi->use_svc &&
-        (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+        (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+         cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
       rcf = rc->rate_correction_factors[GF_ARF_STD];
     else
       rcf = rc->rate_correction_factors[INTER_NORMAL];
   }
-  rcf *= resize_rate_factor(cpi, width, height);
+  rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
   return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 }
 
+/*!\brief Sets a rate vs Q correction factor
+ *
+ * This function updates the current value of a correction factor used to
+ * dynamilcally adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   factor                New correction factor
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return None but updates the rate correction factor for the
+ *         current frame type in cpi->rc.
+ */
 static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width,
                                        int height) {
   RATE_CONTROL *const rc = &cpi->rc;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
 
   // Normalize RCF to account for the size-dependent scaling factor.
-  factor /= resize_rate_factor(cpi, width, height);
+  factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
 
   factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 
@@ -462,9 +528,11 @@
     const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
     rc->rate_correction_factors[rf_lvl] = factor;
   } else {
-    if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+    if ((refresh_frame_flags->alt_ref_frame ||
+         refresh_frame_flags->golden_frame) &&
         !rc->is_src_frame_alt_ref && !cpi->use_svc &&
-        (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+        (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+         cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
       rc->rate_correction_factors[GF_ARF_STD] = factor;
     else
       rc->rate_correction_factors[INTER_NORMAL] = factor;
@@ -491,13 +559,14 @@
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
   // Stay in double to avoid int overflow when values are large
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
     projected_size_based_on_q =
         av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
   } else {
     projected_size_based_on_q = av1_estimate_bits_at_q(
         cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs,
-        rate_correction_factor, cm->seq_params.bit_depth);
+        rate_correction_factor, cm->seq_params.bit_depth,
+        cpi->is_screen_content_type);
   }
   // Work out a size correction factor.
   if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
@@ -552,18 +621,32 @@
   return use_cyclic_refresh
              ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
              : av1_rc_bits_per_mb(cm->current_frame.frame_type, q,
-                                  correction_factor, cm->seq_params.bit_depth);
+                                  correction_factor, cm->seq_params.bit_depth,
+                                  cpi->is_screen_content_type);
 }
 
-// Similar to find_qindex_by_rate() function in ratectrl.c, but returns the q
-// index with rate just above or below the desired rate, depending on which of
-// the two rates is closer to the desired rate.
-// Also, respects the selected aq_mode when computing the rate.
+/*!\brief Searches for a Q index value predicted to give an average macro
+ * block rate closest to the target value.
+ *
+ * Similar to find_qindex_by_rate() function, but returns a q index with a
+ * rate just above or below the desired rate, depending on which of the two
+ * rates is closer to the desired rate.
+ * Also, respects the selected aq_mode when computing the rate.
+ *
+ * \ingroup rate_control
+ * \param[in]   desired_bits_per_mb   Target bits per mb
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   correction_factor     Current Q to rate correction factor
+ * \param[in]   best_qindex           Min allowed Q value.
+ * \param[in]   worst_qindex          Max allowed Q value.
+ *
+ * \return Returns a correction factor for the current frame
+ */
 static int find_closest_qindex_by_rate(int desired_bits_per_mb,
                                        const AV1_COMP *cpi,
                                        double correction_factor,
                                        int best_qindex, int worst_qindex) {
-  const int use_cyclic_refresh = cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+  const int use_cyclic_refresh = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
                                  cpi->cyclic_refresh->apply_cyclic_refresh;
 
   // Find 'qindex' based on 'desired_bits_per_mb'.
@@ -621,7 +704,7 @@
   int q =
       find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor,
                                   active_best_quality, active_worst_quality);
-  if (cpi->oxcf.rc_mode == AOM_CBR && has_no_stats_stage(cpi))
+  if (cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi))
     return adjust_q_cbr(cpi, q, active_worst_quality);
 
   return q;
@@ -668,8 +751,9 @@
   return arfgf_high_motion_minq[q];
 }
 
-static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
+static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
   const unsigned int curr_frame = cpi->common.current_frame.frame_number;
   int active_worst_quality;
 
@@ -677,9 +761,9 @@
     active_worst_quality =
         curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2;
   } else {
-    if (!rc->is_src_frame_alt_ref &&
-        (cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame ||
-         cpi->refresh_alt_ref_frame)) {
+    if (!rc->is_src_frame_alt_ref && (refresh_frame_flags->golden_frame ||
+                                      refresh_frame_flags->bwd_ref_frame ||
+                                      refresh_frame_flags->alt_ref_frame)) {
       active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
                                              : rc->last_q[INTER_FRAME];
     } else {
@@ -691,7 +775,7 @@
 }
 
 // Adjust active_worst_quality level based on buffer level.
-static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
+static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) {
   // Adjust active_worst_quality: If buffer is above the optimal/target level,
   // bring active_worst_quality down depending on fullness of buffer.
   // If buffer is below the optimal level, let the active_worst_quality go from
@@ -746,21 +830,20 @@
   return active_worst_quality;
 }
 
-static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
-                                             int height, int *bottom_index,
-                                             int *top_index) {
+// Calculate the active_best_quality level.
+static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi,
+                                                 int active_worst_quality,
+                                                 int width, int height) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
   const CurrentFrame *const current_frame = &cm->current_frame;
-  int active_best_quality;
-  int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
-  int q;
   int *rtc_minq;
   const int bit_depth = cm->seq_params.bit_depth;
+  int active_best_quality = rc->best_quality;
   ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
 
   if (frame_is_intra_only(cm)) {
-    active_best_quality = rc->best_quality;
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
@@ -774,15 +857,12 @@
       // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
       double q_val;
-
       active_best_quality =
           get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
-
       // Allow somewhat lower kf minq with small image formats.
       if ((width * height) <= (352 * 288)) {
         q_adj_factor -= 0.25;
       }
-
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
       q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
@@ -790,32 +870,58 @@
           av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
-             cpi->oxcf.gf_cbr_boost_pct &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+             cpi->oxcf.rc_cfg.gf_cbr_boost_pct &&
+             (refresh_frame_flags->golden_frame ||
+              refresh_frame_flags->alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
+    int q = active_worst_quality;
     if (rc->frames_since_key > 1 &&
         rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
       q = rc->avg_frame_qindex[INTER_FRAME];
-    } else {
-      q = active_worst_quality;
     }
     active_best_quality = get_gf_active_quality(rc, q, bit_depth);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
-    if (current_frame->frame_number > 1) {
-      if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
-        active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
-      else
-        active_best_quality = rtc_minq[active_worst_quality];
-    } else {
-      if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
-        active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
-      else
-        active_best_quality = rtc_minq[active_worst_quality];
-    }
+    FRAME_TYPE frame_type =
+        (current_frame->frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+    if (rc->avg_frame_qindex[frame_type] < active_worst_quality)
+      active_best_quality = rtc_minq[rc->avg_frame_qindex[frame_type]];
+    else
+      active_best_quality = rtc_minq[active_worst_quality];
   }
+  return active_best_quality;
+}
+
+/*!\brief Picks q and q bounds given CBR rate control parameters in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Constant bit-rate mode: \c cpi->oxcf.rc_cfg.mode == \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width,
+                                             int height, int *bottom_index,
+                                             int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  int q;
+  const int bit_depth = cm->seq_params.bit_depth;
+  int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+  int active_best_quality = calc_active_best_quality_no_stats_cbr(
+      cpi, active_worst_quality, width, height);
+  assert(has_no_stats_stage(cpi));
+  assert(cpi->oxcf.rc_cfg.mode == AOM_CBR);
 
   // Clip the active best and worst quality values to limits
   active_best_quality =
@@ -828,11 +934,12 @@
 
   // Limit Q range for the adaptive loop.
   if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
-      !(current_frame->frame_number == 0)) {
+      current_frame->frame_number != 0) {
     int qdelta = 0;
     aom_clear_system_state();
     qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type,
-                                        active_worst_quality, 2.0, bit_depth);
+                                        active_worst_quality, 2.0,
+                                        cpi->is_screen_content_type, bit_depth);
     *top_index = active_worst_quality + qdelta;
     *top_index = AOMMAX(*top_index, *bottom_index);
   }
@@ -865,15 +972,17 @@
 
 static int get_active_cq_level(const RATE_CONTROL *rc,
                                const AV1EncoderConfig *const oxcf,
-                               int intra_only, SUPERRES_MODE superres_mode,
+                               int intra_only, aom_superres_mode superres_mode,
                                int superres_denom) {
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
   static const double cq_adjust_threshold = 0.1;
-  int active_cq_level = oxcf->cq_level;
+  int active_cq_level = rc_cfg->cq_level;
   (void)intra_only;
-  if (oxcf->rc_mode == AOM_CQ || oxcf->rc_mode == AOM_Q) {
+  if (rc_cfg->mode == AOM_CQ || rc_cfg->mode == AOM_Q) {
     // printf("Superres %d %d %d = %d\n", superres_denom, intra_only,
     //        rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1));
-    if ((superres_mode == SUPERRES_QTHRESH || superres_mode == SUPERRES_AUTO) &&
+    if ((superres_mode == AOM_SUPERRES_QTHRESH ||
+         superres_mode == AOM_SUPERRES_AUTO) &&
         superres_denom != SCALE_NUMERATOR) {
       int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO;
       if (intra_only && rc->frames_to_key <= 1) {
@@ -887,7 +996,7 @@
           active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0);
     }
   }
-  if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) {
+  if (rc_cfg->mode == AOM_CQ && rc->total_target_bits > 0) {
     const double x = (double)rc->total_actual_bits / rc->total_target_bits;
     if (x < cq_adjust_threshold) {
       active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
@@ -896,18 +1005,36 @@
   return active_cq_level;
 }
 
+/*! \brief Pick q index for this frame using fixed q index offsets.
+ *
+ * The q index offsets are fixed in the sense that they are independent of the
+ * video content. The offsets for each pyramid level are taken from
+ * \c oxcf->q_cfg.fixed_qp_offsets array.
+ *
+ * \ingroup rate_control
+ * \param[in]   oxcf        Top level encoder configuration
+ * \param[in]   rc          Top level rate control structure
+ * \param[in]   gf_group    Configuration of current golden frame group
+ * \param[in]   gf_index    Index of this frame in the golden frame group
+ * \param[in]   cq_level    Upper bound for q index (this may be same as
+ *                          \c oxcf->cq_level, or slightly modified for some
+ *                          special cases)
+ * \param[in]   bit_depth   Bit depth of the codec (same as
+ *                          \c cm->seq_params.bit_depth)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
 static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf,
                                      const RATE_CONTROL *const rc,
                                      const GF_GROUP *const gf_group,
                                      int gf_index, int cq_level,
                                      int bit_depth) {
-  assert(oxcf->use_fixed_qp_offsets);
-  assert(oxcf->rc_mode == AOM_Q);
+  assert(oxcf->q_cfg.use_fixed_qp_offsets);
+  assert(oxcf->rc_cfg.mode == AOM_Q);
   const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_index];
 
   int offset_idx = -1;
   if (update_type == KF_UPDATE) {
-    if (rc->frames_to_key == 1) {
+    if (rc->frames_to_key <= 1) {
       // Image / intra-only coding: ignore offsets.
       return cq_level;
     }
@@ -923,42 +1050,70 @@
     return cq_level;  // Directly Return worst quality allowed.
   }
   assert(offset_idx >= 0 && offset_idx < FIXED_QP_OFFSET_COUNT);
-  assert(oxcf->fixed_qp_offsets[offset_idx] >= 0);
+  assert(oxcf->q_cfg.fixed_qp_offsets[offset_idx] >= 0);
 
   // Get qindex offset, by first converting to 'q' and then back.
   const double q_val_orig = av1_convert_qindex_to_q(cq_level, bit_depth);
   const double q_val_target =
-      AOMMAX(q_val_orig - oxcf->fixed_qp_offsets[offset_idx], 0.0);
+      AOMMAX(q_val_orig - oxcf->q_cfg.fixed_qp_offsets[offset_idx], 0.0);
   const int delta_qindex =
       av1_compute_qdelta(rc, q_val_orig, q_val_target, bit_depth);
   return AOMMAX(cq_level + delta_qindex, 0);
 }
 
-static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
-                                             int height, int *bottom_index,
-                                             int *top_index) {
+/*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Any rate control other than constant bit-rate mode:
+ * \c cpi->oxcf.rc_cfg.mode != \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[in]       gf_index     Index of this frame in the golden frame group
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width,
+                                         int height, int gf_index,
+                                         int *bottom_index, int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
+
+  assert(has_no_stats_stage(cpi));
+  assert(rc_mode == AOM_VBR ||
+         (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) ||
+         rc_mode == AOM_Q);
+  assert(
+      IMPLIES(rc_mode == AOM_Q, gf_group->update_type[gf_index] == ARF_UPDATE));
+
   const int cq_level =
       get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
                           cm->superres_scale_denominator);
   const int bit_depth = cm->seq_params.bit_depth;
 
-  if (oxcf->use_fixed_qp_offsets) {
-    return get_q_using_fixed_offsets(oxcf, rc, &cpi->gf_group,
-                                     cpi->gf_group.index, cq_level, bit_depth);
+  if (oxcf->q_cfg.use_fixed_qp_offsets) {
+    return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_index, cq_level,
+                                     bit_depth);
   }
 
   int active_best_quality;
-  int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+  int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi);
   int q;
   int *inter_minq;
   ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
-    if (oxcf->rc_mode == AOM_Q) {
+    if (rc_mode == AOM_Q) {
       const int qindex = cq_level;
       const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex =
@@ -990,7 +1145,8 @@
       }
     }
   } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+             (refresh_frame_flags->golden_frame ||
+              refresh_frame_flags->alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
@@ -999,16 +1155,16 @@
             ? rc->avg_frame_qindex[INTER_FRAME]
             : rc->avg_frame_qindex[KEY_FRAME];
     // For constrained quality dont allow Q less than the cq level
-    if (oxcf->rc_mode == AOM_CQ) {
+    if (rc_mode == AOM_CQ) {
       if (q < cq_level) q = cq_level;
       active_best_quality = get_gf_active_quality(rc, q, bit_depth);
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
-    } else if (oxcf->rc_mode == AOM_Q) {
+    } else if (rc_mode == AOM_Q) {
       const int qindex = cq_level;
       const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex =
-          (cpi->refresh_alt_ref_frame)
+          (refresh_frame_flags->alt_ref_frame)
               ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
               : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
@@ -1016,7 +1172,7 @@
       active_best_quality = get_gf_active_quality(rc, q, bit_depth);
     }
   } else {
-    if (oxcf->rc_mode == AOM_Q) {
+    if (rc_mode == AOM_Q) {
       const int qindex = cq_level;
       const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
@@ -1033,7 +1189,7 @@
                                 : inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
       // For the constrained quality mode we don't want
       // q to fall below the cq level.
-      if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+      if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
         active_best_quality = cq_level;
       }
     }
@@ -1053,20 +1209,22 @@
     int qdelta = 0;
     aom_clear_system_state();
     if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
-        !(current_frame->frame_number == 0)) {
-      qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type,
-                                          active_worst_quality, 2.0, bit_depth);
+        current_frame->frame_number != 0) {
+      qdelta = av1_compute_qdelta_by_rate(
+          &cpi->rc, current_frame->frame_type, active_worst_quality, 2.0,
+          cpi->is_screen_content_type, bit_depth);
     } else if (!rc->is_src_frame_alt_ref &&
-               (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-      qdelta =
-          av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type,
-                                     active_worst_quality, 1.75, bit_depth);
+               (refresh_frame_flags->golden_frame ||
+                refresh_frame_flags->alt_ref_frame)) {
+      qdelta = av1_compute_qdelta_by_rate(
+          &cpi->rc, current_frame->frame_type, active_worst_quality, 1.75,
+          cpi->is_screen_content_type, bit_depth);
     }
     *top_index = active_worst_quality + qdelta;
     *top_index = AOMMAX(*top_index, *bottom_index);
   }
 
-  if (oxcf->rc_mode == AOM_Q) {
+  if (rc_mode == AOM_Q) {
     q = active_best_quality;
     // Special case code to try and match quality with forced key frames
   } else if ((current_frame->frame_type == KEY_FRAME) &&
@@ -1091,42 +1249,41 @@
   return q;
 }
 
-static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
-  1.00,  // INTER_NORMAL
-  1.50,  // GF_ARF_LOW
-  2.00,  // GF_ARF_STD
-  2.00,  // KF_STD
-};
-
+static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75,
+                                                             1.50, 1.25, 1.15,
+                                                             1.0 };
 int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
-  const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
-  const FRAME_TYPE frame_type = (rf_lvl == KF_STD) ? KEY_FRAME : INTER_FRAME;
-  double rate_factor;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(gf_group);
+  const FRAME_TYPE frame_type = gf_group->frame_type[gf_group->index];
+  const int arf_layer = AOMMIN(gf_group->layer_depth[gf_group->index], 6);
+  const double rate_factor =
+      (rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer];
 
-  rate_factor = rate_factor_deltas[rf_lvl];
-  if (rf_lvl == GF_ARF_LOW) {
-    rate_factor -= (cpi->gf_group.layer_depth[cpi->gf_group.index] - 2) * 0.1;
-    rate_factor = AOMMAX(rate_factor, 1.0);
-  }
   return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor,
+                                    cpi->is_screen_content_type,
                                     cpi->common.seq_params.bit_depth);
 }
 
 // This unrestricted Q selection on CQ mode is useful when testing new features,
 // but may lead to Q being out of range on current RC restrictions
 #if USE_UNRESTRICTED_Q_IN_CQ_MODE
-static int rc_pick_q_and_bounds_one_pass_cq(const AV1_COMP *cpi, int width,
+static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width,
                                             int height, int *bottom_index,
                                             int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm),
-                                           cm->superres_scale_denominator);
+  const int cq_level =
+      get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+                          cm->superres_scale_denominator);
   const int bit_depth = cm->seq_params.bit_depth;
   const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
   (void)width;
   (void)height;
+  assert(has_no_stats_stage(cpi));
+  assert(cpi->oxcf.rc_cfg.mode == AOM_CQ);
+
   *top_index = q;
   *bottom_index = q;
 
@@ -1135,10 +1292,9 @@
 #endif  // USE_UNRESTRICTED_Q_IN_CQ_MODE
 
 #define STATIC_MOTION_THRESH 95
-static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
-                                            int height, int *active_best,
-                                            int *active_worst, int cq_level,
-                                            int is_fwd_kf) {
+static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+                                   int *active_best, int *active_worst,
+                                   int cq_level, int is_fwd_kf) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1146,7 +1302,7 @@
   int active_worst_quality = *active_worst;
   const int bit_depth = cm->seq_params.bit_depth;
 
-  if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
+  if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) {
     // If the next frame is also a key frame or the current frame is the
     // only frame in the sequence in AOM_Q mode, just use the cq_level
     // as q.
@@ -1193,6 +1349,9 @@
     // Baseline value derived from cpi->active_worst_quality and kf boost.
     active_best_quality =
         get_kf_active_quality(rc, active_worst_quality, bit_depth);
+    if (cpi->is_screen_content_type) {
+      active_best_quality /= 2;
+    }
 
     if (is_stat_consumption_stage_twopass(cpi) &&
         cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
@@ -1216,9 +1375,9 @@
 
     // Tweak active_best_quality for AOM_Q mode when superres is on, as this
     // will be used directly as 'q' later.
-    if (oxcf->rc_mode == AOM_Q &&
-        (cpi->superres_mode == SUPERRES_QTHRESH ||
-         cpi->superres_mode == SUPERRES_AUTO) &&
+    if (oxcf->rc_cfg.mode == AOM_Q &&
+        (cpi->superres_mode == AOM_SUPERRES_QTHRESH ||
+         cpi->superres_mode == AOM_SUPERRES_AUTO) &&
         cm->superres_scale_denominator != SCALE_NUMERATOR) {
       active_best_quality =
           AOMMAX(active_best_quality -
@@ -1237,16 +1396,17 @@
                                                  int *active_best) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
   const int bit_depth = cpi->common.seq_params.bit_depth;
   int active_best_quality = *active_best;
   int active_worst_quality = *active_worst;
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if (cpi->oxcf.rc_mode != AOM_Q) {
+  if (cpi->oxcf.rc_cfg.mode != AOM_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
-         (cpi->refresh_golden_frame || is_intrl_arf_boost ||
-          cpi->refresh_alt_ref_frame))) {
+         (refresh_frame_flags->golden_frame || is_intrl_arf_boost ||
+          refresh_frame_flags->alt_ref_frame))) {
       active_best_quality -=
           (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
       active_worst_quality += (cpi->twopass.extend_maxq / 2);
@@ -1271,7 +1431,8 @@
   // Modify active_best_quality for downscaled normal frames.
   if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
     int qdelta = av1_compute_qdelta_by_rate(
-        rc, cm->current_frame.frame_type, active_best_quality, 2.0, bit_depth);
+        rc, cm->current_frame.frame_type, active_best_quality, 2.0,
+        cpi->is_screen_content_type, bit_depth);
     active_best_quality =
         AOMMAX(active_best_quality + qdelta, rc->best_quality);
   }
@@ -1285,6 +1446,21 @@
   *active_worst = active_worst_quality;
 }
 
+/*!\brief Gets a Q value to use  for the current frame
+ *
+ *
+ * Selects a Q value from a permitted range that we estimate
+ * will result in approximately the target number of bits.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   width                 Width of frame
+ * \param[in]   height                Height of frame
+ * \param[in]   active_worst_quality  Max Q allowed
+ * \param[in]   active_best_quality   Min Q allowed
+ *
+ * \return The suggested Q for this frame.
+ */
 static int get_q(const AV1_COMP *cpi, const int width, const int height,
                  const int active_worst_quality,
                  const int active_best_quality) {
@@ -1292,7 +1468,7 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   int q;
 
-  if (cpi->oxcf.rc_mode == AOM_Q ||
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q ||
       (frame_is_intra_only(cm) && !rc->this_key_frame_forced &&
        cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
        rc->frames_to_key > 1)) {
@@ -1333,15 +1509,17 @@
   const int bit_depth = cm->seq_params.bit_depth;
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
   const GF_GROUP *gf_group = &cpi->gf_group;
-  const int rc_mode = oxcf->rc_mode;
+  const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
   int *inter_minq;
   ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
   int active_best_quality = 0;
   const int is_intrl_arf_boost =
       gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
-  const int is_leaf_frame = !(cpi->refresh_golden_frame ||
-                              cpi->refresh_alt_ref_frame || is_intrl_arf_boost);
+  const int is_leaf_frame =
+      !(refresh_frame_flags->golden_frame ||
+        refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost);
   const int is_overlay_frame = rc->is_src_frame_alt_ref;
 
   if (is_leaf_frame || is_overlay_frame) {
@@ -1357,7 +1535,8 @@
   }
 
   // TODO(chengchen): can we remove this condition?
-  if (rc_mode == AOM_Q && !cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
+  if (rc_mode == AOM_Q && !refresh_frame_flags->alt_ref_frame &&
+      !refresh_frame_flags->golden_frame && !is_intrl_arf_boost) {
     return cq_level;
   }
 
@@ -1388,19 +1567,38 @@
   return active_best_quality;
 }
 
-static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
-                                         int height, int gf_index,
-                                         int *bottom_index, int *top_index) {
+/*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc.
+ *
+ * Handles the the general cases not covered by
+ * \ref rc_pick_q_and_bounds_no_stats_cbr() and
+ * \ref rc_pick_q_and_bounds_no_stats()
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[in]       gf_index     Index of this frame in the golden frame group
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+                                int gf_index, int *bottom_index,
+                                int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
   const GF_GROUP *gf_group = &cpi->gf_group;
+  assert(IMPLIES(has_no_stats_stage(cpi),
+                 cpi->oxcf.rc_cfg.mode == AOM_Q &&
+                     gf_group->update_type[gf_index] != ARF_UPDATE));
   const int cq_level =
       get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
                           cm->superres_scale_denominator);
   const int bit_depth = cm->seq_params.bit_depth;
 
-  if (oxcf->use_fixed_qp_offsets) {
+  if (oxcf->q_cfg.use_fixed_qp_offsets) {
     return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_group->index,
                                      cq_level, bit_depth);
   }
@@ -1413,24 +1611,30 @@
       gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
 
   if (frame_is_intra_only(cm)) {
-    const int is_fwd_kf =
-        cm->current_frame.frame_type == KEY_FRAME && cm->show_frame == 0;
-    get_intra_q_and_bounds_two_pass(cpi, width, height, &active_best_quality,
-                                    &active_worst_quality, cq_level, is_fwd_kf);
+    const int is_fwd_kf = cm->current_frame.frame_type == KEY_FRAME &&
+                          cm->show_frame == 0 && cpi->no_show_fwd_kf;
+    get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+                           &active_worst_quality, cq_level, is_fwd_kf);
 #ifdef STRICT_RC
     active_best_quality = 0;
 #endif
   } else {
-#ifdef STRICT_RC
     //  Active best quality limited by previous layer.
     const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
-    active_best_quality =
-        rc->active_best_quality[pyramid_level - 1] +
-        AOMMAX((rc->active_best_quality[pyramid_level - 1] / 10), 5);
+
+    if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS) ||
+        (oxcf->rc_cfg.mode == AOM_Q)) {
+      active_best_quality = get_active_best_quality(cpi, active_worst_quality,
+                                                    cq_level, gf_index);
+    } else {
+      active_best_quality = rc->active_best_quality[pyramid_level - 1] + 1;
+      active_best_quality = AOMMIN(active_best_quality, active_worst_quality);
+#ifdef STRICT_RC
+      active_best_quality += (active_worst_quality - active_best_quality) / 16;
 #else
-    active_best_quality =
-        get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+      active_best_quality += (active_worst_quality - active_best_quality) / 2;
 #endif
+    }
 
     // For alt_ref and GF frames (including internal arf frames) adjust the
     // worst allowed quality as well. This insures that even on hard
@@ -1438,8 +1642,8 @@
     // leaf (non arf) frames. This is important to the TPL model which assumes
     // Q drops with each arf level.
     if (!(rc->is_src_frame_alt_ref) &&
-        (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame ||
-         is_intrl_arf_boost)) {
+        (refresh_frame_flags->golden_frame ||
+         refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost)) {
       active_worst_quality =
           (active_best_quality + (3 * active_worst_quality) + 2) / 4;
     }
@@ -1455,11 +1659,7 @@
     active_worst_quality = q;
   }
 
-#ifdef STRICT_RC
-  *top_index = rc->worst_quality;
-#else
   *top_index = active_worst_quality;
-#endif
   *bottom_index = active_best_quality;
 
   assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
@@ -1474,26 +1674,27 @@
                              int height, int gf_index, int *bottom_index,
                              int *top_index) {
   int q;
-  // TODO(sarahparker) merge onepass vbr and altref q computation
-  // with two pass
+  // TODO(sarahparker) merge no-stats vbr and altref q computation
+  // with rc_pick_q_and_bounds().
   const GF_GROUP *gf_group = &cpi->gf_group;
-  if ((cpi->oxcf.rc_mode != AOM_Q ||
+  if ((cpi->oxcf.rc_cfg.mode != AOM_Q ||
        gf_group->update_type[gf_index] == ARF_UPDATE) &&
       has_no_stats_stage(cpi)) {
-    if (cpi->oxcf.rc_mode == AOM_CBR)
-      q = rc_pick_q_and_bounds_one_pass_cbr(cpi, width, height, bottom_index,
+    if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+      q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index,
                                             top_index);
 #if USE_UNRESTRICTED_Q_IN_CQ_MODE
-    else if (cpi->oxcf.rc_mode == AOM_CQ)
-      q = rc_pick_q_and_bounds_one_pass_cq(cpi, width, height, bottom_index,
+    } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+      q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index,
                                            top_index);
 #endif  // USE_UNRESTRICTED_Q_IN_CQ_MODE
-    else
-      q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
-                                            top_index);
+    } else {
+      q = rc_pick_q_and_bounds_no_stats(cpi, width, height, gf_index,
+                                        bottom_index, top_index);
+    }
   } else {
-    q = rc_pick_q_and_bounds_two_pass(cpi, width, height, gf_index,
-                                      bottom_index, top_index);
+    q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index,
+                             top_index);
   }
   if (gf_group->update_type[gf_index] == ARF_UPDATE) rc->arf_q = q;
 
@@ -1503,14 +1704,15 @@
 void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit) {
-  if (cpi->oxcf.rc_mode == AOM_Q) {
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
     *frame_under_shoot_limit = 0;
     *frame_over_shoot_limit = INT_MAX;
   } else {
     // For very small rate targets where the fractional adjustment
     // may be tiny make sure there is at least a minimum range.
-    const int tolerance =
-        AOMMAX(100, (cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+    assert(cpi->sf.hl_sf.recode_tolerance <= 100);
+    const int tolerance = (int)AOMMAX(
+        100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
     *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
     *frame_over_shoot_limit =
         AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
@@ -1524,9 +1726,11 @@
   rc->this_frame_target = target;
 
   // Modify frame size target when down-scaled.
-  if (av1_frame_scaled(cm))
+  if (av1_frame_scaled(cm) && cpi->oxcf.rc_cfg.mode != AOM_CBR) {
     rc->this_frame_target =
-        (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height));
+        (int)(rc->this_frame_target *
+              resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height));
+  }
 
   // Target rate per SB64 (including partial SB64s.
   rc->sb64_target_rate =
@@ -1537,27 +1741,14 @@
   // this frame refreshes means next frames don't unless specified by user
   RATE_CONTROL *const rc = &cpi->rc;
   rc->frames_since_golden = 0;
-
-  // Mark the alt ref as done (setting to 0 means no further alt refs pending).
-  rc->source_alt_ref_pending = 0;
-
-  // Set the alternate reference frame active flag
-  rc->source_alt_ref_active = 1;
 }
 
 static void update_golden_frame_stats(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
 
   // Update the Golden frame usage counts.
-  if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+  if (cpi->refresh_frame.golden_frame || rc->is_src_frame_alt_ref) {
     rc->frames_since_golden = 0;
-
-    // If we are not using alt ref in the up and coming group clear the arf
-    // active flag. In multi arf group case, if the index is not 0 then
-    // we are overlaying a mid group arf so should not reset the flag.
-    if (!rc->source_alt_ref_pending && (gf_group->index == 0))
-      rc->source_alt_ref_active = 0;
   } else if (cpi->common.show_frame) {
     rc->frames_since_golden++;
   }
@@ -1568,6 +1759,7 @@
   const CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
   const GF_GROUP *const gf_group = &cpi->gf_group;
+  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
 
   const int is_intrnl_arf =
       gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
@@ -1586,10 +1778,10 @@
     rc->avg_frame_qindex[KEY_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
   } else {
-    if ((cpi->use_svc && cpi->oxcf.rc_mode == AOM_CBR) ||
+    if ((cpi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
         (!rc->is_src_frame_alt_ref &&
-         !(cpi->refresh_golden_frame || is_intrnl_arf ||
-           cpi->refresh_alt_ref_frame))) {
+         !(refresh_frame_flags->golden_frame || is_intrnl_arf ||
+           refresh_frame_flags->alt_ref_frame))) {
       rc->last_q[INTER_FRAME] = qindex;
       rc->avg_frame_qindex[INTER_FRAME] =
           ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
@@ -1611,8 +1803,8 @@
   if ((qindex < rc->last_boosted_qindex) ||
       (current_frame->frame_type == KEY_FRAME) ||
       (!rc->constrained_gf_group &&
-       (cpi->refresh_alt_ref_frame || is_intrnl_arf ||
-        (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+       (refresh_frame_flags->alt_ref_frame || is_intrnl_arf ||
+        (refresh_frame_flags->golden_frame && !rc->is_src_frame_alt_ref)))) {
     rc->last_boosted_qindex = qindex;
   }
   if (current_frame->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
@@ -1623,18 +1815,14 @@
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
   if (av1_frame_scaled(cm))
-    rc->this_frame_target =
-        (int)(rc->this_frame_target /
-              resize_rate_factor(cpi, cm->width, cm->height));
+    rc->this_frame_target = (int)(rc->this_frame_target /
+                                  resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+                                                     cm->width, cm->height));
   if (current_frame->frame_type != KEY_FRAME) {
     rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
         rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
     rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
         rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
-    rc->long_rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
-        rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
-    rc->long_rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
-        rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
   }
 
   // Actual bits spent
@@ -1643,8 +1831,10 @@
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
-  if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
-      (current_frame->frame_type != KEY_FRAME))
+  if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames,
+                        cpi->oxcf.gf_cfg.enable_auto_arf) &&
+      refresh_frame_flags->alt_ref_frame &&
+      (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm)))
     // Update the alternate reference frame stats as appropriate.
     update_alt_ref_frame_stats(cpi);
   else
@@ -1655,8 +1845,8 @@
   // if (current_frame->frame_number == 1 && cm->show_frame)
   /*
   rc->this_frame_target =
-      (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width,
-  cm->height));
+      (int)(rc->this_frame_target / resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+  cm->width, cm->height));
       */
 }
 
@@ -1705,14 +1895,15 @@
 // If no such q index is found, returns 'worst_qindex'.
 static int find_qindex_by_rate(int desired_bits_per_mb,
                                aom_bit_depth_t bit_depth, FRAME_TYPE frame_type,
+                               const int is_screen_content_type,
                                int best_qindex, int worst_qindex) {
   assert(best_qindex <= worst_qindex);
   int low = best_qindex;
   int high = worst_qindex;
   while (low < high) {
     const int mid = (low + high) >> 1;
-    const int mid_bits_per_mb =
-        av1_rc_bits_per_mb(frame_type, mid, 1.0, bit_depth);
+    const int mid_bits_per_mb = av1_rc_bits_per_mb(
+        frame_type, mid, 1.0, bit_depth, is_screen_content_type);
     if (mid_bits_per_mb > desired_bits_per_mb) {
       low = mid + 1;
     } else {
@@ -1720,25 +1911,26 @@
     }
   }
   assert(low == high);
-  assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth) <=
-             desired_bits_per_mb ||
+  assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth,
+                            is_screen_content_type) <= desired_bits_per_mb ||
          low == worst_qindex);
   return low;
 }
 
 int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
                                int qindex, double rate_target_ratio,
+                               const int is_screen_content_type,
                                aom_bit_depth_t bit_depth) {
   // Look up the current projected bits per block for the base index
-  const int base_bits_per_mb =
-      av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
+  const int base_bits_per_mb = av1_rc_bits_per_mb(
+      frame_type, qindex, 1.0, bit_depth, is_screen_content_type);
 
   // Find the target bits per mb based on the base value and given ratio.
   const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
 
-  const int target_index =
-      find_qindex_by_rate(target_bits_per_mb, bit_depth, frame_type,
-                          rc->best_quality, rc->worst_quality);
+  const int target_index = find_qindex_by_rate(
+      target_bits_per_mb, bit_depth, frame_type, is_screen_content_type,
+      rc->best_quality, rc->worst_quality);
   return target_index - qindex;
 }
 
@@ -1747,17 +1939,17 @@
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
   // Special case code for 1 pass fixed Q mode tests
-  if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
+  if ((has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q)) {
     rc->max_gf_interval = FIXED_GF_INTERVAL;
     rc->min_gf_interval = FIXED_GF_INTERVAL;
     rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
   } else {
     // Set Maximum gf/arf interval
-    rc->max_gf_interval = oxcf->max_gf_interval;
-    rc->min_gf_interval = oxcf->min_gf_interval;
+    rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+    rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
     if (rc->min_gf_interval == 0)
       rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
-          oxcf->width, oxcf->height, cpi->framerate);
+          oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cpi->framerate);
     if (rc->max_gf_interval == 0)
       rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
           cpi->framerate, rc->min_gf_interval);
@@ -1785,9 +1977,10 @@
   int vbr_max_bits;
   const int MBs = av1_get_MBs(width, height);
 
-  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+  rc->avg_frame_bandwidth =
+      (int)(oxcf->rc_cfg.target_bandwidth / cpi->framerate);
   rc->min_frame_bandwidth =
-      (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+      (int)(rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100);
 
   rc->min_frame_bandwidth =
       AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
@@ -1800,7 +1993,7 @@
   // be acheived because of a user specificed max q (e.g. when the user
   // specifies lossless encode.
   vbr_max_bits =
-      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) /
             100);
   rc->max_frame_bandwidth =
       AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
@@ -1819,11 +2012,11 @@
           : 0;
   const int frame_window = AOMMIN(
       16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
-
+  assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100);
   if (frame_window > 0) {
-    const int max_delta =
-        AOMMIN(abs((int)(vbr_bits_off_target / frame_window)),
-               (*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+    const int max_delta = (int)AOMMIN(
+        abs((int)(vbr_bits_off_target / frame_window)),
+        ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
 
     // vbr_bits_off_target > 0 means we have extra bits to spend
     // vbr_bits_off_target < 0 we are currently overshooting
@@ -1850,7 +2043,7 @@
   int target_rate = rc->base_frame_target;
 
   // Correction to rate target based on prior over or under shoot.
-  if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ)
+  if (cpi->oxcf.rc_cfg.mode == AOM_VBR || cpi->oxcf.rc_cfg.mode == AOM_CQ)
     vbr_rate_correction(cpi, &target_rate);
   av1_rc_set_frame_target(cpi, target_rate, width, height);
 }
@@ -1888,14 +2081,15 @@
     const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
+  const RateControlCfg *rc_cfg = &oxcf->rc_cfg;
   const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
   const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
   int min_frame_target =
       AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
   int target;
 
-  if (oxcf->gf_cbr_boost_pct) {
-    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+  if (rc_cfg->gf_cbr_boost_pct) {
+    const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100;
     if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
       target =
           (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
@@ -1920,17 +2114,18 @@
   }
   if (diff > 0) {
     // Lower the target bandwidth for this frame.
-    const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    const int pct_low =
+        (int)AOMMIN(diff / one_pct_bits, rc_cfg->under_shoot_pct);
     target -= (target * pct_low) / 200;
   } else if (diff < 0) {
     // Increase the target bandwidth for this frame.
     const int pct_high =
-        (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+        (int)AOMMIN(-diff / one_pct_bits, rc_cfg->over_shoot_pct);
     target += (target * pct_high) / 200;
   }
-  if (oxcf->rc_max_inter_bitrate_pct) {
+  if (rc_cfg->max_inter_bitrate_pct) {
     const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+        rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100;
     target = AOMMIN(target, max_rate);
   }
   return AOMMAX(min_frame_target, target);
@@ -1956,27 +2151,41 @@
   return av1_rc_clamp_iframe_target_size(cpi, target);
 }
 
-static void set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
+/*!\brief Setup the reference prediction structure for 1 pass real-time
+ *
+ * Set the reference prediction structure for 1 layer.
+ * Current structue is to use 3 references (LAST, GOLDEN, ALTREF),
+ * where ALT_REF always behind current by lag_alt frames, and GOLDEN is
+ * either updated on LAST with period baseline_gf_interval (fixed slot)
+ * or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7).
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       gf_update    Flag to indicate if GF is updated
+ *
+ * \return Nothing is returned. Instead the settings for the prediction
+ * structure are set in \c cpi-ext_flags; and the buffer slot index
+ * (for each of 7 references) and refresh flags (for each of the 8 slots)
+ * are set in \c cpi->svc.ref_idx[] and \c cpi->svc.refresh[].
+ */
+void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
   AV1_COMMON *const cm = &cpi->common;
   ExternalFlags *const ext_flags = &cpi->ext_flags;
+  ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &ext_flags->refresh_frame;
   SVC *const svc = &cpi->svc;
-  // Specify the reference prediction structure, for 1 layer nonrd mode.
-  // Current structue is to use 3 references (LAST, GOLDEN, ALTREF),
-  // where ALT_REF always behind current by lag_alt frames, and GOLDEN is
-  // either updated on LAST with period baseline_gf_interval (fixed slot)
-  // or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7).
   const int gld_fixed_slot = 1;
   const unsigned int lag_alt = 4;
   int last_idx = 0;
   int last_idx_refresh = 0;
   int gld_idx = 0;
   int alt_ref_idx = 0;
-  ext_flags->refresh_frame_flags_pending = 1;
+  ext_refresh_frame_flags->update_pending = 1;
   svc->external_ref_frame_config = 1;
   ext_flags->ref_frame_flags = 0;
-  ext_flags->refresh_last_frame = 1;
-  ext_flags->refresh_golden_frame = 0;
-  ext_flags->refresh_alt_ref_frame = 0;
+  ext_refresh_frame_flags->last_frame = 1;
+  ext_refresh_frame_flags->golden_frame = 0;
+  ext_refresh_frame_flags->alt_ref_frame = 0;
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) svc->ref_idx[i] = 7;
   for (int i = 0; i < REF_FRAMES; ++i) svc->refresh[i] = 0;
   // Always reference LAST, GOLDEN, ALTREF
@@ -2008,57 +2217,151 @@
   svc->refresh[last_idx_refresh] = 1;
   // Update GOLDEN on period for fixed slot case.
   if (gld_fixed_slot && gf_update) {
-    ext_flags->refresh_golden_frame = 1;
+    ext_refresh_frame_flags->golden_frame = 1;
     svc->refresh[gld_idx] = 1;
   }
 }
 
+/*!\brief Check for scene detection, for 1 pass real-time mode.
+ *
+ * Compute average source sad (temporal sad: between current source and
+ * previous source) over a subset of superblocks. Use this is detect big changes
+ * in content and set the \c cpi->rc.high_source_sad flag.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \return Nothing is returned. Instead the flag \c cpi->rc.high_source_sad
+ * is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated.
+ */
+static void rc_scene_detection_onepass_rt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  YV12_BUFFER_CONFIG const *unscaled_src = cpi->unscaled_source;
+  YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source;
+  uint8_t *src_y;
+  int src_ystride;
+  int src_width;
+  int src_height;
+  uint8_t *last_src_y;
+  int last_src_ystride;
+  int last_src_width;
+  int last_src_height;
+  if (cpi->unscaled_source == NULL || cpi->unscaled_last_source == NULL) return;
+  src_y = unscaled_src->y_buffer;
+  src_ystride = unscaled_src->y_stride;
+  src_width = unscaled_src->y_width;
+  src_height = unscaled_src->y_height;
+  last_src_y = unscaled_last_src->y_buffer;
+  last_src_ystride = unscaled_last_src->y_stride;
+  last_src_width = unscaled_last_src->y_width;
+  last_src_height = unscaled_last_src->y_height;
+  rc->high_source_sad = 0;
+  rc->prev_avg_source_sad = rc->avg_source_sad;
+  if (src_width == last_src_width && src_height == last_src_height) {
+    const int num_mi_cols = cm->mi_params.mi_cols;
+    const int num_mi_rows = cm->mi_params.mi_rows;
+    int num_zero_temp_sad = 0;
+    uint32_t min_thresh = 10000;
+    if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) min_thresh = 100000;
+    const BLOCK_SIZE bsize = BLOCK_64X64;
+    int full_sampling = (cm->width * cm->height < 640 * 360) ? 1 : 0;
+    // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
+    uint64_t avg_sad = 0;
+    uint64_t tmp_sad = 0;
+    int num_samples = 0;
+    const int thresh = 6;
+    // SAD is computed on 64x64 blocks
+    const int sb_size_by_mb = (cm->seq_params.sb_size == BLOCK_128X128)
+                                  ? (cm->seq_params.mib_size >> 1)
+                                  : cm->seq_params.mib_size;
+    const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+    const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+    uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
+    int num_low_var_high_sumdiff = 0;
+    int light_change = 0;
+    // Flag to check light change or not.
+    const int check_light_change = 0;
+    for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
+      for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+        // Checker-board pattern, ignore boundary.
+        if (full_sampling ||
+            ((sbi_row > 0 && sbi_col > 0) &&
+             (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
+             ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
+              (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) {
+          tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+                                           last_src_ystride);
+          if (check_light_change) {
+            unsigned int sse, variance;
+            variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+                                             last_src_ystride, &sse);
+            // Note: sse - variance = ((sum * sum) >> 12)
+            // Detect large lighting change.
+            if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
+              num_low_var_high_sumdiff++;
+            }
+          }
+          avg_sad += tmp_sad;
+          num_samples++;
+          if (tmp_sad == 0) num_zero_temp_sad++;
+        }
+        src_y += 64;
+        last_src_y += 64;
+      }
+      src_y += (src_ystride << 6) - (sb_cols << 6);
+      last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+    }
+    if (check_light_change && num_samples > 0 &&
+        num_low_var_high_sumdiff > (num_samples >> 1))
+      light_change = 1;
+    if (num_samples > 0) avg_sad = avg_sad / num_samples;
+    // Set high_source_sad flag if we detect very high increase in avg_sad
+    // between current and previous frame value(s). Use minimum threshold
+    // for cases where there is small change from content that is completely
+    // static.
+    if (!light_change &&
+        avg_sad >
+            AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
+        rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+        num_zero_temp_sad < 3 * (num_samples >> 2))
+      rc->high_source_sad = 1;
+    else
+      rc->high_source_sad = 0;
+    rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
+  }
+}
+
 #define DEFAULT_KF_BOOST_RT 2300
 #define DEFAULT_GF_BOOST_RT 2000
 
-void av1_get_one_pass_rt_params(AV1_COMP *cpi,
-                                EncodeFrameParams *const frame_params,
-                                unsigned int frame_flags) {
+/*!\brief Set the GF baseline interval for 1 pass real-time mode.
+ *
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       frame_type   frame type
+ *
+ * \return Return GF update flag, and update the \c cpi->rc with
+ * the next GF interval settings.
+ */
+static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
+                                             FRAME_TYPE frame_type) {
   RATE_CONTROL *const rc = &cpi->rc;
-  AV1_COMMON *const cm = &cpi->common;
   GF_GROUP *const gf_group = &cpi->gf_group;
   ResizePendingParams *const resize_pending_params =
       &cpi->resize_pending_params;
   int gf_update = 0;
-  int target;
   const int resize_pending =
       (resize_pending_params->width && resize_pending_params->height &&
-       (cm->width != resize_pending_params->width ||
-        cm->height != resize_pending_params->height));
-  // Turn this on to explicitly set the reference structure rather than
-  // relying on internal/default structure.
-  const int set_reference_structure = 1;
-  if (cpi->use_svc) {
-    av1_update_temporal_layer_framerate(cpi);
-    av1_restore_layer_context(cpi);
-  }
-  if ((!cpi->use_svc && rc->frames_to_key == 0) ||
-      (cpi->use_svc && cpi->svc.spatial_layer_id == 0 &&
-       cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) ||
-      (frame_flags & FRAMEFLAGS_KEY)) {
-    frame_params->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
-        cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
-    rc->frames_to_key = cpi->oxcf.key_freq;
-    rc->kf_boost = DEFAULT_KF_BOOST_RT;
-    rc->source_alt_ref_active = 0;
-    gf_group->update_type[gf_group->index] = KF_UPDATE;
-    if (cpi->use_svc && cm->current_frame.frame_number > 0)
-      av1_svc_reset_temporal_layers(cpi, 1);
-  } else {
-    frame_params->frame_type = INTER_FRAME;
-    gf_group->update_type[gf_group->index] = LF_UPDATE;
-  }
+       (cpi->common.width != resize_pending_params->width ||
+        cpi->common.height != resize_pending_params->height));
   // GF update based on frames_till_gf_update_due, also
-  // force upddate on resize pending frame.
-  if ((resize_pending || rc->frames_till_gf_update_due == 0) &&
+  // force upddate on resize pending frame or for scene change.
+  if ((resize_pending || rc->high_source_sad ||
+       rc->frames_till_gf_update_due == 0) &&
       cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
       av1_cyclic_refresh_set_golden_update(cpi);
     else
       rc->baseline_gf_interval = MAX_GF_INTERVAL;
@@ -2069,7 +2372,7 @@
         (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     gf_group->index = 0;
-    // SVC does not use GF as periodid boost.
+    // SVC does not use GF as periodic boost.
     // TODO(marpan): Find better way to disable this for SVC.
     if (cpi->use_svc) {
       SVC *const svc = &cpi->svc;
@@ -2090,10 +2393,240 @@
     }
     gf_group->size = rc->baseline_gf_interval;
     gf_group->update_type[0] =
-        (frame_params->frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+        (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
     gf_update = 1;
   }
-  if (cpi->oxcf.rc_mode == AOM_CBR) {
+  return gf_update;
+}
+
+static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
+                            int prev_width, int prev_height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
+  double tot_scale_change = 1.0;
+  int target_bits_per_frame;
+  int active_worst_quality;
+  int qindex;
+  tot_scale_change = (double)(resize_width * resize_height) /
+                     (double)(prev_width * prev_height);
+  // Reset buffer level to optimal, update target size.
+  rc->buffer_level = rc->optimal_buffer_level;
+  rc->bits_off_target = rc->optimal_buffer_level;
+  rc->this_frame_target =
+      av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME);
+  target_bits_per_frame = rc->this_frame_target;
+  if (tot_scale_change > 4.0)
+    rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+  else if (tot_scale_change > 1.0)
+    rc->avg_frame_qindex[INTER_FRAME] =
+        (rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
+  active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+  qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+                             active_worst_quality, resize_width, resize_height);
+  // If resize is down, check if projected q index is close to worst_quality,
+  // and if so, reduce the rate correction factor (since likely can afford
+  // lower q for resized frame).
+  if (tot_scale_change < 1.0 && qindex > 90 * cpi->rc.worst_quality / 100)
+    rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+  // Apply the same rate control reset to all temporal layers.
+  for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
+    LAYER_CONTEXT *lc = NULL;
+    lc = &svc->layer_context[svc->spatial_layer_id *
+                                 svc->number_temporal_layers +
+                             tl];
+    lc->rc.resize_state = rc->resize_state;
+    lc->rc.buffer_level = lc->rc.optimal_buffer_level;
+    lc->rc.bits_off_target = lc->rc.optimal_buffer_level;
+    lc->rc.rate_correction_factors[INTER_FRAME] =
+        rc->rate_correction_factors[INTER_FRAME];
+  }
+  // If resize is back up: check if projected q index is too much above the
+  // previous index, and if so, reduce the rate correction factor
+  // (since prefer to keep q for resized frame at least closet to previous q).
+  // Also check if projected qindex is close to previous qindex, if so
+  // increase correction factor (to push qindex higher and avoid overshoot).
+  if (tot_scale_change >= 1.0) {
+    if (tot_scale_change < 4.0 && qindex > 130 * rc->last_q[INTER_FRAME] / 100)
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
+    if (qindex <= 120 * rc->last_q[INTER_FRAME] / 100)
+      rc->rate_correction_factors[INTER_NORMAL] *= 2.0;
+  }
+}
+
+/*!\brief ChecK for resize based on Q, for 1 pass real-time mode.
+ *
+ * Check if we should resize, based on average QP from past x frames.
+ * Only allow for resize at most 1/2 scale down for now, Scaling factor
+ * for each step may be 3/4 or 1/2.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \return Return resized width/height in \c cpi->resize_pending_params,
+ * and update some resize counters in \c rc.
+ */
+static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  RESIZE_ACTION resize_action = NO_RESIZE;
+  const int avg_qp_thr1 = 70;
+  const int avg_qp_thr2 = 50;
+  // Don't allow for resized frame to go below 160x90, resize in steps of 3/4.
+  const int min_width = (160 * 4) / 3;
+  const int min_height = (90 * 4) / 3;
+  int down_size_on = 1;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    rc->resize_avg_qp = 0;
+    rc->resize_count = 0;
+    rc->resize_buffer_underflow = 0;
+    return;
+  }
+  // No resizing down if frame size is below some limit.
+  if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0;
+
+  // Resize based on average buffer underflow and QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > cpi->framerate) {
+    const int window = AOMMIN(30, (int)(2 * cpi->framerate));
+    rc->resize_avg_qp += rc->last_q[INTER_FRAME];
+    if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
+      ++rc->resize_buffer_underflow;
+    ++rc->resize_count;
+    // Check for resize action every "window" frames.
+    if (rc->resize_count >= window) {
+      int avg_qp = rc->resize_avg_qp / rc->resize_count;
+      // Resize down if buffer level has underflowed sufficient amount in past
+      // window, and we are at original or 3/4 of original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state, i.e. 1/2 or 3/4 of original resolution.
+      // Currently, use a flag to turn 3/4 resizing feature on/off.
+      if (rc->resize_buffer_underflow > (rc->resize_count >> 2) &&
+          down_size_on) {
+        if (rc->resize_state == THREE_QUARTER) {
+          resize_action = DOWN_ONEHALF;
+          rc->resize_state = ONE_HALF;
+        } else if (rc->resize_state == ORIG) {
+          resize_action = DOWN_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      } else if (rc->resize_state != ORIG &&
+                 avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+        if (rc->resize_state == THREE_QUARTER ||
+            avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100) {
+          resize_action = UP_ORIG;
+          rc->resize_state = ORIG;
+        } else if (rc->resize_state == ONE_HALF) {
+          resize_action = UP_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      }
+      // Reset for next window measurement.
+      rc->resize_avg_qp = 0;
+      rc->resize_count = 0;
+      rc->resize_buffer_underflow = 0;
+    }
+  }
+  // If decision is to resize, reset some quantities, and check is we should
+  // reduce rate correction factor,
+  if (resize_action != NO_RESIZE) {
+    int resize_width = cpi->oxcf.frm_dim_cfg.width;
+    int resize_height = cpi->oxcf.frm_dim_cfg.height;
+    int resize_scale_num = 1;
+    int resize_scale_den = 1;
+    if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+      resize_scale_num = 3;
+      resize_scale_den = 4;
+    } else if (resize_action == DOWN_ONEHALF) {
+      resize_scale_num = 1;
+      resize_scale_den = 2;
+    }
+    resize_width = resize_width * resize_scale_num / resize_scale_den;
+    resize_height = resize_height * resize_scale_num / resize_scale_den;
+    resize_reset_rc(cpi, resize_width, resize_height, cm->width, cm->height);
+  }
+  return;
+}
+
+void av1_get_one_pass_rt_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  SVC *const svc = &cpi->svc;
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  int target;
+  const int layer =
+      LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+                       svc->number_temporal_layers);
+  // Turn this on to explicitly set the reference structure rather than
+  // relying on internal/default structure.
+  if (cpi->use_svc) {
+    av1_update_temporal_layer_framerate(cpi);
+    av1_restore_layer_context(cpi);
+  }
+  // Set frame type.
+  if ((!cpi->use_svc && rc->frames_to_key == 0) ||
+      (cpi->use_svc && svc->spatial_layer_id == 0 &&
+       svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0) ||
+      (frame_flags & FRAMEFLAGS_KEY)) {
+    frame_params->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced =
+        cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max;
+    rc->kf_boost = DEFAULT_KF_BOOST_RT;
+    gf_group->update_type[gf_group->index] = KF_UPDATE;
+    gf_group->frame_type[gf_group->index] = KEY_FRAME;
+    gf_group->refbuf_state[gf_group->index] = REFBUF_RESET;
+    if (cpi->use_svc) {
+      if (cm->current_frame.frame_number > 0)
+        av1_svc_reset_temporal_layers(cpi, 1);
+      svc->layer_context[layer].is_key_frame = 1;
+    }
+  } else {
+    frame_params->frame_type = INTER_FRAME;
+    gf_group->update_type[gf_group->index] = LF_UPDATE;
+    gf_group->frame_type[gf_group->index] = INTER_FRAME;
+    gf_group->refbuf_state[gf_group->index] = REFBUF_UPDATE;
+    if (cpi->use_svc) {
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      lc->is_key_frame =
+          svc->spatial_layer_id == 0
+              ? 0
+              : svc->layer_context[svc->temporal_layer_id].is_key_frame;
+    }
+  }
+  // Check for scene change, for non-SVC for now.
+  if (!cpi->use_svc && cpi->sf.rt_sf.check_scene_detection)
+    rc_scene_detection_onepass_rt(cpi);
+  // Check for dynamic resize, for single spatial layer for now.
+  // For temporal layers only check on base temporal layer.
+  if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) {
+    if (svc->number_spatial_layers == 1 && svc->temporal_layer_id == 0)
+      dynamic_resize_one_pass_cbr(cpi);
+    if (rc->resize_state == THREE_QUARTER) {
+      resize_pending_params->width = (3 + cpi->oxcf.frm_dim_cfg.width * 3) >> 2;
+      resize_pending_params->height =
+          (3 + cpi->oxcf.frm_dim_cfg.height * 3) >> 2;
+    } else if (rc->resize_state == ONE_HALF) {
+      resize_pending_params->width = (1 + cpi->oxcf.frm_dim_cfg.width) >> 1;
+      resize_pending_params->height = (1 + cpi->oxcf.frm_dim_cfg.height) >> 1;
+    } else {
+      resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width;
+      resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height;
+    }
+  } else if (resize_pending_params->width && resize_pending_params->height &&
+             (cpi->common.width != resize_pending_params->width ||
+              cpi->common.height != resize_pending_params->height)) {
+    resize_reset_rc(cpi, resize_pending_params->width,
+                    resize_pending_params->height, cm->width, cm->height);
+  }
+  // Set the GF interval and update flag.
+  set_gf_interval_update_onepass_rt(cpi, frame_params->frame_type);
+  // Set target size.
+  if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
     if (frame_params->frame_type == KEY_FRAME) {
       target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
     } else {
@@ -2110,8 +2643,56 @@
   }
   av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
   rc->base_frame_target = target;
-  if (set_reference_structure && cpi->oxcf.speed >= 6 &&
-      cm->number_spatial_layers == 1 && cm->number_temporal_layers == 1)
-    set_reference_structure_one_pass_rt(cpi, gf_update);
   cm->current_frame.frame_type = frame_params->frame_type;
 }
+
+int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  int thresh_qp = 7 * (rc->worst_quality >> 3);
+  // Lower thresh_qp for video (more overshoot at lower Q) to be
+  // more conservative for video.
+  if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN)
+    thresh_qp = 3 * (rc->worst_quality >> 2);
+  if (sf->rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
+      cm->quant_params.base_qindex < thresh_qp) {
+    double rate_correction_factor =
+        cpi->rc.rate_correction_factors[INTER_NORMAL];
+    const int target_size = cpi->rc.avg_frame_bandwidth;
+    double new_correction_factor;
+    int target_bits_per_mb;
+    double q2;
+    int enumerator;
+    *q = (3 * cpi->rc.worst_quality + *q) >> 2;
+    // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+    // these parameters will affect QP selection for subsequent frames. If they
+    // have settled down to a very different (low QP) state, then not adjusting
+    // them may cause next frame to select low QP and overshoot again.
+    cpi->rc.avg_frame_qindex[INTER_FRAME] = *q;
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    // Reset rate under/over-shoot flags.
+    cpi->rc.rc_1_frame = 0;
+    cpi->rc.rc_2_frame = 0;
+    // Adjust rate correction factor.
+    target_bits_per_mb =
+        (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs);
+    // Rate correction factor based on target_bits_per_mb and qp (==max_QP).
+    // This comes from the inverse computation of vp9_rc_bits_per_mb().
+    q2 = av1_convert_qindex_to_q(*q, cm->seq_params.bit_depth);
+    enumerator = 1800000;  // Factor for inter frame.
+    enumerator += (int)(enumerator * q2) >> 12;
+    new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
+    if (new_correction_factor > rate_correction_factor) {
+      rate_correction_factor =
+          AOMMIN(2.0 * rate_correction_factor, new_correction_factor);
+      if (rate_correction_factor > MAX_BPB_FACTOR)
+        rate_correction_factor = MAX_BPB_FACTOR;
+      cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+    }
+    return 1;
+  } else {
+    return 0;
+  }
+}
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index c463786..aefb5b4 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -24,6 +24,8 @@
 extern "C" {
 #endif
 
+/*!\cond */
+
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS 9
 
@@ -39,14 +41,9 @@
 // The maximum duration of a GF group that is static (e.g. a slide show).
 #define MAX_STATIC_GF_GROUP_LENGTH 250
 
-// Minimum and maximum height for the new pyramid structure.
-// (Old structure supports height = 1, but does NOT support height = 4).
-#define MIN_PYRAMID_LVL 0
-#define MAX_PYRAMID_LVL 4
-
 #define MIN_GF_INTERVAL 4
 #define MAX_GF_INTERVAL 32
-#define FIXED_GF_INTERVAL 8  // Used in some testing modes only
+#define FIXED_GF_INTERVAL 16
 #define MAX_GF_LENGTH_LAP 16
 
 #define MAX_NUM_GF_INTERVALS 15
@@ -79,50 +76,161 @@
   FRAME_UPDATE_TYPES
 } UENUM1BYTE(FRAME_UPDATE_TYPE);
 
+enum {
+  REFBUF_RESET,   // Clear reference frame buffer
+  REFBUF_UPDATE,  // Refresh reference frame buffer
+  REFBUF_STATES
+} UENUM1BYTE(REFBUF_STATE);
+
+typedef enum {
+  NO_RESIZE = 0,
+  DOWN_THREEFOUR = 1,  // From orig to 3/4.
+  DOWN_ONEHALF = 2,    // From orig or 3/4 to 1/2.
+  UP_THREEFOUR = -1,   // From 1/2 to 3/4.
+  UP_ORIG = -2,        // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE;
+
+#define MAX_FIRSTPASS_ANALYSIS_FRAMES 150
+typedef enum region_types {
+  STABLE_REGION = 0,
+  HIGH_VAR_REGION = 1,
+  SCENECUT_REGION = 2,
+  BLENDING_REGION = 3,
+} REGION_TYPES;
+
+typedef struct regions {
+  int start;
+  int last;
+  double avg_noise_var;
+  double avg_cor_coeff;
+  double avg_sr_fr_ratio;
+  double avg_intra_err;
+  double avg_coded_err;
+  REGION_TYPES type;
+} REGIONS;
+
+/*!\endcond */
+/*!
+ * \brief  Rate Control parameters and status
+ */
 typedef struct {
   // Rate targetting variables
-  int base_frame_target;  // A baseline frame target before adjustment
-                          // for previous under or over shoot.
+
+  /*!
+   * Baseline target rate for frame before adjustment for previous under or
+   * over shoot.
+   */
+  int base_frame_target;
+  /*!
+   * Target rate for frame after adjustment for previous under or over shoot.
+   */
   int this_frame_target;  // Actual frame target after rc adjustment.
 
-  // gop bit budget
+  /*!
+   * Target bit budget for the current GF / ARF group of frame.
+   */
   int64_t gf_group_bits;
 
+  /*!
+   * Projected size for current frame
+   */
   int projected_frame_size;
-  int sb64_target_rate;
-  int last_q[FRAME_TYPES];  // Separate values for Intra/Inter
-  int last_boosted_qindex;  // Last boosted GF/KF/ARF q
-  int last_kf_qindex;       // Q index of the last key frame coded.
 
+  /*!
+   * Bit size of transform coefficient for current frame.
+   */
+  int coefficient_size;
+
+  /*!
+   * Super block rate target used with some adaptive quantization strategies.
+   */
+  int sb64_target_rate;
+
+  /*!
+   * Q used on last encoded frame of the given type.
+   */
+  int last_q[FRAME_TYPES];
+
+  /*!
+   * Q used for last boosted (non leaf) frame (GF/KF/ARF)
+   */
+  int last_boosted_qindex;
+
+  /*!
+   * Q used for last boosted (non leaf) frame
+   */
+  int last_kf_qindex;
+
+  /*!
+   * Boost factor used to calculate the extra bits allocated to ARFs and GFs
+   */
   int gfu_boost;
+  /*!
+   * Boost factor used to calculate the extra bits allocated to the key frame
+   */
   int kf_boost;
 
+  /*!
+   * Correction factors used to adjust the q estimate for a given target rate
+   * in the encode loop.
+   */
   double rate_correction_factors[RATE_FACTOR_LEVELS];
 
+  /*!
+   * Number of frames since the last ARF / GF.
+   */
   int frames_since_golden;
+
+  /*!
+   * Number of frames till the next ARF / GF is due.
+   */
   int frames_till_gf_update_due;
 
-  // number of determined gf group length left
+  /*!
+   * Number of determined gf groups left
+   */
   int intervals_till_gf_calculate_due;
-  // stores gf group length intervals
+
+  /*!
+   * Stores the determined gf group lengths for a set of gf groups
+   */
   int gf_intervals[MAX_NUM_GF_INTERVALS];
-  // the current index in gf_intervals
+
+  /*!
+   * The current group's index into gf_intervals[]
+   */
   int cur_gf_index;
 
+  /*!\cond */
+  int num_regions;
+  REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+  double cor_coeff[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+  int regions_offset;  // offset of regions from the last keyframe
+  int frames_till_regions_update;
+
   int min_gf_interval;
   int max_gf_interval;
   int static_scene_max_gf_interval;
   int baseline_gf_interval;
   int constrained_gf_group;
+  /*!\endcond */
+  /*!
+   * Frames before the next key frame
+   */
   int frames_to_key;
+  /*!\cond */
   int frames_since_key;
   int this_key_frame_forced;
   int next_key_frame_forced;
-  int source_alt_ref_pending;
-  int source_alt_ref_active;
   int is_src_frame_alt_ref;
   int sframe_due;
 
+  int high_source_sad;
+  uint64_t avg_source_sad;
+  uint64_t prev_avg_source_sad;
+
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame
   int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
@@ -146,21 +254,35 @@
   int rolling_target_bits;
   int rolling_actual_bits;
 
-  int long_rolling_target_bits;
-  int long_rolling_actual_bits;
-
   int rate_error_estimate;
 
   int64_t total_actual_bits;
   int64_t total_target_bits;
   int64_t total_target_vs_actual;
 
+  /*!\endcond */
+  /*!
+   * User specified maximum Q allowed for current frame
+   */
   int worst_quality;
+  /*!
+   * User specified minimum Q allowed for current frame
+   */
   int best_quality;
 
+  /*!
+   * Initial buffuer level in ms for CBR / low delay encoding
+   */
   int64_t starting_buffer_level;
+  /*!
+   * Optimum / target buffuer level in ms for CBR / low delay encoding
+   */
   int64_t optimal_buffer_level;
+  /*!
+   * Maximum target buffuer level in ms for CBR / low delay encoding
+   */
   int64_t maximum_buffer_size;
+  /*!\cond */
 
   // rate control history for last frame(1) and the frame before(2).
   // -1: undershot
@@ -172,10 +294,22 @@
   int q_2_frame;
 
   float_t arf_boost_factor;
-  // Q index used for ALT frame
+
+  /*!\endcond */
+  /*!
+   * Q index used for ALT frame
+   */
   int arf_q;
+  /*!
+   * Proposed maximum alloed Q for current frame
+   */
   int active_worst_quality;
+  /*!
+   * Proposed minimum allowed Q different layers in a coding pyramid
+   */
   int active_best_quality[MAX_ARF_LAYERS + 1];
+
+  /*!\cond */
   int base_layer_qp;
 
   // Total number of stats used only for kf_boost calculation.
@@ -186,8 +320,20 @@
   int num_stats_required_for_gfu_boost;
   int next_is_fwd_key;
   int enable_scenecut_detection;
+  int use_arf_in_this_kf_group;
+  // Track amount of low motion in scene
+  int avg_frame_low_motion;
+
+  // For dynamic resize, 1 pass cbr.
+  RESIZE_STATE resize_state;
+  int resize_avg_qp;
+  int resize_buffer_underflow;
+  int resize_count;
+  /*!\endcond */
 } RATE_CONTROL;
 
+/*!\cond */
+
 struct AV1_COMP;
 struct AV1EncoderConfig;
 
@@ -195,7 +341,8 @@
                  RATE_CONTROL *rc);
 
 int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
-                           double correction_factor, aom_bit_depth_t bit_depth);
+                           double correction_factor, aom_bit_depth_t bit_depth,
+                           const int is_screen_content_type);
 
 double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
 
@@ -236,10 +383,22 @@
 // Post encode update of the rate control parameters for dropped frames
 void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
 
-// Updates rate correction factors
-// Changes only the rate correction factors in the rate control structure.
+/*!\endcond */
+/*!\brief Updates the rate correction factor linking Q to output bits
+ *
+ * This function updates the Q rate correction factor after an encode
+ * cycle depending on whether we overshot or undershot the target rate.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return None but updates the relevant rate correction factor in cpi->rc
+ */
 void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int width,
                                            int height);
+/*!\cond */
 
 // Decide if we should drop this frame: For 1-pass CBR.
 // Changes only the decimation count in the rate control structure
@@ -251,19 +410,46 @@
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit);
 
-// Picks q and q bounds given the target for bits
+/*!\endcond */
+
+/*!\brief Picks q and q bounds given the rate control parameters in \c cpi->rc.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in,out]   rc           Top level rate control structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[in]       gf_index     Index of this frame in the golden frame group
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ * Also, updates \c rc->arf_q.
+ */
 int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, RATE_CONTROL *rc,
                              int width, int height, int gf_index,
                              int *bottom_index, int *top_index);
 
-// Estimates q to achieve a target bits per frame
+/*!\brief Estimates q to achieve a target bits per frame
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   target_bits_per_frame Frame rate target
+ * \param[in]   active_worst_quality  Max Q allowed
+ * \param[in]   active_best_quality   Min Q allowed
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return Returns a q index value
+ */
 int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
                       int active_best_quality, int active_worst_quality,
                       int width, int height);
 
+/*!\cond */
 // Estimates bits per mb for a given qindex and correction factor.
 int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                       double correction_factor, aom_bit_depth_t bit_depth);
+                       double correction_factor, aom_bit_depth_t bit_depth,
+                       const int is_screen_content_type);
 
 // Clamping utilities for bitrate targets for iframes and pframes.
 int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
@@ -287,6 +473,7 @@
 // to a value that should equate to the given rate ratio.
 int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
                                int qindex, double rate_target_ratio,
+                               const int is_screen_content_type,
                                aom_bit_depth_t bit_depth);
 
 int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
@@ -303,20 +490,98 @@
 void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
                              int height);
 
+void av1_set_reference_structure_one_pass_rt(struct AV1_COMP *cpi,
+                                             int gf_update);
+
+/*!\endcond */
+/*!\brief Calculates how many bits to use for a P frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi                 Top level encoder structure
+ * \param[in]       frame_update_type   Type of frame
+ *
+ * \return	Returns the target number of bits for this frame.
+ */
 int av1_calc_pframe_target_size_one_pass_vbr(
     const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
 
+/*!\brief Calculates how many bits to use for an i frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return	Returns the target number of bits for this frame.
+ */
 int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
 
+/*!\brief Calculates how many bits to use for a P frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi                 Top level encoder structure
+ * \param[in]       frame_update_type   Type of frame
+ *
+ * \return  Returns the target number of bits for this frame.
+ */
 int av1_calc_pframe_target_size_one_pass_cbr(
     const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
 
+/*!\brief Calculates how many bits to use for an i frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Returns the target number of bits for this frame.
+ */
 int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
 
+/*!\brief Setup the rate control parameters for 1 pass real-time mode.
+ *
+ * - Sets the frame type and target frame size.
+ * - Sets the GF update.
+ * - Checks for scene change.
+ * - Sets the reference prediction structure for 1 layers (non-SVC).
+ * - Resets and updates are done for SVC.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       frame_params Encoder frame parameters
+ * \param[in]       frame_flags  Emcoder frame flags
+ *
+ * \return Nothing is returned. Instead the settings computed in this
+ * funtion are set in: \c frame_params, \c cpi->common, \c cpi->rc, \c cpi->svc.
+ */
 void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
                                 struct EncodeFrameParams *const frame_params,
                                 unsigned int frame_flags);
 
+/*!\brief Increase q on expected encoder overshoot, for CBR mode.
+ *
+ *  Handles the case when encoder is expected to create a large frame:
+ *  - q is increased to value closer to \c cpi->rc.worst_quality
+ *  - avg_frame_qindex is reset
+ *  - buffer levels are reset
+ *  - rate correction factor is adjusted
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]        q           Current q index
+ *
+ * \return q is returned, and updates are done to \c cpi->rc.
+ */
+int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/rc_utils.h b/av1/encoder/rc_utils.h
new file mode 100644
index 0000000..98cec2e
--- /dev/null
+++ b/av1/encoder/rc_utils.h
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RC_UTILS_H_
+#define AOM_AV1_ENCODER_RC_UTILS_H_
+
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/psnr.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static AOM_INLINE void set_rc_buffer_sizes(RATE_CONTROL *rc,
+                                           const RateControlCfg *rc_cfg) {
+  const int64_t bandwidth = rc_cfg->target_bandwidth;
+  const int64_t starting = rc_cfg->starting_buffer_level_ms;
+  const int64_t optimal = rc_cfg->optimal_buffer_level_ms;
+  const int64_t maximum = rc_cfg->maximum_buffer_size_ms;
+
+  rc->starting_buffer_level = starting * bandwidth / 1000;
+  rc->optimal_buffer_level =
+      (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
+  rc->maximum_buffer_size =
+      (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
+}
+
+static AOM_INLINE void config_target_level(AV1_COMP *const cpi,
+                                           AV1_LEVEL target_level, int tier) {
+  aom_clear_system_state();
+
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  SequenceHeader *const seq_params = &cpi->common.seq_params;
+  TileConfig *const tile_cfg = &oxcf->tile_cfg;
+  RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  // Adjust target bitrate to be no larger than 70% of level limit.
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const double level_bitrate_limit =
+      av1_get_max_bitrate_for_level(target_level, tier, profile);
+  const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
+  rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate);
+  // Also need to update cpi->twopass.bits_left.
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
+  if (stats != NULL)
+    cpi->twopass.bits_left =
+        (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0);
+
+  // Adjust max over-shoot percentage.
+  rc_cfg->over_shoot_pct = 0;
+
+  // Adjust max quantizer.
+  rc_cfg->worst_allowed_q = 255;
+
+  // Adjust number of tiles and tile columns to be under level limit.
+  int max_tiles, max_tile_cols;
+  av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
+  while (tile_cfg->tile_columns > 0 &&
+         (1 << tile_cfg->tile_columns) > max_tile_cols) {
+    --tile_cfg->tile_columns;
+  }
+  const int tile_cols = (1 << tile_cfg->tile_columns);
+  while (tile_cfg->tile_rows > 0 &&
+         tile_cols * (1 << tile_cfg->tile_rows) > max_tiles) {
+    --tile_cfg->tile_rows;
+  }
+
+  // Adjust min compression ratio.
+  const int still_picture = seq_params->still_picture;
+  const double min_cr =
+      av1_get_min_cr_for_level(target_level, tier, still_picture);
+  rc_cfg->min_cr = AOMMAX(rc_cfg->min_cr, (unsigned int)(min_cr * 100));
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Function to test for conditions that indicate we should loop
+ * back and recode a frame.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in]     cpi         Top-level encoder structure
+ * \param[in]     high_limit  Upper rate threshold
+ * \param[in]     low_limit   Lower rate threshold
+ * \param[in]     q           Current q index
+ * \param[in]     maxq        Maximum allowed q index
+ * \param[in]     minq        Minimum allowed q index
+ *
+ * \return        Indicates if a recode is required.
+ * \retval        1           Recode Required
+ * \retval        0           No Recode required
+ */
+static AOM_INLINE int recode_loop_test(AV1_COMP *cpi, int high_limit,
+                                       int low_limit, int q, int maxq,
+                                       int minq) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
+  int force_recode = 0;
+
+  if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) ||
+      (frame_is_kfgfarf &&
+       (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+    // TODO(agrange) high_limit could be greater than the scale-down threshold.
+    if ((rc->projected_frame_size > high_limit && q < maxq) ||
+        (rc->projected_frame_size < low_limit && q > minq)) {
+      force_recode = 1;
+    } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+      // Deal with frame undershoot and whether or not we are
+      // below the automatically set cq level.
+      if (q > oxcf->rc_cfg.cq_level &&
+          rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+        force_recode = 1;
+      }
+    }
+  }
+  return force_recode;
+}
+
+static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor,
+                                                             double max_factor,
+                                                             int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor = (200.0 + 10.0 * factor);
+  return factor;
+}
+
+static AOM_INLINE int get_gfu_boost_from_r0_lap(double min_factor,
+                                                double max_factor, double r0,
+                                                int frames_to_key) {
+  double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
+                                                      frames_to_key);
+  const int boost = (int)rint(factor / r0);
+  return boost;
+}
+
+static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, 10.0);
+  factor = AOMMAX(factor, 4.0);
+  factor = (75.0 + 14.0 * factor);
+  return factor;
+}
+
+static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low,
+                                                int q_high, int top_index,
+                                                int bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+  int q_regulated =
+      av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                        AOMMAX(q_high, top_index), cm->width, cm->height);
+
+  int retries = 0;
+  while (q_regulated < q_low && retries < 10) {
+    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+    q_regulated =
+        av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                          AOMMAX(q_high, top_index), cm->width, cm->height);
+    retries++;
+  }
+  return q_regulated;
+}
+
+static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi,
+                                                 int q_high, int top_index,
+                                                 int bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+  int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                      top_index, cm->width, cm->height);
+
+  int retries = 0;
+  while (q_regulated > q_high && retries < 10) {
+    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+    q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                    top_index, cm->width, cm->height);
+    retries++;
+  }
+  return q_regulated;
+}
+
+/*!\brief Called after encode_with_recode_loop() has just encoded a frame.
+ * This function works out whether we undershot or overshot our bitrate
+ *  target and adjusts q as appropriate. It also decides whether or not
+ *  we need to recode the frame to get closer to the target rate.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in]     cpi             Top-level encoder structure
+ * \param[out]    loop            Should we go around the recode loop again
+ * \param[in,out] q               New q index value
+ * \param[in,out] q_low           Low q index limit for this loop itteration
+ * \param[in,out] q_high          High q index limit for this loop itteration
+ * \param[in]     top_index       Max permited new value for q index
+ * \param[in]     bottom_index    Min permited new value for q index
+ * \param[in,out] undershoot_seen Have we seen undershoot on this frame
+ * \param[in,out] overshoot_seen  Have we seen overshoot on this frame
+ * \param[in,out] low_cr_seen     Have we previously trriggered recode
+ *                                because the compression ration was less
+ *                                than a given minimum threshold.
+ * \param[in]     loop_count      Loop itterations so far.
+ *
+ */
+static AOM_INLINE void recode_loop_update_q(
+    AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
+    int *const q_high, const int top_index, const int bottom_index,
+    int *const undershoot_seen, int *const overshoot_seen,
+    int *const low_cr_seen, const int loop_count) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+  *loop = 0;
+
+  // Special case for overlay frame.
+  if (rc->is_src_frame_alt_ref &&
+      rc->projected_frame_size < rc->max_frame_bandwidth)
+    return;
+
+  const int min_cr = rc_cfg->min_cr;
+  if (min_cr > 0) {
+    aom_clear_system_state();
+    const double compression_ratio =
+        av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
+    const double target_cr = min_cr / 100.0;
+    if (compression_ratio < target_cr) {
+      *low_cr_seen = 1;
+      if (*q < rc->worst_quality) {
+        const double cr_ratio = target_cr / compression_ratio;
+        const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
+        *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
+        *q_low = AOMMAX(*q, *q_low);
+        *q_high = AOMMAX(*q, *q_high);
+        *loop = 1;
+      }
+    }
+    if (*low_cr_seen) return;
+  }
+
+  if (rc_cfg->mode == AOM_Q) return;
+
+  const int last_q = *q;
+  int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
+  av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+                                   &frame_under_shoot_limit,
+                                   &frame_over_shoot_limit);
+  if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+
+  if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced &&
+      rc->projected_frame_size < rc->max_frame_bandwidth) {
+    int64_t kf_err;
+    const int64_t high_err_target = cpi->ambient_err;
+    const int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (cm->seq_params.use_highbitdepth) {
+      kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    } else {
+      kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    }
+#else
+    kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+    // Prevent possible divide by zero error below for perfect KF
+    kf_err += !kf_err;
+
+    // The key frame is not good enough or we can afford
+    // to make it better without undue risk of popping.
+    if ((kf_err > high_err_target &&
+         rc->projected_frame_size <= frame_over_shoot_limit) ||
+        (kf_err > low_err_target &&
+         rc->projected_frame_size <= frame_under_shoot_limit)) {
+      // Lower q_high
+      *q_high = AOMMAX(*q - 1, *q_low);
+
+      // Adjust Q
+      *q = (int)((*q * high_err_target) / kf_err);
+      *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
+    } else if (kf_err < low_err_target &&
+               rc->projected_frame_size >= frame_under_shoot_limit) {
+      // The key frame is much better than the previous frame
+      // Raise q_low
+      *q_low = AOMMIN(*q + 1, *q_high);
+
+      // Adjust Q
+      *q = (int)((*q * low_err_target) / kf_err);
+      *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
+    }
+
+    // Clamp Q to upper and lower limits:
+    *q = clamp(*q, *q_low, *q_high);
+    *loop = (*q != last_q);
+    return;
+  }
+
+  if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q,
+                       AOMMAX(*q_high, top_index), bottom_index)) {
+    // Is the projected frame size out of range and are we allowed
+    // to attempt to recode.
+
+    // Frame size out of permitted range:
+    // Update correction factor & compute new Q to try...
+    // Frame is too large
+    if (rc->projected_frame_size > rc->this_frame_target) {
+      // Special case if the projected size is > the max allowed.
+      if (*q == *q_high &&
+          rc->projected_frame_size >= rc->max_frame_bandwidth) {
+        const double q_val_high_current =
+            av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth);
+        const double q_val_high_new =
+            q_val_high_current *
+            ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth,
+                                  rc->best_quality, rc->worst_quality);
+      }
+
+      // Raise Qlow as to at least the current value
+      *q_low = AOMMIN(*q + 1, *q_high);
+
+      if (*undershoot_seen || loop_count > 2 ||
+          (loop_count == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+        *q = (*q_high + *q_low + 1) / 2;
+      } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low + 1) / 2;
+        const int q_regulated = get_regulated_q_overshoot(
+            cpi, *q_low, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_count < 2 and loop_count > 2.
+        *q = (q_mid + q_regulated + 1) / 2;
+      } else {
+        *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index,
+                                       bottom_index);
+      }
+
+      *overshoot_seen = 1;
+    } else {
+      // Frame is too small
+      *q_high = AOMMAX(*q - 1, *q_low);
+
+      if (*overshoot_seen || loop_count > 2 ||
+          (loop_count == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+        *q = (*q_high + *q_low) / 2;
+      } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low) / 2;
+        const int q_regulated =
+            get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_count < 2 and loop_count > 2.
+        *q = (q_mid + q_regulated) / 2;
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passsed in value.
+        if (rc_cfg->mode == AOM_CQ && q_regulated < *q_low) {
+          *q_low = *q;
+        }
+      } else {
+        *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passsed in value.
+        if (rc_cfg->mode == AOM_CQ && *q < *q_low) {
+          *q_low = *q;
+        }
+      }
+
+      *undershoot_seen = 1;
+    }
+
+    // Clamp Q to upper and lower limits:
+    *q = clamp(*q, *q_low, *q_high);
+  }
+
+  *loop = (*q != last_q);
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RC_UTILS_H_
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index e48c771..59fd5f3 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -84,68 +84,72 @@
   },
 };
 
-void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
                          FRAME_CONTEXT *fc) {
   int i, j;
 
   for (i = 0; i < PARTITION_CONTEXTS; ++i)
-    av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL);
+    av1_cost_tokens_from_cdf(mode_costs->partition_cost[i],
+                             fc->partition_cdf[i], NULL);
 
   if (cm->current_frame.skip_mode_info.skip_mode_flag) {
-    for (i = 0; i < SKIP_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i],
-                               NULL);
+    for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i],
+                               fc->skip_mode_cdfs[i], NULL);
     }
   }
 
   for (i = 0; i < SKIP_CONTEXTS; ++i) {
-    av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL);
+    av1_cost_tokens_from_cdf(mode_costs->skip_txfm_cost[i],
+                             fc->skip_txfm_cdfs[i], NULL);
   }
 
   for (i = 0; i < KF_MODE_CONTEXTS; ++i)
     for (j = 0; j < KF_MODE_CONTEXTS; ++j)
-      av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->y_mode_costs[i][j],
+                               fc->kf_y_cdf[i][j], NULL);
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
-    av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL);
+    av1_cost_tokens_from_cdf(mode_costs->mbmode_cost[i], fc->y_mode_cdf[i],
+                             NULL);
   for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
     for (j = 0; j < INTRA_MODES; ++j)
-      av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j],
+      av1_cost_tokens_from_cdf(mode_costs->intra_uv_mode_cost[i][j],
                                fc->uv_mode_cdf[i][j], NULL);
 
-  av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf,
-                           NULL);
+  av1_cost_tokens_from_cdf(mode_costs->filter_intra_mode_cost,
+                           fc->filter_intra_mode_cdf, NULL);
   for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
     if (av1_filter_intra_allowed_bsize(cm, i))
-      av1_cost_tokens_from_cdf(x->filter_intra_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->filter_intra_cost[i],
                                fc->filter_intra_cdfs[i], NULL);
   }
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    av1_cost_tokens_from_cdf(x->switchable_interp_costs[i],
+    av1_cost_tokens_from_cdf(mode_costs->switchable_interp_costs[i],
                              fc->switchable_interp_cdf[i], NULL);
 
   for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
-    av1_cost_tokens_from_cdf(x->palette_y_size_cost[i],
+    av1_cost_tokens_from_cdf(mode_costs->palette_y_size_cost[i],
                              fc->palette_y_size_cdf[i], NULL);
-    av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i],
+    av1_cost_tokens_from_cdf(mode_costs->palette_uv_size_cost[i],
                              fc->palette_uv_size_cdf[i], NULL);
     for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) {
-      av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j],
+      av1_cost_tokens_from_cdf(mode_costs->palette_y_mode_cost[i][j],
                                fc->palette_y_mode_cdf[i][j], NULL);
     }
   }
 
   for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) {
-    av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i],
+    av1_cost_tokens_from_cdf(mode_costs->palette_uv_mode_cost[i],
                              fc->palette_uv_mode_cdf[i], NULL);
   }
 
   for (i = 0; i < PALETTE_SIZES; ++i) {
     for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
-      av1_cost_tokens_from_cdf(x->palette_y_color_cost[i][j],
+      av1_cost_tokens_from_cdf(mode_costs->palette_y_color_cost[i][j],
                                fc->palette_y_color_index_cdf[i][j], NULL);
-      av1_cost_tokens_from_cdf(x->palette_uv_color_cost[i][j],
+      av1_cost_tokens_from_cdf(mode_costs->palette_uv_color_cost[i][j],
                                fc->palette_uv_color_index_cdf[i][j], NULL);
     }
   }
@@ -153,8 +157,8 @@
   int sign_cost[CFL_JOINT_SIGNS];
   av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
   for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
-    int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U];
-    int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V];
+    int *cost_u = mode_costs->cfl_cost[joint_sign][CFL_PRED_U];
+    int *cost_v = mode_costs->cfl_cost[joint_sign][CFL_PRED_V];
     if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) {
       memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
     } else {
@@ -173,11 +177,11 @@
 
   for (i = 0; i < MAX_TX_CATS; ++i)
     for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
-      av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->tx_size_cost[i][j],
+                               fc->tx_size_cdf[i][j], NULL);
 
   for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) {
-    av1_cost_tokens_from_cdf(x->txfm_partition_cost[i],
+    av1_cost_tokens_from_cdf(mode_costs->txfm_partition_cost[i],
                              fc->txfm_partition_cdf[i], NULL);
   }
 
@@ -186,7 +190,7 @@
     for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
       if (use_inter_ext_tx_for_txsize[s][i]) {
         av1_cost_tokens_from_cdf(
-            x->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
+            mode_costs->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
             av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]);
       }
     }
@@ -194,123 +198,132 @@
       if (use_intra_ext_tx_for_txsize[s][i]) {
         for (j = 0; j < INTRA_MODES; ++j) {
           av1_cost_tokens_from_cdf(
-              x->intra_tx_type_costs[s][i][j], fc->intra_ext_tx_cdf[s][i][j],
+              mode_costs->intra_tx_type_costs[s][i][j],
+              fc->intra_ext_tx_cdf[s][i][j],
               av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]);
         }
       }
     }
   }
   for (i = 0; i < DIRECTIONAL_MODES; ++i) {
-    av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i],
-                             NULL);
+    av1_cost_tokens_from_cdf(mode_costs->angle_delta_cost[i],
+                             fc->angle_delta_cdf[i], NULL);
   }
-  av1_cost_tokens_from_cdf(x->switchable_restore_cost,
-                           fc->switchable_restore_cdf, NULL);
-  av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf,
-                           NULL);
-  av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf,
-                           NULL);
-  av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL);
+  av1_cost_tokens_from_cdf(mode_costs->intrabc_cost, fc->intrabc_cdf, NULL);
 
   if (!frame_is_intra_only(cm)) {
     for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->comp_inter_cost[i],
+                               fc->comp_inter_cdf[i], NULL);
     }
 
     for (i = 0; i < REF_CONTEXTS; ++i) {
       for (j = 0; j < SINGLE_REFS - 1; ++j) {
-        av1_cost_tokens_from_cdf(x->single_ref_cost[i][j],
+        av1_cost_tokens_from_cdf(mode_costs->single_ref_cost[i][j],
                                  fc->single_ref_cdf[i][j], NULL);
       }
     }
 
     for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->comp_ref_type_cost[i],
                                fc->comp_ref_type_cdf[i], NULL);
     }
 
     for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
       for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) {
-        av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j],
+        av1_cost_tokens_from_cdf(mode_costs->uni_comp_ref_cost[i][j],
                                  fc->uni_comp_ref_cdf[i][j], NULL);
       }
     }
 
     for (i = 0; i < REF_CONTEXTS; ++i) {
       for (j = 0; j < FWD_REFS - 1; ++j) {
-        av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j],
-                                 NULL);
+        av1_cost_tokens_from_cdf(mode_costs->comp_ref_cost[i][j],
+                                 fc->comp_ref_cdf[i][j], NULL);
       }
     }
 
     for (i = 0; i < REF_CONTEXTS; ++i) {
       for (j = 0; j < BWD_REFS - 1; ++j) {
-        av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j],
+        av1_cost_tokens_from_cdf(mode_costs->comp_bwdref_cost[i][j],
                                  fc->comp_bwdref_cdf[i][j], NULL);
       }
     }
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->intra_inter_cost[i],
+                               fc->intra_inter_cdf[i], NULL);
     }
 
     for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->newmv_mode_cost[i], fc->newmv_cdf[i],
+                               NULL);
     }
 
     for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->zeromv_mode_cost[i],
+                               fc->zeromv_cdf[i], NULL);
     }
 
     for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->refmv_mode_cost[i], fc->refmv_cdf[i],
+                               NULL);
     }
 
     for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->drl_mode_cost0[i], fc->drl_cdf[i],
+                               NULL);
     }
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->inter_compound_mode_cost[i],
                                fc->inter_compound_mode_cdf[i], NULL);
     for (i = 0; i < BLOCK_SIZES_ALL; ++i)
-      av1_cost_tokens_from_cdf(x->compound_type_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->compound_type_cost[i],
                                fc->compound_type_cdf[i], NULL);
     for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
       if (av1_is_wedge_used(i)) {
-        av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i],
-                                 NULL);
+        av1_cost_tokens_from_cdf(mode_costs->wedge_idx_cost[i],
+                                 fc->wedge_idx_cdf[i], NULL);
       }
     }
     for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
-      av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i],
-                               NULL);
-      av1_cost_tokens_from_cdf(x->interintra_mode_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->interintra_cost[i],
+                               fc->interintra_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->interintra_mode_cost[i],
                                fc->interintra_mode_cdf[i], NULL);
     }
     for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
-      av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->wedge_interintra_cost[i],
                                fc->wedge_interintra_cdf[i], NULL);
     }
     for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
-      av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost[i],
+                               fc->motion_mode_cdf[i], NULL);
     }
     for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
-      av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost1[i],
+                               fc->obmc_cdf[i], NULL);
     }
     for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->comp_idx_cost[i],
+                               fc->compound_index_cdf[i], NULL);
     }
     for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->comp_group_idx_cost[i],
                                fc->comp_group_idx_cdf[i], NULL);
     }
   }
 }
 
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc) {
+  av1_cost_tokens_from_cdf(mode_costs->switchable_restore_cost,
+                           fc->switchable_restore_cdf, NULL);
+  av1_cost_tokens_from_cdf(mode_costs->wiener_restore_cost,
+                           fc->wiener_restore_cdf, NULL);
+  av1_cost_tokens_from_cdf(mode_costs->sgrproj_restore_cost,
+                           fc->sgrproj_restore_cdf, NULL);
+}
+
 // Values are now correlated to quantizer.
 static int sad_per_bit_lut_8[QINDEX_RANGE];
 static int sad_per_bit_lut_10[QINDEX_RANGE];
@@ -336,14 +349,15 @@
 
 static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
                                          8,  8,  4,  4,  2,  2,  1,  0 };
-static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
-                                                              128, 144, 144,
-                                                              128 };
+
+static const int rd_layer_depth_factor[7] = {
+  160, 160, 160, 160, 192, 208, 224
+};
 
 int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) {
   const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
-  int rdmult = q * q;
-  rdmult = rdmult * 3 + (rdmult * 2 / 3);
+  int rdmult = (int)(((int64_t)88 * q * q) / 24);
+
   switch (cpi->common.seq_params.bit_depth) {
     case AOM_BITS_8: break;
     case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
@@ -360,10 +374,13 @@
   if (is_stat_consumption_stage(cpi) &&
       (cpi->common.current_frame.frame_type != KEY_FRAME)) {
     const GF_GROUP *const gf_group = &cpi->gf_group;
-    const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
     const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
+    const int layer_depth = AOMMIN(gf_group->layer_depth[gf_group->index], 6);
 
-    rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
+    // Layer depth adjustment
+    rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7;
+
+    // ARF boost adjustment
     rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
   }
   return (int)rdmult;
@@ -391,32 +408,10 @@
 int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) {
   assert(beta > 0.0);
   const AV1_COMMON *cm = &cpi->common;
-  int64_t q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
-                               cm->seq_params.bit_depth);
-  int64_t rdmult = 0;
+  int q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                           cm->seq_params.bit_depth);
 
-  switch (cm->seq_params.bit_depth) {
-    case AOM_BITS_8: rdmult = (int)((88 * q * q / beta) / 24); break;
-    case AOM_BITS_10:
-      rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 4);
-      break;
-    default:
-      assert(cm->seq_params.bit_depth == AOM_BITS_12);
-      rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 8);
-      break;
-  }
-
-  if (is_stat_consumption_stage(cpi) &&
-      (cm->current_frame.frame_type != KEY_FRAME)) {
-    const GF_GROUP *const gf_group = &cpi->gf_group;
-    const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
-    const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
-
-    rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
-    rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
-  }
-  if (rdmult < 1) rdmult = 1;
-  return (int)rdmult;
+  return (int)(av1_compute_rd_mult(cpi, q) / beta);
 }
 
 static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
@@ -437,11 +432,11 @@
   return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
 
-void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
+void av1_set_sad_per_bit(const AV1_COMP *cpi, MvCosts *mv_costs, int qindex) {
   switch (cpi->common.seq_params.bit_depth) {
-    case AOM_BITS_8: x->sadperbit = sad_per_bit_lut_8[qindex]; break;
-    case AOM_BITS_10: x->sadperbit = sad_per_bit_lut_10[qindex]; break;
-    case AOM_BITS_12: x->sadperbit = sad_per_bit_lut_12[qindex]; break;
+    case AOM_BITS_8: mv_costs->sadperbit = sad_per_bit_lut_8[qindex]; break;
+    case AOM_BITS_10: mv_costs->sadperbit = sad_per_bit_lut_10[qindex]; break;
+    case AOM_BITS_12: mv_costs->sadperbit = sad_per_bit_lut_12[qindex]; break;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
   }
@@ -471,12 +466,12 @@
   }
 }
 
-void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
                           const int num_planes) {
   const int nplanes = AOMMIN(num_planes, PLANE_TYPES);
   for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) {
     for (int plane = 0; plane < nplanes; ++plane) {
-      LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane];
+      LV_MAP_EOB_COST *pcost = &coeff_costs->eob_costs[eob_multi_size][plane];
 
       for (int ctx = 0; ctx < 2; ++ctx) {
         aom_cdf_prob *pcdf;
@@ -496,7 +491,7 @@
   }
   for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
     for (int plane = 0; plane < nplanes; ++plane) {
-      LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane];
+      LV_MAP_COEFF_COST *pcost = &coeff_costs->coeff_costs[tx_size][plane];
 
       for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
         av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
@@ -565,20 +560,20 @@
 }
 
 void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp,
-                       MACROBLOCK *x) {
-  x->nmvcost[0] = &x->nmv_costs[0][MV_MAX];
-  x->nmvcost[1] = &x->nmv_costs[1][MV_MAX];
-  x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX];
-  x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX];
+                       MvCosts *mv_costs) {
+  mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+  mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+  mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+  mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
   if (integer_mv) {
-    av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &fc->nmvc,
-                             MV_SUBPEL_NONE);
-    x->mv_cost_stack = (int **)&x->nmvcost;
+    mv_costs->mv_cost_stack = (int **)&mv_costs->nmv_cost;
+    av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+                             &fc->nmvc, MV_SUBPEL_NONE);
   } else {
-    int *(*src)[2] = usehp ? &x->nmvcost_hp : &x->nmvcost;
-    x->mv_cost_stack = *src;
-    av1_build_nmv_cost_table(
-        x->nmv_vec_cost, usehp ? x->nmvcost_hp : x->nmvcost, &fc->nmvc, usehp);
+    mv_costs->mv_cost_stack =
+        usehp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
+    av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+                             &fc->nmvc, usehp);
   }
 }
 
@@ -586,21 +581,22 @@
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   RD_OPT *const rd = &cpi->rd;
+  MvCosts *mv_costs = &x->mv_costs;
 
   aom_clear_system_state();
 
   rd->RDMULT = av1_compute_rd_mult(
       cpi, cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q);
 
-  set_error_per_bit(x, rd->RDMULT);
+  av1_set_error_per_bit(mv_costs, rd->RDMULT);
 
   set_block_thresholds(cm, rd);
 
   if ((!cpi->sf.rt_sf.use_nonrd_pick_mode &&
-       cpi->oxcf.mv_cost_upd_freq != COST_UPD_OFF) ||
+       cpi->oxcf.cost_upd_freq.mv != COST_UPD_OFF) ||
       frame_is_intra_only(cm) || (cm->current_frame.frame_number & 0x07) == 1)
     av1_fill_mv_costs(cm->fc, cm->features.cur_frame_force_integer_mv,
-                      cm->features.allow_high_precision_mv, x);
+                      cm->features.allow_high_precision_mv, mv_costs);
 
   if (!cpi->sf.rt_sf.use_nonrd_pick_mode && frame_is_intra_only(cm) &&
       cm->features.allow_screen_content_tools &&
@@ -989,19 +985,15 @@
                  int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
   const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
   const int_mv ref_mv =
-      av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext);
+      av1_get_ref_mv_from_stack(0, ref_frames, 0, &x->mbmi_ext);
   const int_mv ref_mv1 =
-      av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext);
+      av1_get_ref_mv_from_stack(0, ref_frames, 1, &x->mbmi_ext);
   MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
   int num_mv_refs = 0;
   pred_mv[num_mv_refs++] = ref_mv.as_mv;
   if (ref_mv.as_int != ref_mv1.as_int) {
     pred_mv[num_mv_refs++] = ref_mv1.as_mv;
   }
-  if (cpi->sf.mv_sf.adaptive_motion_search &&
-      block_size < x->max_partition_size) {
-    pred_mv[num_mv_refs++] = x->pred_mv[ref_frame];
-  }
 
   assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
 
@@ -1050,7 +1042,7 @@
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   for (int i = 0; i < num_planes; ++i) {
-    setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf,
+    setup_pred_plane(dst + i, xd->mi[0]->bsize, dst[i].buf,
                      i ? src->uv_crop_width : src->y_crop_width,
                      i ? src->uv_crop_height : src->y_crop_height,
                      dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
@@ -1069,17 +1061,16 @@
 }
 
 int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
-                            InterpFilter interp_filter) {
+                            InterpFilter interp_filter, int dual_filter) {
   if (interp_filter == SWITCHABLE) {
     const MB_MODE_INFO *const mbmi = xd->mi[0];
     int inter_filter_cost = 0;
-    int dir;
-
-    for (dir = 0; dir < 2; ++dir) {
+    for (int dir = 0; dir < 2; ++dir) {
+      if (dir && !dual_filter) break;
       const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
       const InterpFilter filter =
           av1_extract_interp_filter(mbmi->interp_filters, dir);
-      inter_filter_cost += x->switchable_interp_costs[ctx][filter];
+      inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx][filter];
     }
     return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
   } else {
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 1addbae..73c575f 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -35,9 +35,9 @@
   (((D) * (1 << RDDIV_BITS)) - \
    ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
 
-#define RDCOST_DBL(RM, R, D)                                       \
+#define RDCOST_DBL_WITH_NATIVE_BD_DIST(RM, R, D, BD)               \
   (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
-   ((double)(D) * (1 << RDDIV_BITS)))
+   ((double)((D) >> (2 * (BD - 8))) * (1 << RDDIV_BITS)))
 
 #define QIDX_SKIP_THRESH 115
 
@@ -78,8 +78,7 @@
 
   int RDMULT;
 
-  double r0, arf_r0;
-  double mc_saved_base, mc_count_base;
+  double r0;
 } RD_OPT;
 
 typedef struct {
@@ -104,7 +103,7 @@
   rd_stats->dist = 0;
   rd_stats->rdcost = 0;
   rd_stats->sse = 0;
-  rd_stats->skip = 1;
+  rd_stats->skip_txfm = 1;
   rd_stats->zero_rate = 0;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
@@ -129,7 +128,7 @@
   rd_stats->dist = INT64_MAX;
   rd_stats->rdcost = INT64_MAX;
   rd_stats->sse = INT64_MAX;
-  rd_stats->skip = 0;
+  rd_stats->skip_txfm = 0;
   rd_stats->zero_rate = 0;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
@@ -155,7 +154,7 @@
     rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
   rd_stats_dst->dist += rd_stats_src->dist;
   rd_stats_dst->sse += rd_stats_src->sse;
-  rd_stats_dst->skip &= rd_stats_src->skip;
+  rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
   // encoded, as there will only be 1 plane
@@ -178,13 +177,13 @@
 }
 
 static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist,
-                                           int rate, int skip, int64_t sse,
+                                           int rate, int skip_txfm, int64_t sse,
                                            int zero_rate) {
   assert(rd_stats->rate != INT_MAX && rate != INT_MAX);
   rd_stats->rate += rate;
   if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate;
   rd_stats->dist += dist;
-  rd_stats->skip &= skip;
+  rd_stats->skip_txfm &= skip_txfm;
   rd_stats->sse += sse;
 }
 
@@ -231,8 +230,9 @@
 
 void av1_initialize_rd_consts(struct AV1_COMP *cpi);
 
-void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                              int qindex);
+// Sets the multiplier to convert mv cost to l1 error during motion search.
+void av1_set_sad_per_bit(const struct AV1_COMP *cpi, MvCosts *mv_costs,
+                         int qindex);
 
 void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
                                   unsigned int qstep, int *rate, int64_t *dist);
@@ -243,7 +243,7 @@
                           double yl, double *rate_f, double *distbysse_f);
 
 int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
-                            InterpFilter interp_filter);
+                            InterpFilter interp_filter, int dual_filter);
 
 YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
                                              int ref_frame);
@@ -271,18 +271,18 @@
   }
 }
 
-static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+static INLINE int rd_less_than_thresh(int64_t best_rd, int64_t thresh,
                                       int thresh_fact) {
-  return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+  return best_rd < (thresh * thresh_fact >> 5) || thresh == INT_MAX;
 }
 
 void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
                  uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
                  BLOCK_SIZE block_size);
 
-static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
-  x->errorperbit = rdmult >> RD_EPB_SHIFT;
-  x->errorperbit += (x->errorperbit == 0);
+// Sets the multiplier to convert mv cost to l2 error during motion search.
+static INLINE void av1_set_error_per_bit(MvCosts *mv_costs, int rdmult) {
+  mv_costs->errorperbit = AOMMAX(rdmult >> RD_EPB_SHIFT, 1);
 }
 
 // Get the threshold for R-D optimization of coefficients depending upon mode
@@ -309,7 +309,7 @@
 }
 
 // Used to reset the state of tx/mb rd hash information
-static INLINE void reset_hash_records(MACROBLOCK *const x,
+static INLINE void reset_hash_records(TxfmSearchInfo *const txfm_info,
                                       int use_inter_txb_hash) {
   int32_t record_idx;
 
@@ -317,27 +317,28 @@
   if (use_inter_txb_hash) {
     for (record_idx = 0;
          record_idx < ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)); record_idx++)
-      x->txb_rd_record_8X8[record_idx].num =
-          x->txb_rd_record_8X8[record_idx].index_start = 0;
+      txfm_info->txb_rd_record_8X8[record_idx].num =
+          txfm_info->txb_rd_record_8X8[record_idx].index_start = 0;
     for (record_idx = 0;
          record_idx < ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)); record_idx++)
-      x->txb_rd_record_16X16[record_idx].num =
-          x->txb_rd_record_16X16[record_idx].index_start = 0;
+      txfm_info->txb_rd_record_16X16[record_idx].num =
+          txfm_info->txb_rd_record_16X16[record_idx].index_start = 0;
     for (record_idx = 0;
          record_idx < ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)); record_idx++)
-      x->txb_rd_record_32X32[record_idx].num =
-          x->txb_rd_record_32X32[record_idx].index_start = 0;
+      txfm_info->txb_rd_record_32X32[record_idx].num =
+          txfm_info->txb_rd_record_32X32[record_idx].index_start = 0;
     for (record_idx = 0;
          record_idx < ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)); record_idx++)
-      x->txb_rd_record_64X64[record_idx].num =
-          x->txb_rd_record_64X64[record_idx].index_start = 0;
+      txfm_info->txb_rd_record_64X64[record_idx].num =
+          txfm_info->txb_rd_record_64X64[record_idx].index_start = 0;
   }
 
   // Reset the state for use_intra_txb_hash
-  x->txb_rd_record_intra.num = x->txb_rd_record_intra.index_start = 0;
+  txfm_info->txb_rd_record_intra.num =
+      txfm_info->txb_rd_record_intra.index_start = 0;
 
   // Reset the state for use_mb_rd_hash
-  x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+  txfm_info->mb_rd_record.num = txfm_info->mb_rd_record.index_start = 0;
 }
 
 void av1_setup_pred_block(const MACROBLOCKD *xd,
@@ -350,14 +351,16 @@
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
                                aom_bit_depth_t bit_depth);
 
-void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
                          FRAME_CONTEXT *fc);
 
-void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc);
+
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
                           const int num_planes);
 
 void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp,
-                       MACROBLOCK *x);
+                       MvCosts *mv_costs);
 
 int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta);
 
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 02afcd1..0cfadc8 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -26,6 +26,7 @@
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/cfl.h"
+#include "av1/common/blockd.h"
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
 #include "av1/common/entropy.h"
@@ -53,6 +54,7 @@
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/interp_search.h"
 #include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/ml.h"
 #include "av1/encoder/mode_prune_model_weights.h"
@@ -152,132 +154,132 @@
   THR_COMP_NEAREST_NEARESTLG,
   THR_COMP_NEAREST_NEARESTBA,
 
-  THR_COMP_NEAR_NEARLA,
-  THR_COMP_NEW_NEARESTLA,
-  THR_COMP_NEAREST_NEWLA,
-  THR_COMP_NEW_NEARLA,
-  THR_COMP_NEAR_NEWLA,
-  THR_COMP_NEW_NEWLA,
-  THR_COMP_GLOBAL_GLOBALLA,
-
-  THR_COMP_NEAR_NEARL2A,
-  THR_COMP_NEW_NEARESTL2A,
-  THR_COMP_NEAREST_NEWL2A,
-  THR_COMP_NEW_NEARL2A,
-  THR_COMP_NEAR_NEWL2A,
-  THR_COMP_NEW_NEWL2A,
-  THR_COMP_GLOBAL_GLOBALL2A,
-
-  THR_COMP_NEAR_NEARL3A,
-  THR_COMP_NEW_NEARESTL3A,
-  THR_COMP_NEAREST_NEWL3A,
-  THR_COMP_NEW_NEARL3A,
-  THR_COMP_NEAR_NEWL3A,
-  THR_COMP_NEW_NEWL3A,
-  THR_COMP_GLOBAL_GLOBALL3A,
-
-  THR_COMP_NEAR_NEARGA,
-  THR_COMP_NEW_NEARESTGA,
-  THR_COMP_NEAREST_NEWGA,
-  THR_COMP_NEW_NEARGA,
-  THR_COMP_NEAR_NEWGA,
-  THR_COMP_NEW_NEWGA,
-  THR_COMP_GLOBAL_GLOBALGA,
-
   THR_COMP_NEAR_NEARLB,
+  THR_COMP_NEW_NEWLB,
   THR_COMP_NEW_NEARESTLB,
   THR_COMP_NEAREST_NEWLB,
   THR_COMP_NEW_NEARLB,
   THR_COMP_NEAR_NEWLB,
-  THR_COMP_NEW_NEWLB,
   THR_COMP_GLOBAL_GLOBALLB,
 
+  THR_COMP_NEAR_NEARLA,
+  THR_COMP_NEW_NEWLA,
+  THR_COMP_NEW_NEARESTLA,
+  THR_COMP_NEAREST_NEWLA,
+  THR_COMP_NEW_NEARLA,
+  THR_COMP_NEAR_NEWLA,
+  THR_COMP_GLOBAL_GLOBALLA,
+
+  THR_COMP_NEAR_NEARL2A,
+  THR_COMP_NEW_NEWL2A,
+  THR_COMP_NEW_NEARESTL2A,
+  THR_COMP_NEAREST_NEWL2A,
+  THR_COMP_NEW_NEARL2A,
+  THR_COMP_NEAR_NEWL2A,
+  THR_COMP_GLOBAL_GLOBALL2A,
+
+  THR_COMP_NEAR_NEARL3A,
+  THR_COMP_NEW_NEWL3A,
+  THR_COMP_NEW_NEARESTL3A,
+  THR_COMP_NEAREST_NEWL3A,
+  THR_COMP_NEW_NEARL3A,
+  THR_COMP_NEAR_NEWL3A,
+  THR_COMP_GLOBAL_GLOBALL3A,
+
+  THR_COMP_NEAR_NEARGA,
+  THR_COMP_NEW_NEWGA,
+  THR_COMP_NEW_NEARESTGA,
+  THR_COMP_NEAREST_NEWGA,
+  THR_COMP_NEW_NEARGA,
+  THR_COMP_NEAR_NEWGA,
+  THR_COMP_GLOBAL_GLOBALGA,
+
   THR_COMP_NEAR_NEARL2B,
+  THR_COMP_NEW_NEWL2B,
   THR_COMP_NEW_NEARESTL2B,
   THR_COMP_NEAREST_NEWL2B,
   THR_COMP_NEW_NEARL2B,
   THR_COMP_NEAR_NEWL2B,
-  THR_COMP_NEW_NEWL2B,
   THR_COMP_GLOBAL_GLOBALL2B,
 
   THR_COMP_NEAR_NEARL3B,
+  THR_COMP_NEW_NEWL3B,
   THR_COMP_NEW_NEARESTL3B,
   THR_COMP_NEAREST_NEWL3B,
   THR_COMP_NEW_NEARL3B,
   THR_COMP_NEAR_NEWL3B,
-  THR_COMP_NEW_NEWL3B,
   THR_COMP_GLOBAL_GLOBALL3B,
 
   THR_COMP_NEAR_NEARGB,
+  THR_COMP_NEW_NEWGB,
   THR_COMP_NEW_NEARESTGB,
   THR_COMP_NEAREST_NEWGB,
   THR_COMP_NEW_NEARGB,
   THR_COMP_NEAR_NEWGB,
-  THR_COMP_NEW_NEWGB,
   THR_COMP_GLOBAL_GLOBALGB,
 
   THR_COMP_NEAR_NEARLA2,
+  THR_COMP_NEW_NEWLA2,
   THR_COMP_NEW_NEARESTLA2,
   THR_COMP_NEAREST_NEWLA2,
   THR_COMP_NEW_NEARLA2,
   THR_COMP_NEAR_NEWLA2,
-  THR_COMP_NEW_NEWLA2,
   THR_COMP_GLOBAL_GLOBALLA2,
 
   THR_COMP_NEAR_NEARL2A2,
+  THR_COMP_NEW_NEWL2A2,
   THR_COMP_NEW_NEARESTL2A2,
   THR_COMP_NEAREST_NEWL2A2,
   THR_COMP_NEW_NEARL2A2,
   THR_COMP_NEAR_NEWL2A2,
-  THR_COMP_NEW_NEWL2A2,
   THR_COMP_GLOBAL_GLOBALL2A2,
 
   THR_COMP_NEAR_NEARL3A2,
+  THR_COMP_NEW_NEWL3A2,
   THR_COMP_NEW_NEARESTL3A2,
   THR_COMP_NEAREST_NEWL3A2,
   THR_COMP_NEW_NEARL3A2,
   THR_COMP_NEAR_NEWL3A2,
-  THR_COMP_NEW_NEWL3A2,
   THR_COMP_GLOBAL_GLOBALL3A2,
 
   THR_COMP_NEAR_NEARGA2,
+  THR_COMP_NEW_NEWGA2,
   THR_COMP_NEW_NEARESTGA2,
   THR_COMP_NEAREST_NEWGA2,
   THR_COMP_NEW_NEARGA2,
   THR_COMP_NEAR_NEWGA2,
-  THR_COMP_NEW_NEWGA2,
   THR_COMP_GLOBAL_GLOBALGA2,
 
   THR_COMP_NEAR_NEARLL2,
+  THR_COMP_NEW_NEWLL2,
   THR_COMP_NEW_NEARESTLL2,
   THR_COMP_NEAREST_NEWLL2,
   THR_COMP_NEW_NEARLL2,
   THR_COMP_NEAR_NEWLL2,
-  THR_COMP_NEW_NEWLL2,
   THR_COMP_GLOBAL_GLOBALLL2,
 
   THR_COMP_NEAR_NEARLL3,
+  THR_COMP_NEW_NEWLL3,
   THR_COMP_NEW_NEARESTLL3,
   THR_COMP_NEAREST_NEWLL3,
   THR_COMP_NEW_NEARLL3,
   THR_COMP_NEAR_NEWLL3,
-  THR_COMP_NEW_NEWLL3,
   THR_COMP_GLOBAL_GLOBALLL3,
 
   THR_COMP_NEAR_NEARLG,
+  THR_COMP_NEW_NEWLG,
   THR_COMP_NEW_NEARESTLG,
   THR_COMP_NEAREST_NEWLG,
   THR_COMP_NEW_NEARLG,
   THR_COMP_NEAR_NEWLG,
-  THR_COMP_NEW_NEWLG,
   THR_COMP_GLOBAL_GLOBALLG,
 
   THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEWBA,
   THR_COMP_NEW_NEARESTBA,
   THR_COMP_NEAREST_NEWBA,
   THR_COMP_NEW_NEARBA,
   THR_COMP_NEAR_NEWBA,
-  THR_COMP_NEW_NEWBA,
   THR_COMP_GLOBAL_GLOBALBA,
 
   THR_DC,
@@ -295,23 +297,7 @@
   THR_D45_PRED,
 };
 
-static int find_last_single_ref_mode_idx(const THR_MODES *mode_order) {
-  uint8_t mode_found[NUM_SINGLE_REF_MODES];
-  av1_zero(mode_found);
-  int num_single_ref_modes_left = NUM_SINGLE_REF_MODES;
-
-  for (int idx = 0; idx < MAX_MODES; idx++) {
-    const THR_MODES curr_mode = mode_order[idx];
-    if (curr_mode < SINGLE_REF_MODE_END) {
-      num_single_ref_modes_left--;
-    }
-    if (!num_single_ref_modes_left) {
-      return idx;
-    }
-  }
-  return -1;
-}
-
+/*!\cond */
 typedef struct SingleInterModeState {
   int64_t rd;
   MV_REFERENCE_FRAME ref_frame;
@@ -333,6 +319,11 @@
   int64_t mode_threshold[MAX_MODES];
   int64_t best_intra_rd;
   unsigned int best_pred_sse;
+
+  /*!
+   * \brief Keep track of best intra rd for use in compound mode.
+   */
+  int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_pred_diff[REFERENCE_MODES];
   // Save a set of single_newmv for each checked ref_mv.
   int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES];
@@ -341,6 +332,8 @@
   int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
   // The rd of simple translation in single inter modes
   int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
+  int64_t best_single_rd[REF_FRAMES];
+  PREDICTION_MODE best_single_mode[REF_FRAMES];
 
   // Single search results by [directions][modes][reference frames]
   SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
@@ -350,7 +343,9 @@
   int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
   MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
   IntraModeSearchState intra_search_state;
+  RD_STATS best_y_rdcost;
 } InterModeSearchState;
+/*!\endcond */
 
 void av1_inter_mode_data_init(TileDataEnc *tile_data) {
   for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
@@ -621,8 +616,8 @@
     if (plane && !xd->is_chroma_ref) break;
     const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
-                                               pd->subsampling_y);
+    const BLOCK_SIZE bs =
+        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     unsigned int sse;
 
     cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
@@ -701,10 +696,10 @@
   return 0;
 }
 
-static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
                        int16_t mode_context) {
   if (is_inter_compound_mode(mode)) {
-    return x
+    return mode_costs
         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
   }
 
@@ -714,19 +709,19 @@
   assert(is_inter_mode(mode));
 
   if (mode == NEWMV) {
-    mode_cost = x->newmv_mode_cost[mode_ctx][0];
+    mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
     return mode_cost;
   } else {
-    mode_cost = x->newmv_mode_cost[mode_ctx][1];
+    mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
     mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
 
     if (mode == GLOBALMV) {
-      mode_cost += x->zeromv_mode_cost[mode_ctx][0];
+      mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
       return mode_cost;
     } else {
-      mode_cost += x->zeromv_mode_cost[mode_ctx][1];
+      mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-      mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+      mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
       return mode_cost;
     }
   }
@@ -739,7 +734,7 @@
 }
 
 static AOM_INLINE void estimate_ref_frame_costs(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs,
     int segment_id, unsigned int *ref_costs_single,
     unsigned int (*ref_costs_comp)[REF_FRAMES]) {
   int seg_ref_active =
@@ -752,8 +747,9 @@
              REF_FRAMES * sizeof((*ref_costs_comp)[0]));
   } else {
     int intra_inter_ctx = av1_get_intra_inter_context(xd);
-    ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
-    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+    ref_costs_single[INTRA_FRAME] =
+        mode_costs->intra_inter_cost[intra_inter_ctx][0];
+    unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
 
     for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
       ref_costs_single[i] = base_cost;
@@ -768,38 +764,41 @@
     // Determine cost of a single ref frame, where frame types are represented
     // by a tree:
     // Level 0: add cost whether this ref is a forward or backward ref
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[ALTREF2_FRAME] +=
+        mode_costs->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
 
     // Level 1: if this ref is forward ref,
     // add cost whether it is last/last2 or last3/golden
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+    ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+    ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
 
     // Level 1: if this ref is backward ref
     // then add cost whether this ref is altref or backward ref
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
-    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
+    ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0];
+    ref_costs_single[ALTREF2_FRAME] +=
+        mode_costs->single_ref_cost[ctx_p2][1][0];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][1];
 
     // Level 2: further add cost whether this ref is last or last2
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][0];
+    ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][1];
 
     // Level 2: last3 or golden
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
+    ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][0];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][1];
 
     // Level 2: bwdref or altref2
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];
+    ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][0];
+    ref_costs_single[ALTREF2_FRAME] +=
+        mode_costs->single_ref_cost[ctx_p6][5][1];
 
     if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
       // Similar to single ref, determine cost of compound ref frames.
@@ -815,34 +814,42 @@
 
       ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
           ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
-              base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
+              base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
       ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
       ref_bicomp_costs[ALTREF_FRAME] = 0;
 
       // cost of first ref frame
-      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
-      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+      ref_bicomp_costs[LAST_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST2_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST3_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+      ref_bicomp_costs[GOLDEN_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
 
-      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
-      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];
+      ref_bicomp_costs[LAST_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0];
+      ref_bicomp_costs[LAST2_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1];
 
-      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
-      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];
+      ref_bicomp_costs[LAST3_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0];
+      ref_bicomp_costs[GOLDEN_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1];
 
       // cost of second ref frame
       ref_bicomp_costs[BWDREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
       ref_bicomp_costs[ALTREF2_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
       ref_bicomp_costs[ALTREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
 
       ref_bicomp_costs[BWDREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
       ref_bicomp_costs[ALTREF2_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
 
       // cost: if one ref frame is forward ref, the other ref is backward ref
       int ref0, ref1;
@@ -858,22 +865,22 @@
       const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
       const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
       ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
       ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
       ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
       ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
     } else {
       int ref0, ref1;
       for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
@@ -899,13 +906,13 @@
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
-  ctx->rd_stats.skip = x->force_skip;
+  ctx->rd_stats.skip_txfm = x->txfm_search_info.skip_txfm;
   ctx->skippable = skippable;
 #if CONFIG_INTERNAL_STATS
   ctx->best_mode_index = mode_index;
 #endif  // CONFIG_INTERNAL_STATS
   ctx->mic = *xd->mi[0];
-  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
                                       av1_ref_frame_type(xd->mi[0]->ref_frame));
   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
   ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
@@ -921,7 +928,7 @@
       av1_get_scaled_ref_frame(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const struct scale_factors *const sf =
       get_ref_scale_factors_const(cm, ref_frame);
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
@@ -979,7 +986,7 @@
                             InterModeSearchState *search_state) {
   const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
   const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
   PREDICTION_MODE compare_mode = MB_MODE_COUNT;
   if (!is_comp_pred) {
@@ -1012,8 +1019,9 @@
           INT64_MAX) {
         const int16_t mode_ctx =
             av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
-        const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx);
-        const int this_cost = cost_mv_ref(x, this_mode, mode_ctx);
+        const int compare_cost =
+            cost_mv_ref(&x->mode_costs, compare_mode, mode_ctx);
+        const int this_cost = cost_mv_ref(&x->mode_costs, this_mode, mode_ctx);
 
         // Only skip if the mode cost is larger than compare mode cost
         if (this_cost > compare_cost) {
@@ -1055,8 +1063,8 @@
                             const BLOCK_SIZE bsize, int_mv *cur_mv,
                             int *const rate_mv, HandleInterModeArgs *const args,
                             inter_mode_info *mode_info) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
   const int refs[2] = { mbmi->ref_frame[0],
@@ -1066,7 +1074,6 @@
   if (is_comp_pred) {
     const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]];
     const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]];
-
     if (this_mode == NEW_NEWMV) {
       if (valid_mv0) {
         cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
@@ -1076,55 +1083,32 @@
         cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
         clamp_mv_in_range(x, &cur_mv[1], 1);
       }
-
-      // aomenc1
-      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
-          !valid_mv0 || !valid_mv1) {
-        av1_joint_motion_search(cpi, x, bsize, cur_mv, NULL, 0, rate_mv);
-      } else {
-        *rate_mv = 0;
-        for (int i = 0; i < 2; ++i) {
-          const int_mv ref_mv = av1_get_ref_mv(x, i);
-          *rate_mv +=
-              av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                              x->mv_cost_stack, MV_COST_WEIGHT);
-        }
+      *rate_mv = 0;
+      for (int i = 0; i < 2; ++i) {
+        const int_mv ref_mv = av1_get_ref_mv(x, i);
+        *rate_mv += av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv,
+                                    x->mv_costs.nmv_joint_cost,
+                                    x->mv_costs.mv_cost_stack, MV_COST_WEIGHT);
       }
     } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
       if (valid_mv1) {
         cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
         clamp_mv_in_range(x, &cur_mv[1], 1);
       }
-
-      // aomenc2
-      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
-          !valid_mv1) {
-        av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv,
-                                                     NULL, 0, rate_mv, 1);
-      } else {
-        const int_mv ref_mv = av1_get_ref_mv(x, 1);
-        *rate_mv =
-            av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                            x->mv_cost_stack, MV_COST_WEIGHT);
-      }
+      const int_mv ref_mv = av1_get_ref_mv(x, 1);
+      *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv,
+                                 x->mv_costs.nmv_joint_cost,
+                                 x->mv_costs.mv_cost_stack, MV_COST_WEIGHT);
     } else {
       assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
       if (valid_mv0) {
         cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
         clamp_mv_in_range(x, &cur_mv[0], 0);
       }
-
-      // aomenc3
-      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
-          !valid_mv0) {
-        av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv,
-                                                     NULL, 0, rate_mv, 0);
-      } else {
-        const int_mv ref_mv = av1_get_ref_mv(x, 0);
-        *rate_mv =
-            av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                            x->mv_cost_stack, MV_COST_WEIGHT);
-      }
+      const int_mv ref_mv = av1_get_ref_mv(x, 0);
+      *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv,
+                                 x->mv_costs.nmv_joint_cost,
+                                 x->mv_costs.mv_cost_stack, MV_COST_WEIGHT);
     }
   } else {
     // Single ref case.
@@ -1138,7 +1122,7 @@
       MV prev_ref_mv[2] = { { 0 } };
       for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) {
         prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame,
-                                                     idx, x->mbmi_ext)
+                                                     idx, &x->mbmi_ext)
                                .as_mv;
         const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row),
                                        abs(ref_mv.col - prev_ref_mv[idx].col));
@@ -1177,29 +1161,6 @@
   return 0;
 }
 
-// If number of valid neighbours is 1,
-// 1) ROTZOOM parameters can be obtained reliably (2 parameters from
-// one neighbouring MV)
-// 2) For IDENTITY/TRANSLATION cases, warp can perform better due to
-// a different interpolation filter being used. However the quality
-// gains (due to the same) may not be much
-// For above 2 cases warp evaluation is skipped
-
-static int check_if_optimal_warp(const AV1_COMP *cpi,
-                                 WarpedMotionParams *wm_params,
-                                 int num_proj_ref) {
-  int is_valid_warp = 1;
-  if (cpi->sf.inter_sf.prune_warp_using_wmtype) {
-    TransformationType wmtype = get_wmtype(wm_params);
-    if (num_proj_ref == 1) {
-      if (wmtype != ROTZOOM) is_valid_warp = 0;
-    } else {
-      if (wmtype < ROTZOOM) is_valid_warp = 0;
-    }
-  }
-  return is_valid_warp;
-}
-
 static INLINE void update_mode_start_end_index(const AV1_COMP *const cpi,
                                                int *mode_index_start,
                                                int *mode_index_end,
@@ -1219,24 +1180,91 @@
   }
 }
 
-// TODO(afergs): Refactor the MBMI references in here - there's four
-// TODO(afergs): Refactor optional args - add them to a struct or remove
+/*!\brief AV1 motion mode search
+ *
+ * \ingroup inter_mode_search
+ * Function to search over and determine the motion mode. It will update
+ * mbmi->motion_mode to one of SIMPLE_TRANSLATION, OBMC_CAUSAL, or
+ * WARPED_CAUSAL and determine any necessary side information for the selected
+ * motion mode. It will also perform the full transform search, unless the
+ * input parameter do_tx_search indicates to do an estimation of the RD rather
+ * than an RD corresponding to a full transform search. It will return the
+ * RD for the final motion_mode.
+ * Do the RD search for a given inter mode and compute all information relevant
+ * to the input mode. It will compute the best MV,
+ * compound parameters (if the mode is a compound mode) and interpolation filter
+ * parameters.
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     tile_data         Pointer to struct holding adaptive
+ *                                  data/contexts/models for the tile during
+ *                                  encoding.
+ * \param[in]     x                 Pointer to struct holding all the data for
+ *                                  the current macroblock.
+ * \param[in]     bsize             Current block size.
+ * \param[in,out] rd_stats          Struct to keep track of the overall RD
+ *                                  information.
+ * \param[in,out] rd_stats_y        Struct to keep track of the RD information
+ *                                  for only the Y plane.
+ * \param[in,out] rd_stats_uv       Struct to keep track of the RD information
+ *                                  for only the UV planes.
+ * \param[in]     args              HandleInterModeArgs struct holding
+ *                                  miscellaneous arguments for inter mode
+ *                                  search. See the documentation for this
+ *                                  struct for a description of each member.
+ * \param[in]     ref_best_rd       Best RD found so far for this block.
+ *                                  It is used for early termination of this
+ *                                  search if the RD exceeds this value.
+ * \param[in,out] ref_skip_rd       A length 2 array, where skip_rd[0] is the
+ *                                  best total RD for a skip mode so far, and
+ *                                  skip_rd[1] is the best RD for a skip mode so
+ *                                  far in luma. This is used as a speed feature
+ *                                  to skip the transform search if the computed
+ *                                  skip RD for the current mode is not better
+ *                                  than the best skip_rd so far.
+ * \param[in,out] rate_mv           The rate associated with the motion vectors.
+ *                                  This will be modified if a motion search is
+ *                                  done in the motion mode search.
+ * \param[in,out] orig_dst          A prediction buffer to hold a computed
+ *                                  prediction. This will eventually hold the
+ *                                  final prediction, and the tmp_dst info will
+ *                                  be copied here.
+ * \param[in,out] best_est_rd       Estimated RD for motion mode search if
+ *                                  do_tx_search (see below) is 0.
+ * \param[in]     do_tx_search      Parameter to indicate whether or not to do
+ *                                  a full transform search. This will compute
+ *                                  an estimated RD for the modes without the
+ *                                  transform search and later perform the full
+ *                                  transform search on the best candidates.
+ * \param[in]     inter_modes_info  InterModesInfo struct to hold inter mode
+ *                                  information to perform a full transform
+ *                                  search only on winning candidates searched
+ *                                  with an estimate for transform coding RD.
+ * \param[in]     eval_motion_mode  Boolean whether or not to evaluate motion
+ *                                  motion modes other than SIMPLE_TRANSLATION.
+ * \param[out]    yrd               Stores the rdcost corresponding to encoding
+ *                                  the luma plane.
+ * \return Returns INT64_MAX if the determined motion mode is invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * motion mode search is a success.
+ */
 static int64_t motion_mode_rd(
     const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
     BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-    RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *const args,
-    int64_t ref_best_rd, int64_t *ref_skip_rd, int *rate_mv,
-    const BUFFER_SET *orig_dst, int64_t *best_est_rd, int do_tx_search,
-    InterModesInfo *inter_modes_info, int eval_motion_mode) {
+    RD_STATS *rd_stats_uv, HandleInterModeArgs *const args, int64_t ref_best_rd,
+    int64_t *ref_skip_rd, int *rate_mv, const BUFFER_SET *orig_dst,
+    int64_t *best_est_rd, int do_tx_search, InterModesInfo *inter_modes_info,
+    int eval_motion_mode, int64_t *yrd) {
   const AV1_COMMON *const cm = &cpi->common;
   const FeatureFlags *const features = &cm->features;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
   const int rate2_nocoeff = rd_stats->rate;
-  int best_xskip = 0, best_disable_skip = 0;
+  int best_xskip_txfm = 0;
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
@@ -1244,66 +1272,88 @@
   const int interintra_allowed = cm->seq_params.enable_interintra_compound &&
                                  is_interintra_allowed(mbmi) &&
                                  mbmi->compound_idx;
-  int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
+  WARP_SAMPLE_INFO *const warp_sample_info =
+      &x->warp_sample_info[mbmi->ref_frame[0]];
+  int *pts0 = warp_sample_info->pts;
+  int *pts_inref0 = warp_sample_info->pts_inref;
 
   assert(mbmi->ref_frame[1] != INTRA_FRAME);
   const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
-  (void)tile_data;
   av1_invalid_rd_stats(&best_rd_stats);
   aom_clear_system_state();
   mbmi->num_proj_ref = 1;  // assume num_proj_ref >=1
   MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+  *yrd = INT64_MAX;
   if (features->switchable_motion_mode) {
+    // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+    // is allowed.
     last_motion_mode_allowed = motion_mode_allowed(
         xd->global_motion, xd, mbmi, features->allow_warped_motion);
   }
 
   if (last_motion_mode_allowed == WARPED_CAUSAL) {
-    mbmi->num_proj_ref = av1_findSamples(cm, xd, pts0, pts_inref0);
+    // Collect projection samples used in least squares approximation of
+    // the warped motion parameters if WARPED_CAUSAL is going to be searched.
+    if (warp_sample_info->num < 0) {
+      warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+    }
+    mbmi->num_proj_ref = warp_sample_info->num;
   }
   const int total_samples = mbmi->num_proj_ref;
   if (total_samples == 0) {
+    // Do not search WARPED_CAUSAL if there are no samples to use to determine
+    // warped parameters.
     last_motion_mode_allowed = OBMC_CAUSAL;
   }
 
   const MB_MODE_INFO base_mbmi = *mbmi;
   MB_MODE_INFO best_mbmi;
-  SimpleRDState *const simple_states = &args->simple_rd_state[mbmi->ref_mv_idx];
   const int interp_filter = features->interp_filter;
   const int switchable_rate =
-      av1_is_interp_needed(xd) ? av1_get_switchable_rate(x, xd, interp_filter)
-                               : 0;
+      av1_is_interp_needed(xd)
+          ? av1_get_switchable_rate(x, xd, interp_filter,
+                                    cm->seq_params.enable_dual_filter)
+          : 0;
   int64_t best_rd = INT64_MAX;
   int best_rate_mv = rate_mv0;
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   int mode_index_start, mode_index_end;
+  // Modify the start and end index according to speed features. For example,
+  // if SIMPLE_TRANSLATION has already been searched according to
+  // the motion_mode_for_winner_cand speed feature, update the mode_index_start
+  // to avoid searching it again.
   update_mode_start_end_index(cpi, &mode_index_start, &mode_index_end,
                               last_motion_mode_allowed, interintra_allowed,
                               eval_motion_mode);
+  // Main function loop. This loops over all of the possible motion modes and
+  // computes RD to determine the best one. This process includes computing
+  // any necessary side information for the motion mode and performing the
+  // transform search.
   for (int mode_index = mode_index_start; mode_index <= mode_index_end;
        mode_index++) {
     if (args->skip_motion_mode && mode_index) continue;
-    if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
-        args->single_ref_first_pass && mode_index)
-      break;
     int tmp_rate2 = rate2_nocoeff;
     const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
     int tmp_rate_mv = rate_mv0;
 
     *mbmi = base_mbmi;
     if (is_interintra_mode) {
+      // Only use SIMPLE_TRANSLATION for interintra
       mbmi->motion_mode = SIMPLE_TRANSLATION;
     } else {
       mbmi->motion_mode = (MOTION_MODE)mode_index;
       assert(mbmi->ref_frame[1] != INTRA_FRAME);
     }
 
+    // Do not search OBMC if the probability of selecting it is below a
+    // predetermined threshold for this update_type and block size.
     const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
     const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] <
                            cpi->sf.inter_sf.prune_obmc_prob_thresh;
-    if ((cpi->oxcf.enable_obmc == 0 || cpi->sf.inter_sf.disable_obmc ||
-         cpi->sf.rt_sf.use_nonrd_pick_mode || prune_obmc) &&
+    if ((!cpi->oxcf.motion_mode_cfg.enable_obmc ||
+         cpi->sf.inter_sf.disable_obmc || cpi->sf.rt_sf.use_nonrd_pick_mode ||
+         prune_obmc) &&
         mbmi->motion_mode == OBMC_CAUSAL)
       continue;
 
@@ -1311,32 +1361,9 @@
       // SIMPLE_TRANSLATION mode: no need to recalculate.
       // The prediction is calculated before motion_mode_rd() is called in
       // handle_inter_mode()
-      if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
-          !is_comp_pred) {
-        if (args->single_ref_first_pass == 0) {
-          if (simple_states->early_skipped) {
-            assert(simple_states->rd_stats.rdcost == INT64_MAX);
-            return INT64_MAX;
-          }
-          if (simple_states->rd_stats.rdcost != INT64_MAX) {
-            best_rd = simple_states->rd_stats.rdcost;
-            best_rd_stats = simple_states->rd_stats;
-            best_rd_stats_y = simple_states->rd_stats_y;
-            best_rd_stats_uv = simple_states->rd_stats_uv;
-            memcpy(best_blk_skip, simple_states->blk_skip,
-                   sizeof(x->blk_skip[0]) * xd->height * xd->width);
-            av1_copy_array(best_tx_type_map, simple_states->tx_type_map,
-                           xd->height * xd->width);
-            best_xskip = simple_states->skip;
-            best_disable_skip = simple_states->disable_skip;
-            best_mbmi = *mbmi;
-          }
-          continue;
-        }
-        simple_states->early_skipped = 0;
-      }
     } else if (mbmi->motion_mode == OBMC_CAUSAL) {
       const uint32_t cur_mv = mbmi->mv[0].as_int;
+      // OBMC_CAUSAL not allowed for compound prediction
       assert(!is_comp_pred);
       if (have_newmv_in_inter_mode(this_mode)) {
         av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL,
@@ -1344,12 +1371,17 @@
         tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
       }
       if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) {
+        // Build the predictor according to the current motion vector if it has
+        // not already been built
         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
                                       0, av1_num_planes(cm) - 1);
       }
+      // Build the inter predictor by blending the predictor corresponding to
+      // this MV, and the neighboring blocks using the OBMC model
       av1_build_obmc_inter_prediction(
           cm, xd, args->above_pred_buf, args->above_pred_stride,
           args->left_pred_buf, args->left_pred_stride);
+#if !CONFIG_REALTIME_ONLY
     } else if (mbmi->motion_mode == WARPED_CAUSAL) {
       int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
       mbmi->motion_mode = WARPED_CAUSAL;
@@ -1365,21 +1397,18 @@
             &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize);
       }
 
+      // Compute the warped motion parameters with a least squares fit
+      //  using the collected samples
       if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
                                mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
                                &mbmi->wm_params, mi_row, mi_col)) {
-        // Refine MV for NEWMV mode
         assert(!is_comp_pred);
         if (have_newmv_in_inter_mode(this_mode)) {
+          // Refine MV for NEWMV mode
           const int_mv mv0 = mbmi->mv[0];
           const WarpedMotionParams wm_params0 = mbmi->wm_params;
           const int num_proj_ref0 = mbmi->num_proj_ref;
 
-          if (cpi->sf.inter_sf.prune_warp_using_wmtype) {
-            TransformationType wmtype = get_wmtype(&mbmi->wm_params);
-            if (wmtype < ROTZOOM) continue;
-          }
-
           const int_mv ref_mv = av1_get_ref_mv(x, 0);
           SUBPEL_MOTION_SEARCH_PARAMS ms_params;
           av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
@@ -1389,14 +1418,11 @@
           av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
                                total_samples);
 
-          // Keep the refined MV and WM parameters.
           if (mv0.as_int != mbmi->mv[0].as_int) {
-            tmp_rate_mv = av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv,
-                                          x->nmv_vec_cost, x->mv_cost_stack,
-                                          MV_COST_WEIGHT);
-            if (cpi->sf.mv_sf.adaptive_motion_search) {
-              x->pred_mv[mbmi->ref_frame[0]] = mbmi->mv[0].as_mv;
-            }
+            // Keep the refined MV and WM parameters.
+            tmp_rate_mv = av1_mv_bit_cost(
+                &mbmi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs.nmv_joint_cost,
+                x->mv_costs.mv_cost_stack, MV_COST_WEIGHT);
             tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
           } else {
             // Restore the old MV and WM parameters.
@@ -1404,16 +1430,15 @@
             mbmi->wm_params = wm_params0;
             mbmi->num_proj_ref = num_proj_ref0;
           }
-        } else {
-          if (!check_if_optimal_warp(cpi, &mbmi->wm_params, mbmi->num_proj_ref))
-            continue;
         }
 
+        // Build the warped predictor
         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
                                       av1_num_planes(cm) - 1);
       } else {
         continue;
       }
+#endif  // !CONFIG_REALTIME_ONLY
     } else if (is_interintra_mode) {
       const int ret =
           av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd,
@@ -1423,50 +1448,38 @@
 
     // If we are searching newmv and the mv is the same as refmv, skip the
     // current mode
-    if (this_mode == NEW_NEWMV) {
-      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
-      const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
-      if (mbmi->mv[0].as_int == ref_mv_0.as_int ||
-          mbmi->mv[1].as_int == ref_mv_1.as_int) {
-        continue;
-      }
-    } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
-      const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
-      if (mbmi->mv[1].as_int == ref_mv_1.as_int) {
-        continue;
-      }
-    } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
-      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
-      if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
-        continue;
-      }
-    } else if (this_mode == NEWMV) {
-      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
-      if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
-        continue;
-      }
-    }
+    if (!av1_check_newmv_joint_nonzero(cm, x)) continue;
 
-    x->force_skip = 0;
+    // Update rd_stats for the current motion mode
+    txfm_info->skip_txfm = 0;
     rd_stats->dist = 0;
     rd_stats->sse = 0;
-    rd_stats->skip = 1;
+    rd_stats->skip_txfm = 1;
     rd_stats->rate = tmp_rate2;
+    const ModeCosts *mode_costs = &x->mode_costs;
     if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
     if (interintra_allowed) {
-      rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
-                                          [mbmi->ref_frame[1] == INTRA_FRAME];
+      rd_stats->rate +=
+          mode_costs->interintra_cost[size_group_lookup[bsize]]
+                                     [mbmi->ref_frame[1] == INTRA_FRAME];
     }
     if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
         (mbmi->ref_frame[1] != INTRA_FRAME)) {
       if (last_motion_mode_allowed == WARPED_CAUSAL) {
-        rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode];
+        rd_stats->rate +=
+            mode_costs->motion_mode_cost[bsize][mbmi->motion_mode];
       } else {
-        rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
+        rd_stats->rate +=
+            mode_costs->motion_mode_cost1[bsize][mbmi->motion_mode];
       }
     }
 
+    int64_t this_yrd = INT64_MAX;
+
     if (!do_tx_search) {
+      // Avoid doing a transform search here to speed up the overall mode
+      // search. It will be done later in the mode search if the current
+      // motion mode seems promising.
       int64_t curr_sse = -1;
       int64_t sse_y = -1;
       int est_residue_cost = 0;
@@ -1474,9 +1487,6 @@
       int64_t est_rd = 0;
       if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
         curr_sse = get_sse(cpi, x, &sse_y);
-        // Scale luma SSE as per bit depth so as to be consistent with
-        // model_rd_sb_fn and compound type rd
-        sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2);
         const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
                                                  &est_residue_cost, &est_dist);
         (void)has_est_rd;
@@ -1517,17 +1527,15 @@
                               rd_stats->rdcost, rd_stats, rd_stats_y,
                               rd_stats_uv, mbmi);
       }
-      mbmi->skip = 0;
+      mbmi->skip_txfm = 0;
     } else {
+      // Perform full transform search
       int64_t skip_rd = INT64_MAX;
       int64_t skip_rdy = INT64_MAX;
       if (cpi->sf.inter_sf.txfm_rd_gate_level) {
         // Check if the mode is good enough based on skip RD
         int64_t sse_y = INT64_MAX;
         int64_t curr_sse = get_sse(cpi, x, &sse_y);
-        // Scale luma SSE as per bit depth so as to be consistent with
-        // model_rd_sb_fn and compound type rd
-        sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2);
         skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse);
         skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4));
         int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd,
@@ -1535,17 +1543,21 @@
         if (!eval_txfm) continue;
       }
 
+      // Do transform search
+      const int mode_rate = rd_stats->rate;
       if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
                            rd_stats->rate, ref_best_rd)) {
         if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
-          if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
-              !is_comp_pred) {
-            simple_states->early_skipped = 1;
-          }
           return INT64_MAX;
         }
         continue;
       }
+      const int skip_ctx = av1_get_skip_txfm_context(xd);
+      const int y_rate =
+          rd_stats->skip_txfm
+              ? x->mode_costs.skip_txfm_cost[skip_ctx][1]
+              : (rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx][0]);
+      this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y->dist);
 
       const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
       if (curr_rd < ref_best_rd) {
@@ -1553,13 +1565,11 @@
         ref_skip_rd[0] = skip_rd;
         ref_skip_rd[1] = skip_rdy;
       }
-      *disable_skip = 0;
       if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
-        const int skip_ctx = av1_get_skip_context(xd);
-        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
-                             rd_stats->dist,
-                             rd_stats_y->rate + rd_stats_uv->rate +
-                                 x->skip_cost[skip_ctx][mbmi->skip]);
+        inter_mode_data_push(
+            tile_data, mbmi->bsize, rd_stats->sse, rd_stats->dist,
+            rd_stats_y->rate + rd_stats_uv->rate +
+                mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
       }
     }
 
@@ -1573,39 +1583,26 @@
     const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
     if (mode_index == 0) {
       args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
-      if (!is_comp_pred) {
-        simple_states->rd_stats = *rd_stats;
-        simple_states->rd_stats.rdcost = tmp_rd;
-        simple_states->rd_stats_y = *rd_stats_y;
-        simple_states->rd_stats_uv = *rd_stats_uv;
-        memcpy(simple_states->blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * xd->height * xd->width);
-        av1_copy_array(simple_states->tx_type_map, xd->tx_type_map,
-                       xd->height * xd->width);
-        simple_states->skip = mbmi->skip;
-        simple_states->disable_skip = *disable_skip;
-      }
     }
     if (mode_index == 0 || tmp_rd < best_rd) {
+      // Update best_rd data if this is the best motion mode so far
       best_mbmi = *mbmi;
       best_rd = tmp_rd;
       best_rd_stats = *rd_stats;
       best_rd_stats_y = *rd_stats_y;
       best_rate_mv = tmp_rate_mv;
+      *yrd = this_yrd;
       if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
-      memcpy(best_blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * xd->height * xd->width);
+      memcpy(best_blk_skip, txfm_info->blk_skip,
+             sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
       av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
-      best_xskip = mbmi->skip;
-      best_disable_skip = *disable_skip;
-      // TODO(anyone): evaluate the quality and speed trade-off of the early
-      // termination logic below.
-      // if (best_xskip) break;
+      best_xskip_txfm = mbmi->skip_txfm;
     }
   }
+  // Update RD and mbmi stats for selected motion mode
   mbmi->ref_frame[1] = ref_frame_1;
   *rate_mv = best_rate_mv;
-  if (best_rd == INT64_MAX) {
+  if (best_rd == INT64_MAX || !av1_check_newmv_joint_nonzero(cm, x)) {
     av1_invalid_rd_stats(rd_stats);
     restore_dst_buf(xd, *orig_dst, num_planes);
     return INT64_MAX;
@@ -1614,11 +1611,10 @@
   *rd_stats = best_rd_stats;
   *rd_stats_y = best_rd_stats_y;
   if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
-  memcpy(x->blk_skip, best_blk_skip,
-         sizeof(x->blk_skip[0]) * xd->height * xd->width);
+  memcpy(txfm_info->blk_skip, best_blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
   av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
-  x->force_skip = best_xskip;
-  *disable_skip = best_disable_skip;
+  txfm_info->skip_txfm = best_xskip_txfm;
 
   restore_dst_buf(xd, *orig_dst, num_planes);
   return 0;
@@ -1651,7 +1647,7 @@
   }
   const int skip_mode_ctx = av1_get_skip_mode_context(xd);
   rd_stats->dist = rd_stats->sse = total_sse;
-  rd_stats->rate = x->skip_mode_cost[skip_mode_ctx][1];
+  rd_stats->rate = x->mode_costs.skip_mode_cost[skip_mode_ctx][1];
   rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
 
   restore_dst_buf(xd, *orig_dst, num_planes);
@@ -1747,15 +1743,15 @@
     int_mv this_mv;
     this_mv.as_int = INVALID_MV;
     ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
-                      skip_repeated_ref_mv, mbmi->ref_frame, x->mbmi_ext);
+                      skip_repeated_ref_mv, mbmi->ref_frame, &x->mbmi_ext);
     if (!ret) return 0;
     const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
     if (single_mode == NEWMV) {
       const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
       cur_mv[i] =
-          (i == 0) ? x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+          (i == 0) ? x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
                          .this_mv
-                   : x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+                   : x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
                          .comp_mv;
     } else {
       ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x);
@@ -1810,7 +1806,7 @@
 static int get_drl_refmv_count(const MACROBLOCK *const x,
                                const MV_REFERENCE_FRAME *ref_frame,
                                PREDICTION_MODE mode) {
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
   const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0;
   const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
@@ -1823,15 +1819,32 @@
   return ref_set;
 }
 
+// Checks if particular ref_mv_idx should be pruned.
+static int prune_ref_mv_idx_using_qindex(const int reduce_inter_modes,
+                                         const int qindex,
+                                         const int ref_mv_idx) {
+  if (reduce_inter_modes >= 3) return 1;
+  // Q-index logic based pruning is enabled only for
+  // reduce_inter_modes = 2.
+  assert(reduce_inter_modes == 2);
+  // When reduce_inter_modes=2, pruning happens as below based on q index.
+  // For q index range between 0 and 85: prune if ref_mv_idx >= 1.
+  // For q index range between 86 and 170: prune if ref_mv_idx == 2.
+  // For q index range between 171 and 255: no pruning.
+  const int min_prune_ref_mv_idx = (qindex * 3 / QINDEX_RANGE) + 1;
+  return (ref_mv_idx >= min_prune_ref_mv_idx);
+}
+
 // Whether this reference motion vector can be skipped, based on initial
 // heuristics.
-static bool ref_mv_idx_early_breakout(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                      const HandleInterModeArgs *const args,
-                                      int64_t ref_best_rd, int ref_mv_idx) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
+static bool ref_mv_idx_early_breakout(
+    const SPEED_FEATURES *const sf,
+    const RefFrameDistanceInfo *const ref_frame_dist_info, MACROBLOCK *x,
+    const HandleInterModeArgs *const args, int64_t ref_best_rd,
+    int ref_mv_idx) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   const int is_comp_pred = has_second_ref(mbmi);
   if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) {
@@ -1848,29 +1861,27 @@
     // TODO(any): Experiment with reduce_inter_modes for compound prediction
     if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred &&
         have_newmv_in_inter_mode(mbmi->mode)) {
-      if (mbmi->ref_frame[0] != cpi->nearest_past_ref &&
-          mbmi->ref_frame[0] != cpi->nearest_future_ref) {
+      if (mbmi->ref_frame[0] != ref_frame_dist_info->nearest_past_ref &&
+          mbmi->ref_frame[0] != ref_frame_dist_info->nearest_future_ref) {
         const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
-        if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
-            REF_CAT_LEVEL) {
+        const int do_prune = prune_ref_mv_idx_using_qindex(
+            sf->inter_sf.reduce_inter_modes, x->qindex, ref_mv_idx);
+        if (do_prune &&
+            (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+             REF_CAT_LEVEL)) {
           return true;
         }
       }
     }
   }
-  if (sf->inter_sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred &&
-      args->single_ref_first_pass == 0) {
-    if (args->simple_rd_state[ref_mv_idx].early_skipped) {
-      return true;
-    }
-  }
+
   mbmi->ref_mv_idx = ref_mv_idx;
   if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) {
     return true;
   }
   size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost;
-  const int drl_cost =
-      get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+  const int drl_cost = get_drl_cost(
+      mbmi, mbmi_ext, x->mode_costs.drl_mode_cost0, ref_frame_type);
   est_rd_rate += drl_cost;
   if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd &&
       mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
@@ -1886,10 +1897,11 @@
     int64_t ref_best_rd, BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   const AV1_COMMON *cm = &cpi->common;
   const int is_comp_pred = has_second_ref(mbmi);
+  const ModeCosts *mode_costs = &x->mode_costs;
 
   struct macroblockd_plane *p = xd->plane;
   const BUFFER_SET orig_dst = {
@@ -1913,7 +1925,7 @@
 
   rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
   const int drl_cost =
-      get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+      get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
   rd_stats->rate += drl_cost;
   mode_info[ref_mv_idx].drl_cost = drl_cost;
 
@@ -1925,7 +1937,7 @@
   for (int i = 0; i < is_comp_pred + 1; ++i) {
     mbmi->mv[i].as_int = cur_mv[i].as_int;
   }
-  const int ref_mv_cost = cost_mv_ref(x, mbmi->mode, mode_ctx);
+  const int ref_mv_cost = cost_mv_ref(mode_costs, mbmi->mode, mode_ctx);
   rd_stats->rate += ref_mv_cost;
 
   if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) {
@@ -1979,7 +1991,8 @@
   // Only search indices if they have some chance of being good.
   int good_indices = 0;
   for (int i = 0; i < ref_set; ++i) {
-    if (ref_mv_idx_early_breakout(cpi, x, args, ref_best_rd, i)) {
+    if (ref_mv_idx_early_breakout(&cpi->sf, &cpi->ref_frame_dist_info, x, args,
+                                  ref_best_rd, i)) {
       continue;
     }
     mask_set_bit(&good_indices, i);
@@ -2033,14 +2046,37 @@
   return result;
 }
 
+/*!\brief Motion mode information for inter mode search speedup.
+ *
+ * Used in a speed feature to search motion modes other than
+ * SIMPLE_TRANSLATION only on winning candidates.
+ */
 typedef struct motion_mode_candidate {
+  /*!
+   * Mode info for the motion mode candidate.
+   */
   MB_MODE_INFO mbmi;
+  /*!
+   * Rate describing the cost of the motion vectors for this candidate.
+   */
   int rate_mv;
+  /*!
+   * Rate before motion mode search and transform coding is applied.
+   */
   int rate2_nocoeff;
+  /*!
+   * An integer value 0 or 1 which indicates whether or not to skip the motion
+   * mode search and default to SIMPLE_TRANSLATION as a speed feature for this
+   * candidate.
+   */
   int skip_motion_mode;
+  /*!
+   * Total RD cost for this candidate.
+   */
   int64_t rd_cost;
 } motion_mode_candidate;
 
+/*!\cond */
 typedef struct motion_mode_best_st_candidate {
   motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES];
   int num_motion_mode_cand;
@@ -2067,7 +2103,7 @@
 
 static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols,
                                                   MACROBLOCKD *xd) {
-  if (!xd->up_available) return 0;
+  if (!xd->up_available) return 1;
   const int mi_col = xd->mi_col;
   MB_MODE_INFO **cur_mbmi = xd->mi;
   // prev_row_mi points into the mi array, starting at the beginning of the
@@ -2078,7 +2114,7 @@
   for (int above_mi_col = mi_col; above_mi_col < end_col;
        above_mi_col += mi_step) {
     MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
-    mi_step = mi_size_wide[above_mi[0]->sb_type];
+    mi_step = mi_size_wide[above_mi[0]->bsize];
     int match_found = 0;
     if (is_inter_block(*above_mi))
       match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi);
@@ -2089,7 +2125,7 @@
 
 static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows,
                                                  MACROBLOCKD *xd) {
-  if (!xd->left_available) return 0;
+  if (!xd->left_available) return 1;
   const int mi_row = xd->mi_row;
   MB_MODE_INFO **cur_mbmi = xd->mi;
   // prev_col_mi points into the mi array, starting at the top of the
@@ -2100,7 +2136,7 @@
   for (int left_mi_row = mi_row; left_mi_row < end_row;
        left_mi_row += mi_step) {
     MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
-    mi_step = mi_size_high[left_mi[0]->sb_type];
+    mi_step = mi_size_high[left_mi[0]->bsize];
     int match_found = 0;
     if (is_inter_block(*left_mi))
       match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi);
@@ -2108,9 +2144,19 @@
   }
   return 0;
 }
+/*!\endcond */
 
+/*! \brief Struct used to hold TPL data to
+ * narrow down parts of the inter mode search.
+ */
 typedef struct {
+  /*!
+   * The best inter cost out of all of the reference frames.
+   */
   int64_t best_inter_cost;
+  /*!
+   * The inter cost for each reference frame.
+   */
   int64_t ref_inter_cost[INTER_REFS_PER_FRAME];
 } PruneInfoFromTpl;
 
@@ -2126,7 +2172,7 @@
   const int tpl_idx = gf_group->index;
   TplParams *const tpl_data = &cpi->tpl_data;
   const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  if (tpl_idx >= MAX_LAG_BUFFERS || !tpl_frame->is_valid) {
+  if (tpl_idx >= MAX_TPL_FRAME_IDX || !tpl_frame->is_valid) {
     return;
   }
 
@@ -2141,10 +2187,13 @@
       coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
   const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
 
+  const int row_step = step;
+  const int col_step_sr =
+      coded_to_superres_mi(step, cm->superres_scale_denominator);
   for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows);
-       row += step) {
+       row += row_step) {
     for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr);
-         col += step) {
+         col += col_step_sr) {
       const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
           row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
 
@@ -2175,10 +2224,9 @@
     PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx,
     const PREDICTION_MODE this_mode, int prune_mode_level) {
   const int have_newmv = have_newmv_in_inter_mode(this_mode);
-  if ((prune_mode_level < 3) && have_newmv) return 0;
+  if ((prune_mode_level < 2) && have_newmv) return 0;
 
-  static const int prune_level_idx[3] = { 0, 1, 1 };
-  const int prune_level = prune_level_idx[prune_mode_level - 1];
+  const int prune_level = prune_mode_level - 1;
   int64_t cur_inter_cost;
 
   const int is_globalmv =
@@ -2190,8 +2238,8 @@
   // conservative pruning which is set based on ref_mv_idx and speed feature.
   // 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1 and 2. prune_index
   // 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV
-  static const int tpl_inter_mode_prune_mul_factor[2][MAX_REF_MV_SEARCH + 1] = {
-    { 3, 3, 3, 2 }, { 3, 2, 2, 2 }
+  static const int tpl_inter_mode_prune_mul_factor[3][MAX_REF_MV_SEARCH + 1] = {
+    { 6, 6, 6, 4 }, { 6, 4, 4, 4 }, { 5, 4, 4, 4 }
   };
 
   const int is_comp_pred = (refs[1] > INTRA_FRAME);
@@ -2213,25 +2261,328 @@
   if (cur_inter_cost >
       ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] *
         best_inter_cost) >>
-       1))
+       2))
     return 1;
   return 0;
 }
 
+// If the current mode being searched is NEWMV, this function will look
+// at previously searched MVs and check if they are the same
+// as the current MV. If it finds that this MV is repeated, it compares
+// the cost to the previous MV and skips the rest of the search if it is
+// more expensive.
+static int skip_repeated_newmv(
+    AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    const int do_tx_search, const PREDICTION_MODE this_mode,
+    MB_MODE_INFO *best_mbmi, motion_mode_candidate *motion_mode_cand,
+    int64_t *ref_best_rd, RD_STATS *best_rd_stats, RD_STATS *best_rd_stats_y,
+    RD_STATS *best_rd_stats_uv, inter_mode_info *mode_info,
+    HandleInterModeArgs *args, int drl_cost, const int *refs, int_mv *cur_mv,
+    int64_t *best_rd, const BUFFER_SET orig_dst, int ref_mv_idx) {
+  // This feature only works for NEWMV when a previous mv has been searched
+  if (this_mode != NEWMV || ref_mv_idx == 0) return 0;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  int skip = 0;
+  int this_rate_mv = 0;
+  int i;
+  for (i = 0; i < ref_mv_idx; ++i) {
+    // Check if the motion search result same as previous results
+    if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int &&
+        args->single_newmv_valid[i][refs[0]]) {
+      // If the compared mode has no valid rd, it is unlikely this
+      // mode will be the best mode
+      if (mode_info[i].rd == INT64_MAX) {
+        skip = 1;
+        break;
+      }
+      // Compare the cost difference including drl cost and mv cost
+      if (mode_info[i].mv.as_int != INVALID_MV) {
+        const int compare_cost = mode_info[i].rate_mv + mode_info[i].drl_cost;
+        const int_mv ref_mv = av1_get_ref_mv(x, 0);
+        this_rate_mv = av1_mv_bit_cost(
+            &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->mv_costs.nmv_joint_cost,
+            x->mv_costs.mv_cost_stack, MV_COST_WEIGHT);
+        const int this_cost = this_rate_mv + drl_cost;
+
+        if (compare_cost <= this_cost) {
+          // Skip this mode if it is more expensive as the previous result
+          // for this MV
+          skip = 1;
+          break;
+        } else {
+          // If the cost is less than current best result, make this
+          // the best and update corresponding variables unless the
+          // best_mv is the same as ref_mv. In this case we skip and
+          // rely on NEAR(EST)MV instead
+          if (best_mbmi->ref_mv_idx == i &&
+              best_mbmi->mv[0].as_int != ref_mv.as_int) {
+            assert(*best_rd != INT64_MAX);
+            assert(best_mbmi->mv[0].as_int == mode_info[i].mv.as_int);
+            best_mbmi->ref_mv_idx = ref_mv_idx;
+            motion_mode_cand->rate_mv = this_rate_mv;
+            best_rd_stats->rate += this_cost - compare_cost;
+            *best_rd =
+                RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist);
+            // We also need to update mode_info here because we are setting
+            // (ref_)best_rd here. So we will not be able to search the same
+            // mode again with the current configuration.
+            mode_info[ref_mv_idx].mv.as_int = best_mbmi->mv[0].as_int;
+            mode_info[ref_mv_idx].rate_mv = this_rate_mv;
+            mode_info[ref_mv_idx].rd = *best_rd;
+            if (*best_rd < *ref_best_rd) *ref_best_rd = *best_rd;
+            break;
+          }
+        }
+      }
+    }
+  }
+  if (skip) {
+    const THR_MODES mode_enum = get_prediction_mode_idx(
+        best_mbmi->mode, best_mbmi->ref_frame[0], best_mbmi->ref_frame[1]);
+    // Collect mode stats for multiwinner mode processing
+    store_winner_mode_stats(
+        &cpi->common, x, best_mbmi, best_rd_stats, best_rd_stats_y,
+        best_rd_stats_uv, mode_enum, NULL, bsize, *best_rd,
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search);
+    args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
+        args->modelled_rd[this_mode][i][refs[0]];
+    args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
+        args->simple_rd[this_mode][i][refs[0]];
+    mode_info[ref_mv_idx].rd = mode_info[i].rd;
+    mode_info[ref_mv_idx].rate_mv = this_rate_mv;
+    mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
+
+    restore_dst_buf(xd, orig_dst, num_planes);
+    return 1;
+  }
+  return 0;
+}
+
+/*!\brief High level function to select parameters for compound mode.
+ *
+ * \ingroup inter_mode_search
+ * The main search functionality is done in the call to av1_compound_type_rd().
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     x                 Pointer to struct holding all the data for
+ *                                  the current macroblock.
+ * \param[in]     args              HandleInterModeArgs struct holding
+ *                                  miscellaneous arguments for inter mode
+ *                                  search. See the documentation for this
+ *                                  struct for a description of each member.
+ * \param[in]     ref_best_rd       Best RD found so far for this block.
+ *                                  It is used for early termination of this
+ *                                  search if the RD exceeds this value.
+ * \param[in,out] cur_mv            Current motion vector.
+ * \param[in]     bsize             Current block size.
+ * \param[in,out] compmode_interinter_cost  RD of the selected interinter
+                                    compound mode.
+ * \param[in,out] rd_buffers        CompoundTypeRdBuffers struct to hold all
+ *                                  allocated buffers for the compound
+ *                                  predictors and masks in the compound type
+ *                                  search.
+ * \param[in,out] orig_dst          A prediction buffer to hold a computed
+ *                                  prediction. This will eventually hold the
+ *                                  final prediction, and the tmp_dst info will
+ *                                  be copied here.
+ * \param[in]     tmp_dst           A temporary prediction buffer to hold a
+ *                                  computed prediction.
+ * \param[in,out] rate_mv           The rate associated with the motion vectors.
+ *                                  This will be modified if a motion search is
+ *                                  done in the motion mode search.
+ * \param[in,out] rd_stats          Struct to keep track of the overall RD
+ *                                  information.
+ * \param[in,out] skip_rd           An array of length 2 where skip_rd[0] is the
+ *                                  best total RD for a skip mode so far, and
+ *                                  skip_rd[1] is the best RD for a skip mode so
+ *                                  far in luma. This is used as a speed feature
+ *                                  to skip the transform search if the computed
+ *                                  skip RD for the current mode is not better
+ *                                  than the best skip_rd so far.
+ * \param[in,out] skip_build_pred   Indicates whether or not to build the inter
+ *                                  predictor. If this is 0, the inter predictor
+ *                                  has already been built and thus we can avoid
+ *                                  repeating computation.
+ * \return Returns 1 if this mode is worse than one already seen and 0 if it is
+ * a viable candidate.
+ */
+static int process_compound_inter_mode(
+    AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+    int64_t ref_best_rd, int_mv *cur_mv, BLOCK_SIZE bsize,
+    int *compmode_interinter_cost, const CompoundTypeRdBuffers *rd_buffers,
+    const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, int *rate_mv,
+    RD_STATS *rd_stats, int64_t *skip_rd, int *skip_build_pred) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const AV1_COMMON *cm = &cpi->common;
+  const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                   cm->seq_params.enable_masked_compound;
+  int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+                         (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+
+  const int num_planes = av1_num_planes(cm);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  // Find matching interp filter or set to default interp filter
+  const int need_search = av1_is_interp_needed(xd);
+  const InterpFilter assign_filter = cm->features.interp_filter;
+  int is_luma_interp_done = 0;
+  av1_find_interp_filter_match(mbmi, cpi, assign_filter, need_search,
+                               args->interp_filter_stats,
+                               args->interp_filter_stats_idx);
+
+  int64_t best_rd_compound;
+  int64_t rd_thresh;
+  const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
+  const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
+  rd_thresh = get_rd_thresh_from_best_rd(ref_best_rd, (1 << comp_type_rd_shift),
+                                         comp_type_rd_scale);
+  // Select compound type and any parameters related to that type
+  // (for example, the mask parameters if it is a masked mode) and compute
+  // the RD
+  *compmode_interinter_cost = av1_compound_type_rd(
+      cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used,
+      orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats,
+      ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
+  if (ref_best_rd < INT64_MAX &&
+      (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
+          ref_best_rd) {
+    restore_dst_buf(xd, *orig_dst, num_planes);
+    return 1;
+  }
+
+  // Build only uv predictor for COMPOUND_AVERAGE.
+  // Note there is no need to call av1_enc_build_inter_predictor
+  // for luma if COMPOUND_AVERAGE is selected because it is the first
+  // candidate in av1_compound_type_rd, which means it used the dst_buf
+  // rather than the tmp_buf.
+  if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && is_luma_interp_done) {
+    if (num_planes > 1) {
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_U, num_planes - 1);
+    }
+    *skip_build_pred = 1;
+  }
+  return 0;
+}
+
+// Speed feature to prune out MVs that are similar to previous MVs if they
+// don't achieve the best RD advantage.
+static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx,
+                                   int_mv save_mv[MAX_REF_MV_SEARCH - 1][2],
+                                   MB_MODE_INFO *mbmi, int pruning_factor) {
+  int i;
+  const int is_comp_pred = has_second_ref(mbmi);
+  const int thr = (1 + is_comp_pred) << (pruning_factor + 1);
+
+  // Skip the evaluation if an MV match is found.
+  if (ref_mv_idx > 0) {
+    for (int idx = 0; idx < ref_mv_idx; ++idx) {
+      if (save_mv[idx][0].as_int == INVALID_MV) continue;
+
+      int mv_diff = 0;
+      for (i = 0; i < 1 + is_comp_pred; ++i) {
+        mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) +
+                   abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col);
+      }
+
+      // If this mode is not the best one, and current MV is similar to
+      // previous stored MV, terminate this ref_mv_idx evaluation.
+      if (best_ref_mv_idx == -1 && mv_diff <= thr) return 1;
+    }
+  }
+
+  if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
+    for (i = 0; i < is_comp_pred + 1; ++i)
+      save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int;
+  }
+
+  return 0;
+}
+
+/*!\brief AV1 inter mode RD computation
+ *
+ * \ingroup inter_mode_search
+ * Do the RD search for a given inter mode and compute all information relevant
+ * to the input mode. It will compute the best MV,
+ * compound parameters (if the mode is a compound mode) and interpolation filter
+ * parameters.
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     tile_data         Pointer to struct holding adaptive
+ *                                  data/contexts/models for the tile during
+ *                                  encoding.
+ * \param[in]     x                 Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]     bsize             Current block size.
+ * \param[in,out] rd_stats          Struct to keep track of the overall RD
+ *                                  information.
+ * \param[in,out] rd_stats_y        Struct to keep track of the RD information
+ *                                  for only the Y plane.
+ * \param[in,out] rd_stats_uv       Struct to keep track of the RD information
+ *                                  for only the UV planes.
+ * \param[in]     args              HandleInterModeArgs struct holding
+ *                                  miscellaneous arguments for inter mode
+ *                                  search. See the documentation for this
+ *                                  struct for a description of each member.
+ * \param[in]     ref_best_rd       Best RD found so far for this block.
+ *                                  It is used for early termination of this
+ *                                  search if the RD exceeds this value.
+ * \param[in]     tmp_buf           Temporary buffer used to hold predictors
+ *                                  built in this search.
+ * \param[in,out] rd_buffers        CompoundTypeRdBuffers struct to hold all
+ *                                  allocated buffers for the compound
+ *                                  predictors and masks in the compound type
+ *                                  search.
+ * \param[in,out] best_est_rd       Estimated RD for motion mode search if
+ *                                  do_tx_search (see below) is 0.
+ * \param[in]     do_tx_search      Parameter to indicate whether or not to do
+ *                                  a full transform search. This will compute
+ *                                  an estimated RD for the modes without the
+ *                                  transform search and later perform the full
+ *                                  transform search on the best candidates.
+ * \param[in,out] inter_modes_info  InterModesInfo struct to hold inter mode
+ *                                  information to perform a full transform
+ *                                  search only on winning candidates searched
+ *                                  with an estimate for transform coding RD.
+ * \param[in,out] motion_mode_cand  A motion_mode_candidate struct to store
+ *                                  motion mode information used in a speed
+ *                                  feature to search motion modes other than
+ *                                  SIMPLE_TRANSLATION only on winning
+ *                                  candidates.
+ * \param[in,out] skip_rd           A length 2 array, where skip_rd[0] is the
+ *                                  best total RD for a skip mode so far, and
+ *                                  skip_rd[1] is the best RD for a skip mode so
+ *                                  far in luma. This is used as a speed feature
+ *                                  to skip the transform search if the computed
+ *                                  skip RD for the current mode is not better
+ *                                  than the best skip_rd so far.
+ * \param[in]     inter_cost_info_from_tpl A PruneInfoFromTpl struct used to
+ *                                         narrow down the search based on data
+ *                                         collected in the TPL model.
+ * \param[out]    yrd               Stores the rdcost corresponding to encoding
+ *                                  the luma plane.
+ *
+ * \return The RD cost for the mode being searched.
+ */
 static int64_t handle_inter_mode(
     AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
     BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-    RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *args,
-    int64_t ref_best_rd, uint8_t *const tmp_buf,
-    const CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd,
-    const int do_tx_search, InterModesInfo *inter_modes_info,
-    motion_mode_candidate *motion_mode_cand, int64_t *skip_rd,
-    PruneInfoFromTpl *inter_cost_info_from_tpl) {
+    RD_STATS *rd_stats_uv, HandleInterModeArgs *args, int64_t ref_best_rd,
+    uint8_t *const tmp_buf, const CompoundTypeRdBuffers *rd_buffers,
+    int64_t *best_est_rd, const int do_tx_search,
+    InterModesInfo *inter_modes_info, motion_mode_candidate *motion_mode_cand,
+    int64_t *skip_rd, PruneInfoFromTpl *inter_cost_info_from_tpl,
+    int64_t *yrd) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
 
@@ -2240,13 +2591,14 @@
   TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
   const int prune_modes_based_on_tpl =
       cpi->sf.inter_sf.prune_inter_modes_based_on_tpl &&
-      tpl_idx >= MAX_LAG_BUFFERS && tpl_frame->is_valid;
+      tpl_idx < MAX_TPL_FRAME_IDX && tpl_frame->is_valid;
   int i;
+  // Reference frames for this mode
   const int refs[2] = { mbmi->ref_frame[0],
                         (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int rate_mv = 0;
   int64_t rd = INT64_MAX;
-  // do first prediction into the destination buffer. Do the next
+  // Do first prediction into the destination buffer. Do the next
   // prediction into a temporary buffer. Then keep track of which one
   // of these currently holds the best predictor, and use the other
   // one for future predictions. In the end, copy from tmp_buf to
@@ -2260,23 +2612,18 @@
                                  tmp_buf + 2 * MAX_SB_SQUARE },
                                { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
 
-  const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                   cm->seq_params.enable_masked_compound;
   int64_t ret_val = INT64_MAX;
   const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   int64_t best_rd = INT64_MAX;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  int64_t best_yrd = INT64_MAX;
   MB_MODE_INFO best_mbmi = *mbmi;
-  int best_disable_skip = 0;
-  int best_xskip = 0;
+  int best_xskip_txfm = 0;
   int64_t newmv_ret_val = INT64_MAX;
   inter_mode_info mode_info[MAX_REF_MV_SEARCH];
 
-  int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
-                         (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
-
   // Do not prune the mode based on inter cost from tpl if the current ref frame
   // is the winner ref in neighbouring blocks.
   int ref_match_found_in_above_nb = 0;
@@ -2289,18 +2636,35 @@
   }
 
   // First, perform a simple translation search for each of the indices. If
-  // an index performs well, it will be fully searched here.
+  // an index performs well, it will be fully searched in the main loop
+  // of this function.
   const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
   // Save MV results from first 2 ref_mv_idx.
-  int_mv save_mv[MAX_REF_MV_SEARCH - 1][2] = { { { 0 } } };
+  int_mv save_mv[MAX_REF_MV_SEARCH - 1][2];
   int best_ref_mv_idx = -1;
   const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd,
                                             mode_info, bsize, ref_set);
   const int16_t mode_ctx =
       av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
-  const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int ref_mv_cost = cost_mv_ref(mode_costs, this_mode, mode_ctx);
   const int base_rate =
       args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
+
+  for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
+    save_mv[i][0].as_int = INVALID_MV;
+    save_mv[i][1].as_int = INVALID_MV;
+  }
+
+  // Main loop of this function. This will  iterate over all of the ref mvs
+  // in the dynamic reference list and do the following:
+  //    1.) Get the current MV. Create newmv MV if necessary
+  //    2.) Search compound type and parameters if applicable
+  //    3.) Do interpolation filter search
+  //    4.) Build the inter predictor
+  //    5.) Pick the motion mode (SIMPLE_TRANSLATION, OBMC_CAUSAL,
+  //        WARPED_CAUSAL)
+  //    6.) Update stats if best so far
   for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
     mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV;
     mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
@@ -2312,6 +2676,7 @@
     }
     if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb &&
         !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) {
+      // Skip mode if TPL model indicates it will not be beneficial.
       if (prune_modes_based_on_tpl_stats(
               inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode,
               cpi->sf.inter_sf.prune_inter_modes_based_on_tpl))
@@ -2319,6 +2684,7 @@
     }
     av1_init_rd_stats(rd_stats);
 
+    // Initialize compound mode data
     mbmi->interinter_comp.type = COMPOUND_AVERAGE;
     mbmi->comp_group_idx = 0;
     mbmi->compound_idx = 1;
@@ -2328,9 +2694,10 @@
     mbmi->motion_mode = SIMPLE_TRANSLATION;
     mbmi->ref_mv_idx = ref_mv_idx;
 
+    // Compute cost for signalling this DRL index
     rd_stats->rate = base_rate;
-    const int drl_cost =
-        get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+    const int drl_cost = get_drl_cost(
+        mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
     rd_stats->rate += drl_cost;
     mode_info[ref_mv_idx].drl_cost = drl_cost;
 
@@ -2342,24 +2709,19 @@
     // TODO(Cherma): Extend this speed feature to support compound mode
     int skip_repeated_ref_mv =
         is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv;
+    // Generate the current mv according to the prediction mode
     if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) {
       continue;
     }
 
+    // The above call to build_cur_mv does not handle NEWMV modes. Build
+    // the mv here if we have NEWMV for any predictors.
     if (have_newmv_in_inter_mode(this_mode)) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
       start_timing(cpi, handle_newmv_time);
 #endif
-      if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
-          args->single_ref_first_pass == 0 && !is_comp_pred) {
-        const int ref0 = mbmi->ref_frame[0];
-        newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 0 : 1;
-        cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
-        rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
-      } else {
-        newmv_ret_val =
-            handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
-      }
+      newmv_ret_val =
+          handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
 #if CONFIG_COLLECT_COMPONENT_TIMING
       end_timing(cpi, handle_newmv_time);
 #endif
@@ -2368,76 +2730,17 @@
 
       rd_stats->rate += rate_mv;
 
-      if (cpi->sf.inter_sf.skip_repeated_newmv) {
-        if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
-          int skip = 0;
-          int this_rate_mv = 0;
-          for (i = 0; i < ref_mv_idx; ++i) {
-            // Check if the motion search result same as previous results
-            if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int &&
-                args->single_newmv_valid[i][refs[0]]) {
-              // If the compared mode has no valid rd, it is unlikely this
-              // mode will be the best mode
-              if (mode_info[i].rd == INT64_MAX) {
-                skip = 1;
-                break;
-              }
-              // Compare the cost difference including drl cost and mv cost
-              if (mode_info[i].mv.as_int != INVALID_MV) {
-                const int compare_cost =
-                    mode_info[i].rate_mv + mode_info[i].drl_cost;
-                const int_mv ref_mv = av1_get_ref_mv(x, 0);
-                this_rate_mv = av1_mv_bit_cost(
-                    &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                    x->mv_cost_stack, MV_COST_WEIGHT);
-                const int this_cost = this_rate_mv + drl_cost;
-
-                if (compare_cost <= this_cost) {
-                  skip = 1;
-                  break;
-                } else {
-                  // If the cost is less than current best result, make this
-                  // the best and update corresponding variables unless the
-                  // best_mv is the same as ref_mv. In this case we skip and
-                  // rely on NEAR(EST)MV instead
-                  if (best_mbmi.ref_mv_idx == i &&
-                      mode_info[i].mv.as_int != ref_mv.as_int) {
-                    assert(best_rd != INT64_MAX);
-                    best_mbmi.ref_mv_idx = ref_mv_idx;
-                    motion_mode_cand->rate_mv = this_rate_mv;
-                    best_rd_stats.rate += this_cost - compare_cost;
-                    best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
-                                     best_rd_stats.dist);
-                    if (best_rd < ref_best_rd) ref_best_rd = best_rd;
-                    break;
-                  }
-                }
-              }
-            }
-          }
-          if (skip) {
-            const THR_MODES mode_enum = get_prediction_mode_idx(
-                best_mbmi.mode, best_mbmi.ref_frame[0], best_mbmi.ref_frame[1]);
-            // Collect mode stats for multiwinner mode processing
-            store_winner_mode_stats(
-                &cpi->common, x, &best_mbmi, &best_rd_stats, &best_rd_stats_y,
-                &best_rd_stats_uv, mode_enum, NULL, bsize, best_rd,
-                cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-                do_tx_search);
-            args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
-                args->modelled_rd[this_mode][i][refs[0]];
-            args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
-                args->simple_rd[this_mode][i][refs[0]];
-            mode_info[ref_mv_idx].rd = mode_info[i].rd;
-            mode_info[ref_mv_idx].rate_mv = this_rate_mv;
-            mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
-
-            restore_dst_buf(xd, orig_dst, num_planes);
-            continue;
-          }
-        }
-      }
+      // skip NEWMV mode in drl if the motion search result is the same
+      // as a previous result
+      if (cpi->sf.inter_sf.skip_repeated_newmv &&
+          skip_repeated_newmv(cpi, x, bsize, do_tx_search, this_mode,
+                              &best_mbmi, motion_mode_cand, &ref_best_rd,
+                              &best_rd_stats, &best_rd_stats_y,
+                              &best_rd_stats_uv, mode_info, args, drl_cost,
+                              refs, cur_mv, &best_rd, orig_dst, ref_mv_idx))
+        continue;
     }
+    // Copy the motion vector for this mode into mbmi struct
     for (i = 0; i < is_comp_pred + 1; ++i) {
       mbmi->mv[i].as_int = cur_mv[i].as_int;
     }
@@ -2447,33 +2750,12 @@
       continue;
     }
 
-    if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred) {
-      // TODO(yunqing): Move this part to a separate function when it is done.
-      // Store MV result.
-      if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
-        for (i = 0; i < is_comp_pred + 1; ++i)
-          save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int;
-      }
-      // Skip the evaluation if an MV match is found.
-      if (ref_mv_idx > 0) {
-        int match = 0;
-        for (int idx = 0; idx < ref_mv_idx; ++idx) {
-          int mv_diff = 0;
-          for (i = 0; i < 1 + is_comp_pred; ++i) {
-            mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) +
-                       abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col);
-          }
-
-          // If this mode is not the best one, and current MV is similar to
-          // previous stored MV, terminate this ref_mv_idx evaluation.
-          if (best_ref_mv_idx == -1 && mv_diff < 1) {
-            match = 1;
-            break;
-          }
-        }
-        if (match == 1) continue;
-      }
-    }
+    // Skip the rest of the search if prune_ref_mv_idx_search speed feature
+    // is enabled, and the current MV is similar to a previous one.
+    if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred &&
+        prune_ref_mv_idx_search(ref_mv_idx, best_ref_mv_idx, save_mv, mbmi,
+                                cpi->sf.inter_sf.prune_ref_mv_idx_search))
+      continue;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, compound_type_rd_time);
@@ -2481,44 +2763,15 @@
     int skip_build_pred = 0;
     const int mi_row = xd->mi_row;
     const int mi_col = xd->mi_col;
+
+    // Handle a compound predictor, continue if it is determined this
+    // cannot be the best compound mode
     if (is_comp_pred) {
-      // Find matching interp filter or set to default interp filter
-      const int need_search = av1_is_interp_needed(xd);
-      const InterpFilter assign_filter = cm->features.interp_filter;
-      int is_luma_interp_done = 0;
-      av1_find_interp_filter_match(mbmi, cpi, assign_filter, need_search,
-                                   args->interp_filter_stats,
-                                   args->interp_filter_stats_idx);
-
-      int64_t best_rd_compound;
-      int64_t rd_thresh;
-      const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
-      const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
-      rd_thresh = get_rd_thresh_from_best_rd(
-          ref_best_rd, (1 << comp_type_rd_shift), comp_type_rd_scale);
-      compmode_interinter_cost = av1_compound_type_rd(
-          cpi, x, bsize, cur_mv, mode_search_mask, masked_compound_used,
-          &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
-          rd_stats, ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
-      if (ref_best_rd < INT64_MAX &&
-          (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
-              ref_best_rd) {
-        restore_dst_buf(xd, orig_dst, num_planes);
-        continue;
-      }
-      // No need to call av1_enc_build_inter_predictor for luma if
-      // COMPOUND_AVERAGE is selected because it is the first
-      // candidate in av1_compound_type_rd, and the following
-      // compound types searching uses tmp_dst buffer
-
-      if (mbmi->interinter_comp.type == COMPOUND_AVERAGE &&
-          is_luma_interp_done) {
-        if (num_planes > 1) {
-          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
-                                        bsize, AOM_PLANE_U, num_planes - 1);
-        }
-        skip_build_pred = 1;
-      }
+      const int not_best_mode = process_compound_inter_mode(
+          cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost,
+          rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd,
+          &skip_build_pred);
+      if (not_best_mode) continue;
     }
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -2528,6 +2781,7 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, interpolation_filter_search_time);
 #endif
+    // Determine the interpolation filter for this mode
     ret_val = av1_interpolation_filter_search(
         x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
         &skip_build_pred, args, ref_best_rd);
@@ -2546,6 +2800,7 @@
       continue;
     }
 
+    // Compute modelled RD if enabled
     if (args->modelled_rd != NULL) {
       if (is_comp_pred) {
         const int mode0 = compound_ref0_mode(this_mode);
@@ -2561,6 +2816,7 @@
     }
     rd_stats->rate += compmode_interinter_cost;
     if (skip_build_pred != 1) {
+      // Build this inter predictor if it has not been previously built
       av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0,
                                     av1_num_planes(cm) - 1);
     }
@@ -2569,35 +2825,44 @@
     start_timing(cpi, motion_mode_rd_time);
 #endif
     int rate2_nocoeff = rd_stats->rate;
+    // Determine the motion mode. This will be one of SIMPLE_TRANSLATION,
+    // OBMC_CAUSAL or WARPED_CAUSAL
+    int64_t this_yrd;
     ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
-                             rd_stats_uv, disable_skip, args, ref_best_rd,
-                             skip_rd, &rate_mv, &orig_dst, best_est_rd,
-                             do_tx_search, inter_modes_info, 0);
+                             rd_stats_uv, args, ref_best_rd, skip_rd, &rate_mv,
+                             &orig_dst, best_est_rd, do_tx_search,
+                             inter_modes_info, 0, &this_yrd);
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, motion_mode_rd_time);
 #endif
+    assert(
+        IMPLIES(!av1_check_newmv_joint_nonzero(cm, x), ret_val == INT64_MAX));
 
-    mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
-    mode_info[ref_mv_idx].rate_mv = rate_mv;
     if (ret_val != INT64_MAX) {
       int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-      mode_info[ref_mv_idx].rd = tmp_rd;
+      if (tmp_rd < mode_info[ref_mv_idx].rd) {
+        // Only update mode_info if the new result is actually better.
+        mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
+        mode_info[ref_mv_idx].rate_mv = rate_mv;
+        mode_info[ref_mv_idx].rd = tmp_rd;
+      }
       const THR_MODES mode_enum = get_prediction_mode_idx(
           mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
       // Collect mode stats for multiwinner mode processing
-      store_winner_mode_stats(
-          &cpi->common, x, mbmi, rd_stats, rd_stats_y, rd_stats_uv, mode_enum,
-          NULL, bsize, tmp_rd,
-          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, do_tx_search);
+      store_winner_mode_stats(&cpi->common, x, mbmi, rd_stats, rd_stats_y,
+                              rd_stats_uv, mode_enum, NULL, bsize, tmp_rd,
+                              cpi->sf.winner_mode_sf.multi_winner_mode_type,
+                              do_tx_search);
       if (tmp_rd < best_rd) {
+        best_yrd = this_yrd;
+        // Update the best rd stats if we found the best mode so far
         best_rd_stats = *rd_stats;
         best_rd_stats_y = *rd_stats_y;
         best_rd_stats_uv = *rd_stats_uv;
         best_rd = tmp_rd;
         best_mbmi = *mbmi;
-        best_disable_skip = *disable_skip;
-        best_xskip = x->force_skip;
-        memcpy(best_blk_skip, x->blk_skip,
+        best_xskip_txfm = txfm_info->skip_txfm;
+        memcpy(best_blk_skip, txfm_info->blk_skip,
                sizeof(best_blk_skip[0]) * xd->height * xd->width);
         av1_copy_array(best_tx_type_map, xd->tx_type_map,
                        xd->height * xd->width);
@@ -2619,12 +2884,12 @@
   *rd_stats = best_rd_stats;
   *rd_stats_y = best_rd_stats_y;
   *rd_stats_uv = best_rd_stats_uv;
+  *yrd = best_yrd;
   *mbmi = best_mbmi;
-  *disable_skip = best_disable_skip;
-  x->force_skip = best_xskip;
+  txfm_info->skip_txfm = best_xskip_txfm;
   assert(IMPLIES(mbmi->comp_group_idx == 1,
                  mbmi->interinter_comp.type != COMPOUND_AVERAGE));
-  memcpy(x->blk_skip, best_blk_skip,
+  memcpy(txfm_info->blk_skip, best_blk_skip,
          sizeof(best_blk_skip[0]) * xd->height * xd->width);
   av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
 
@@ -2633,17 +2898,29 @@
   return rd_stats->rdcost;
 }
 
+/*!\brief Search for the best intrabc predictor
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function performs a motion search to find the best intrabc predictor.
+ *
+ * \returns Returns the best overall rdcost (including the non-intrabc modes
+ * search before this function).
+ */
 static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
                                        PICK_MODE_CONTEXT *ctx,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                        int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
-  if (!av1_allow_intrabc(cm) || !cpi->oxcf.enable_intrabc) return INT64_MAX;
+  if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc)
+    return INT64_MAX;
   const int num_planes = av1_num_planes(cm);
 
   MACROBLOCKD *const xd = &x->e_mbd;
   const TileInfo *tile = &xd->tile;
   MB_MODE_INFO *mbmi = xd->mi[0];
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   const int w = block_size_wide[bsize];
@@ -2651,7 +2928,7 @@
   const int sb_row = mi_row >> cm->seq_params.mib_size_log2;
   const int sb_col = mi_col >> cm->seq_params.mib_size_log2;
 
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
   av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
                    xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
@@ -2699,9 +2976,10 @@
 
   FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
   const search_site_config *lookahead_search_sites =
-      &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
+      cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
   av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
-                                     &dv_ref.as_mv, lookahead_search_sites);
+                                     &dv_ref.as_mv, lookahead_search_sites,
+                                     /*fine_search_interval=*/0);
   fullms_params.is_intra_mode = 1;
 
   for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
@@ -2779,7 +3057,7 @@
     mbmi->motion_mode = SIMPLE_TRANSLATION;
     mbmi->mv[0].as_mv = dv;
     mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
-    mbmi->skip = 0;
+    mbmi->skip_txfm = 0;
     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
                                   av1_num_planes(cm) - 1);
 
@@ -2790,7 +3068,7 @@
     // in MV_COST_WEIGHT is too large. Explore other values.
     const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv,
                                         dvcost, MV_COST_WEIGHT_SUB);
-    const int rate_mode = x->intrabc_cost[1];
+    const int rate_mode = x->mode_costs.intrabc_cost[1];
     RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
     if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y,
                          &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
@@ -2801,15 +3079,15 @@
       best_rd = rd_stats_yuv.rdcost;
       best_mbmi = *mbmi;
       best_rdstats = rd_stats_yuv;
-      memcpy(best_blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * xd->height * xd->width);
+      memcpy(best_blk_skip, txfm_info->blk_skip,
+             sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
       av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
     }
   }
   *mbmi = best_mbmi;
   *rd_stats = best_rdstats;
-  memcpy(x->blk_skip, best_blk_skip,
-         sizeof(x->blk_skip[0]) * xd->height * xd->width);
+  memcpy(txfm_info->blk_skip, best_blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
   av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
 #if CONFIG_RD_DEBUG
   mbmi->rd_stats = *rd_stats;
@@ -2817,18 +3095,24 @@
   return best_rd;
 }
 
-void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
-                               RD_STATS *rd_cost, BLOCK_SIZE bsize,
+// TODO(chiyotsai@google.com): We are using struct $struct_name instead of their
+// typedef here because Doxygen doesn't know about the typedefs yet. So using
+// the typedef will prevent doxygen from finding this function and generating
+// the callgraph. Once documents for AV1_COMP and MACROBLOCK are added to
+// doxygen, we can revert back to using the typedefs.
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int num_planes = av1_num_planes(cm);
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
-  int y_skip = 0, uv_skip = 0;
+  int y_skip_txfm = 0, uv_skip_txfm = 0;
   int64_t dist_y = 0, dist_uv = 0;
 
-  ctx->rd_stats.skip = 0;
+  ctx->rd_stats.skip_txfm = 0;
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
   mbmi->use_intrabc = 0;
@@ -2837,40 +3121,34 @@
 
   const int64_t intra_yrd =
       av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
-                                 &y_skip, bsize, best_rd, ctx);
+                                 &y_skip_txfm, bsize, best_rd, ctx);
 
   // Initialize default mode evaluation params
   set_mode_eval_params(cpi, x, DEFAULT_EVAL);
 
   if (intra_yrd < best_rd) {
-    // Only store reconstructed luma when there's chroma RDO. When there's no
-    // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-    xd->cfl.store_y = store_cfl_required_rdo(cm, x);
-    if (xd->cfl.store_y) {
-      // Restore reconstructed luma values.
-      memcpy(x->blk_skip, ctx->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
-      av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y, DRY_RUN_NORMAL,
-                                   cpi->optimize_seg_arr[mbmi->segment_id]);
-      av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-      xd->cfl.store_y = 0;
-    }
+    // Search intra modes for uv planes if needed
     if (num_planes > 1) {
-      init_sbuv_mode(mbmi);
-      if (xd->is_chroma_ref) {
-        const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
-        av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                                    &dist_uv, &uv_skip, bsize, max_uv_tx_size);
+      // Set up the tx variables for reproducing the y predictions in case we
+      // need it for chroma-from-luma.
+      if (xd->is_chroma_ref && store_cfl_required_rdo(cm, x)) {
+        memcpy(txfm_info->blk_skip, ctx->blk_skip,
+               sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+        av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
       }
+      const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+      av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                  &dist_uv, &uv_skip_txfm, bsize,
+                                  max_uv_tx_size);
     }
 
     // Intra block is always coded as non-skip
     rd_cost->rate =
-        rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0];
+        rate_y + rate_uv +
+        x->mode_costs.skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
     rd_cost->dist = dist_y + dist_uv;
     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
-    rd_cost->skip = 0;
+    rd_cost->skip_txfm = 0;
   } else {
     rd_cost->rate = INT_MAX;
   }
@@ -2878,15 +3156,15 @@
   if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
     best_rd = rd_cost->rdcost;
   if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) {
-    ctx->rd_stats.skip = mbmi->skip;
-    memcpy(ctx->blk_skip, x->blk_skip,
-           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    ctx->rd_stats.skip_txfm = mbmi->skip_txfm;
+    memcpy(ctx->blk_skip, txfm_info->blk_skip,
+           sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
     assert(rd_cost->rate != INT_MAX);
   }
   if (rd_cost->rate == INT_MAX) return;
 
   ctx->mic = *xd->mi[0];
-  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
                                       av1_ref_frame_type(xd->mi[0]->ref_frame));
   av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
 }
@@ -2905,6 +3183,7 @@
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
 
   x->compound_idx = 1;  // COMPOUND_AVERAGE
   RD_STATS skip_mode_rd_stats;
@@ -2927,7 +3206,7 @@
     return;
   }
 
-  if ((!cpi->oxcf.enable_onesided_comp ||
+  if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
        cpi->sf.inter_sf.disable_onesided_comp) &&
       cpi->all_one_sided_refs) {
     return;
@@ -2938,12 +3217,12 @@
   mbmi->ref_frame[0] = ref_frame;
   mbmi->ref_frame[1] = second_ref_frame;
   const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-  if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) {
-    if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
-        x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+  if (x->mbmi_ext.ref_mv_count[ref_frame_type] == UINT8_MAX) {
+    MB_MODE_INFO_EXT *mbmi_ext = &x->mbmi_ext;
+    if (mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+        mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
       return;
     }
-    MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
     av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
                      xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
                      mbmi_ext->mode_context);
@@ -2964,7 +3243,7 @@
   mbmi->interinter_comp.type = COMPOUND_AVERAGE;
   mbmi->motion_mode = SIMPLE_TRANSLATION;
   mbmi->ref_mv_idx = 0;
-  mbmi->skip_mode = mbmi->skip = 1;
+  mbmi->skip_mode = mbmi->skip_txfm = 1;
 
   set_default_interp_filters(mbmi, cm->features.interp_filter);
 
@@ -2987,11 +3266,12 @@
   const int skip_mode_ctx = av1_get_skip_mode_context(xd);
   int64_t best_intra_inter_mode_cost = INT64_MAX;
   if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) {
-    best_intra_inter_mode_cost =
-        RDCOST(x->rdmult, rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
-               rd_cost->dist);
+    const ModeCosts *mode_costs = &x->mode_costs;
+    best_intra_inter_mode_cost = RDCOST(
+        x->rdmult, rd_cost->rate + mode_costs->skip_mode_cost[skip_mode_ctx][0],
+        rd_cost->dist);
     // Account for non-skip mode rate in total rd stats
-    rd_cost->rate += x->skip_mode_cost[skip_mode_ctx][0];
+    rd_cost->rate += mode_costs->skip_mode_cost[skip_mode_ctx][0];
     av1_rd_cost_update(x->rdmult, rd_cost);
   }
 
@@ -3001,7 +3281,8 @@
     search_state->best_mbmode.skip_mode = 1;
     search_state->best_mbmode = *mbmi;
 
-    search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1;
+    search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip_txfm =
+        1;
     search_state->best_mbmode.mode = NEAREST_NEARESTMV;
     search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0];
     search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1];
@@ -3012,13 +3293,14 @@
     // Set up tx_size related variables for skip-specific loop filtering.
     search_state->best_mbmode.tx_size =
         block_signals_txsize(bsize)
-            ? tx_size_from_tx_mode(bsize, x->tx_mode_search_type)
+            ? tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type)
             : max_txsize_rect_lookup[bsize];
     memset(search_state->best_mbmode.inter_tx_size,
            search_state->best_mbmode.tx_size,
            sizeof(search_state->best_mbmode.inter_tx_size));
     set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height,
-                  search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
+                  search_state->best_mbmode.skip_txfm && is_inter_block(mbmi),
+                  xd);
 
     // Set up color-related variables for skip mode.
     search_state->best_mbmode.uv_mode = UV_DC_PRED;
@@ -3048,7 +3330,7 @@
     search_state->best_skip2 = 1;
     search_state->best_mode_skippable = 1;
 
-    x->force_skip = 1;
+    x->txfm_search_info.skip_txfm = 1;
   }
 }
 
@@ -3057,10 +3339,10 @@
     MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost,
     int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index,
     RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv,
-    THR_MODES *winner_mode_index, int enable_multiwinner_mode_process,
+    THR_MODES *winner_mode_index, MULTI_WINNER_MODE_TYPE multi_winner_mode_type,
     int mode_idx) {
   MB_MODE_INFO *winner_mbmi;
-  if (enable_multiwinner_mode_process) {
+  if (multi_winner_mode_type) {
     assert(mode_idx >= 0 && mode_idx < x->winner_mode_count);
     WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx];
     winner_mbmi = &winner_mode_stat->mbmi;
@@ -3092,6 +3374,8 @@
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   int64_t best_rd;
   const int num_planes = av1_num_planes(cm);
 
@@ -3115,7 +3399,7 @@
     MB_MODE_INFO *winner_mbmi = get_winner_mode_stats(
         x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index,
         &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index,
-        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, mode_idx);
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, mode_idx);
 
     if (xd->lossless[winner_mbmi->segment_id] == 0 &&
         winner_mode_index != THR_INVALID &&
@@ -3124,7 +3408,7 @@
       RD_STATS rd_stats = *winner_rd_stats;
       int skip_blk = 0;
       RD_STATS rd_stats_y, rd_stats_uv;
-      const int skip_ctx = av1_get_skip_context(xd);
+      const int skip_ctx = av1_get_skip_txfm_context(xd);
 
       *mbmi = *winner_mbmi;
 
@@ -3146,7 +3430,7 @@
           av1_build_obmc_inter_predictors_sb(cm, xd);
 
         av1_subtract_plane(x, bsize, 0);
-        if (x->tx_mode_search_type == TX_MODE_SELECT &&
+        if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
             !xd->lossless[mbmi->segment_id]) {
           av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
                                               INT64_MAX);
@@ -3157,7 +3441,7 @@
           memset(mbmi->inter_tx_size, mbmi->tx_size,
                  sizeof(mbmi->inter_tx_size));
           for (int i = 0; i < xd->height * xd->width; ++i)
-            set_blk_skip(x, 0, i, rd_stats_y.skip);
+            set_blk_skip(txfm_info->blk_skip, 0, i, rd_stats_y.skip_txfm);
         }
       } else {
         av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
@@ -3170,20 +3454,22 @@
         av1_init_rd_stats(&rd_stats_uv);
       }
 
+      const ModeCosts *mode_costs = &x->mode_costs;
       if (is_inter_mode(mbmi->mode) &&
           RDCOST(x->rdmult,
-                 x->skip_cost[skip_ctx][0] + rd_stats_y.rate + rd_stats_uv.rate,
+                 mode_costs->skip_txfm_cost[skip_ctx][0] + rd_stats_y.rate +
+                     rd_stats_uv.rate,
                  (rd_stats_y.dist + rd_stats_uv.dist)) >
-              RDCOST(x->rdmult, x->skip_cost[skip_ctx][1],
+              RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][1],
                      (rd_stats_y.sse + rd_stats_uv.sse))) {
         skip_blk = 1;
-        rd_stats_y.rate = x->skip_cost[skip_ctx][1];
+        rd_stats_y.rate = mode_costs->skip_txfm_cost[skip_ctx][1];
         rd_stats_uv.rate = 0;
         rd_stats_y.dist = rd_stats_y.sse;
         rd_stats_uv.dist = rd_stats_uv.sse;
       } else {
         skip_blk = 0;
-        rd_stats_y.rate += x->skip_cost[skip_ctx][0];
+        rd_stats_y.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
       }
       int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate -
                       winner_rate_y - winner_rate_uv;
@@ -3192,7 +3478,7 @@
       if (best_rd > this_rd) {
         *best_mbmode = *mbmi;
         *best_mode_index = winner_mode_index;
-        av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk);
+        av1_copy_array(ctx->blk_skip, txfm_info->blk_skip, ctx->num_4x4_blk);
         av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
         rd_cost->rate = this_rate;
         rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
@@ -3205,6 +3491,7 @@
   }
 }
 
+/*!\cond */
 typedef struct {
   // Mask for each reference frame, specifying which prediction modes to NOT try
   // during search.
@@ -3215,6 +3502,7 @@
   // (NONE_FRAME).
   bool ref_combo[REF_FRAMES][REF_FRAMES + 1];
 } mode_skip_mask_t;
+/*!\endcond */
 
 // Update 'ref_combo' mask to disable given 'ref' in single and compound modes.
 static AOM_INLINE void disable_reference(
@@ -3307,7 +3595,7 @@
 
   if (sf->rt_sf.use_real_time_ref_set)
     ref_set = REF_SET_REALTIME;
-  else if (cpi->oxcf.enable_reduced_reference_set)
+  else if (cpi->oxcf.ref_frm_cfg.enable_reduced_reference_set)
     ref_set = REF_SET_REDUCED;
 
   default_skip_mask(mask, ref_set);
@@ -3355,15 +3643,17 @@
     // unless ARNR filtering is enabled in which case we want
     // an unfiltered alternative. We allow near/nearest as well
     // because they may result in zero-zero MVs but be cheaper.
-    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+    if (cpi->rc.is_src_frame_alt_ref &&
+        (cpi->oxcf.algo_cfg.arnr_max_frames == 0)) {
       disable_inter_references_except_altref(mask->ref_combo);
 
       mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
       const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
       int_mv near_mv, nearest_mv, global_mv;
-      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames,
+                  &x->mbmi_ext);
+      get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
+      get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
 
       if (near_mv.as_int != global_mv.as_int)
         mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV);
@@ -3373,8 +3663,8 @@
   }
 
   if (cpi->rc.is_src_frame_alt_ref) {
-    if (sf->inter_sf.alt_ref_search_fp) {
-      assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+    if (sf->inter_sf.alt_ref_search_fp &&
+        (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) {
       mask->pred_modes[ALTREF_FRAME] = 0;
       disable_inter_references_except_altref(mask->ref_combo);
       disable_reference(INTRA_FRAME, mask->ref_combo);
@@ -3387,20 +3677,14 @@
       // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if
       // those are past frames
       for (ref_frame = BWDREF_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-        if (cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+        if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+            0)
           if (x->pred_mv_sad[ref_frame] > sad_thresh)
             mask->pred_modes[ref_frame] |= INTER_ALL;
       }
     }
   }
 
-  if (sf->inter_sf.adaptive_mode_search) {
-    if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
-        cpi->rc.frames_since_golden >= 3)
-      if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
-        mask->pred_modes[GOLDEN_FRAME] |= INTER_ALL;
-  }
-
   if (bsize > sf->part_sf.max_intra_bsize) {
     disable_reference(INTRA_FRAME, mask->ref_combo);
   }
@@ -3409,31 +3693,76 @@
       ~(sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
 }
 
-static AOM_INLINE void init_pred_buf(const MACROBLOCK *const x,
-                                     HandleInterModeArgs *const args) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  if (is_cur_buf_hbd(xd)) {
+static AOM_INLINE void init_neighbor_pred_buf(
+    const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args,
+    int is_hbd) {
+  if (is_hbd) {
     const int len = sizeof(uint16_t);
-    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
-    args->above_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred);
+    args->above_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred +
+                                                 (MAX_SB_SQUARE >> 1) * len);
     args->above_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
-    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+        CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len);
+    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred);
     args->left_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+        CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len);
     args->left_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
+        CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len);
   } else {
-    args->above_pred_buf[0] = x->above_pred_buf;
-    args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
-    args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
-    args->left_pred_buf[0] = x->left_pred_buf;
-    args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
-    args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
+    args->above_pred_buf[0] = obmc_buffer->above_pred;
+    args->above_pred_buf[1] = obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1);
+    args->above_pred_buf[2] = obmc_buffer->above_pred + MAX_SB_SQUARE;
+    args->left_pred_buf[0] = obmc_buffer->left_pred;
+    args->left_pred_buf[1] = obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1);
+    args->left_pred_buf[2] = obmc_buffer->left_pred + MAX_SB_SQUARE;
   }
 }
 
+static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                      MV_REFERENCE_FRAME ref_frame) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME rf[2];
+  av1_set_ref_frame(rf, ref_frame);
+
+  if ((cpi->prune_ref_frame_mask >> ref_frame) & 1) return 1;
+
+  if (prune_ref_by_selective_ref_frame(cpi, x, rf,
+                                       cm->cur_frame->ref_display_order_hint)) {
+    return 1;
+  }
+
+  return 0;
+}
+
+static AOM_INLINE int is_ref_frame_used_by_compound_ref(
+    int ref_frame, int skip_ref_frame_mask) {
+  for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+    if (!(skip_ref_frame_mask & (1 << r))) {
+      const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+      if (rf[0] == ref_frame || rf[1] == ref_frame) {
+        return 1;
+      }
+    }
+  }
+  return 0;
+}
+
+static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame,
+                                                 const MB_MODE_INFO *mi_cache) {
+  if (!mi_cache) {
+    return 0;
+  }
+
+  if (ref_frame < REF_FRAMES) {
+    return (ref_frame == mi_cache->ref_frame[0] ||
+            ref_frame == mi_cache->ref_frame[1]);
+  }
+
+  // if we are here, then the current mode is compound.
+  MV_REFERENCE_FRAME cached_ref_type = av1_ref_frame_type(mi_cache->ref_frame);
+  return ref_frame == cached_ref_type;
+}
+
 // Please add/modify parameter setting in this function, making it consistent
 // and easy to read and maintain.
 static AOM_INLINE void set_params_rd_pick_inter_mode(
@@ -3444,53 +3773,46 @@
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   unsigned char segment_id = mbmi->segment_id;
 
-  init_pred_buf(x, args);
+  init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd));
   av1_collect_neighbors_ref_counts(xd);
-  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+  estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single,
                            ref_costs_comp);
 
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
-  MV_REFERENCE_FRAME ref_frame;
   x->best_pred_mv_sad = INT_MAX;
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+
+  for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+       ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
-    x->mbmi_ext->mode_context[ref_frame] = 0;
+    mbmi_ext->mode_context[ref_frame] = 0;
     mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
-      if (mbmi->partition != PARTITION_NONE &&
-          mbmi->partition != PARTITION_SPLIT) {
-        if (skip_ref_frame_mask & (1 << ref_frame)) {
-          int skip = 1;
-          for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
-            if (!(skip_ref_frame_mask & (1 << r))) {
-              const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
-              if (rf[0] == ref_frame || rf[1] == ref_frame) {
-                skip = 0;
-                break;
-              }
-            }
-          }
-          if (skip) continue;
-        }
+      // Skip the ref frame if the mask says skip and the ref is not used by
+      // compound ref.
+      if (skip_ref_frame_mask & (1 << ref_frame) &&
+          !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) &&
+          !is_ref_frame_used_in_cache(ref_frame, x->intermode_cache)) {
+        continue;
       }
       assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
       setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb);
     }
     // Store the best pred_mv_sad across all past frames
     if (cpi->sf.inter_sf.alt_ref_search_fp &&
-        cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+        cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] < 0)
       x->best_pred_mv_sad =
           AOMMIN(x->best_pred_mv_sad, x->pred_mv_sad[ref_frame]);
   }
-  // ref_frame = ALTREF_FRAME
-  if (!cpi->sf.rt_sf.use_real_time_ref_set) {
+
+  if (!cpi->sf.rt_sf.use_real_time_ref_set && is_comp_ref_allowed(bsize)) {
     // No second reference on RT ref set, so no need to initialize
-    for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
-      x->mbmi_ext->mode_context[ref_frame] = 0;
+    for (MV_REFERENCE_FRAME ref_frame = EXTREF_FRAME;
+         ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+      mbmi_ext->mode_context[ref_frame] = 0;
       mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
       const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
       if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
@@ -3498,12 +3820,14 @@
         continue;
       }
 
-      if (mbmi->partition != PARTITION_NONE &&
-          mbmi->partition != PARTITION_SPLIT) {
-        if (skip_ref_frame_mask & (1 << ref_frame)) {
-          continue;
-        }
+      if (skip_ref_frame_mask & (1 << ref_frame) &&
+          !is_ref_frame_used_in_cache(ref_frame, x->intermode_cache)) {
+        continue;
       }
+      // Ref mv list population is not required, when compound references are
+      // pruned.
+      if (prune_ref_frame(cpi, x, ref_frame)) continue;
+
       av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
                        xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
                        mbmi_ext->mode_context);
@@ -3517,7 +3841,8 @@
   const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
   const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] <
                          cpi->sf.inter_sf.prune_obmc_prob_thresh;
-  if (cpi->oxcf.enable_obmc && !cpi->sf.inter_sf.disable_obmc && !prune_obmc) {
+  if (cpi->oxcf.motion_mode_cfg.enable_obmc && !cpi->sf.inter_sf.disable_obmc &&
+      !prune_obmc) {
     if (check_num_overlappable_neighbors(mbmi) &&
         is_motion_variation_allowed_bsize(bsize)) {
       int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -3549,22 +3874,11 @@
   x->comp_rd_stats_idx = 0;
 }
 
-static AOM_INLINE void init_intra_mode_search_state(
-    IntraModeSearchState *intra_search_state) {
-  intra_search_state->skip_intra_modes = 0;
-  intra_search_state->best_intra_mode = DC_PRED;
-  intra_search_state->angle_stats_ready = 0;
-  av1_zero(intra_search_state->directional_mode_skip_mask);
-  intra_search_state->rate_uv_intra = INT_MAX;
-  av1_zero(intra_search_state->pmi_uv);
-  for (int i = 0; i < REFERENCE_MODES; ++i)
-    intra_search_state->best_pred_rd[i] = INT64_MAX;
-}
-
 static AOM_INLINE void init_inter_mode_search_state(
     InterModeSearchState *search_state, const AV1_COMP *cpi,
     const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) {
   init_intra_mode_search_state(&search_state->intra_search_state);
+  av1_invalid_rd_stats(&search_state->best_y_rdcost);
 
   search_state->best_rd = best_rd_so_far;
   search_state->best_skip_rd[0] = INT64_MAX;
@@ -3637,8 +3951,16 @@
       }
     }
   }
+  for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+    search_state->best_single_rd[ref_frame] = INT64_MAX;
+    search_state->best_single_mode[ref_frame] = MB_MODE_COUNT;
+  }
   av1_zero(search_state->single_state_cnt);
   av1_zero(search_state->single_state_modelled_cnt);
+
+  for (int i = 0; i < REFERENCE_MODES; ++i) {
+    search_state->best_pred_rd[i] = INT64_MAX;
+  }
 }
 
 static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
@@ -3703,6 +4025,14 @@
   return picked_ref_frames_mask;
 }
 
+// Check if reference frame pair of the current block matches with the given
+// block.
+static INLINE int match_ref_frame_pair(const MB_MODE_INFO *mbmi,
+                                       const MV_REFERENCE_FRAME *ref_frames) {
+  return ((ref_frames[0] == mbmi->ref_frame[0]) &&
+          (ref_frames[1] == mbmi->ref_frame[1]));
+}
+
 // Case 1: return 0, means don't skip this mode
 // Case 2: return 1, means skip this mode completely
 // Case 3: return 2, means skip compound only, but still try single motion modes
@@ -3715,10 +4045,11 @@
   }
 
   const int ref_type = av1_ref_frame_type(ref_frame);
-  if ((cpi->prune_ref_frame_mask >> ref_type) & 1) return 1;
+  if (prune_ref_frame(cpi, x, ref_type)) return 1;
 
   // This is only used in motion vector unit test.
-  if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
+  if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test &&
+      ref_frame[0] == INTRA_FRAME)
     return 1;
 
   const AV1_COMMON *const cm = &cpi->common;
@@ -3726,11 +4057,49 @@
     return 1;
   }
 
-  const int comp_pred = ref_frame[1] > INTRA_FRAME;
-  if ((!cpi->oxcf.enable_onesided_comp ||
-       cpi->sf.inter_sf.disable_onesided_comp) &&
-      comp_pred && cpi->all_one_sided_refs) {
-    return 1;
+  // Reuse the prediction mode in cache
+  if (x->use_intermode_cache) {
+    const MB_MODE_INFO *cached_mi = x->intermode_cache;
+    const PREDICTION_MODE cached_mode = cached_mi->mode;
+    const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame;
+    const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME;
+
+    // If the cached mode is intra, then we just need to match the mode.
+    if (is_mode_intra(cached_mode) && mode != cached_mode) {
+      return 1;
+    }
+
+    // If the cached mode is single inter mode, then we match the mode and
+    // reference frame.
+    if (cached_mode_is_single) {
+      if (mode != cached_mode || ref_frame[0] != cached_frame[0]) {
+        return 1;
+      }
+    } else {
+      // If the cached mode is compound, then we need to consider several cases.
+      const int mode_is_single = ref_frame[1] <= INTRA_FRAME;
+      if (mode_is_single) {
+        // If the mode is single, we know the modes can't match. But we might
+        // still want to search it if compound mode depends on the current mode.
+        int skip_motion_mode_only = 0;
+        if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[0]);
+        } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[1]);
+        } else if (cached_mode == NEW_NEWMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[0] ||
+                                   ref_frame[0] == cached_frame[1]);
+        }
+
+        return 1 + skip_motion_mode_only;
+      } else {
+        // If both modes are compound, then everything must match.
+        if (mode != cached_mode || ref_frame[0] != cached_frame[0] ||
+            ref_frame[1] != cached_frame[1]) {
+          return 1;
+        }
+      }
+    }
   }
 
   const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
@@ -3740,32 +4109,61 @@
       x->must_find_valid_partition)
     return 0;
 
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  // Prune NEARMV and NEAR_NEARMV based on q index and neighbor's reference
+  // frames
+  if (sf->inter_sf.prune_nearmv_using_neighbors &&
+      (mode == NEAR_NEARMV || mode == NEARMV)) {
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    if (search_state->best_rd != INT64_MAX && xd->left_available &&
+        xd->up_available) {
+      const int num_ref_frame_pair_match_thresh =
+          2 - (x->qindex * 3 / QINDEX_RANGE);
+      assert(num_ref_frame_pair_match_thresh <= 2 &&
+             num_ref_frame_pair_match_thresh >= 0);
+      int num_ref_frame_pair_match = 0;
+
+      num_ref_frame_pair_match = match_ref_frame_pair(xd->left_mbmi, ref_frame);
+      num_ref_frame_pair_match +=
+          match_ref_frame_pair(xd->above_mbmi, ref_frame);
+
+      // Prune modes if:
+      // num_ref_frame_pair_match < 2 for qindex   0 to 85
+      // num_ref_frame_pair_match < 1 for qindex  86 to 170
+      // No pruning for qindex 171 to 255
+      if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1;
+    }
+  }
+
   int skip_motion_mode = 0;
-  if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+  if (mbmi->partition != PARTITION_NONE) {
     int skip_ref = skip_ref_frame_mask & (1 << ref_type);
     if (ref_type <= ALTREF_FRAME && skip_ref) {
       // Since the compound ref modes depends on the motion estimation result of
-      // two single ref modes( best mv of single ref modes as the start point )
-      // If current single ref mode is marked skip, we need to check if it will
+      // two single ref modes (best mv of single ref modes as the start point),
+      // if current single ref mode is marked skip, we need to check if it will
       // be used in compound ref modes.
-      for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
-        if (skip_ref_frame_mask & (1 << r)) continue;
-        const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
-        if (rf[0] == ref_type || rf[1] == ref_type) {
-          // Found a not skipped compound ref mode which contains current
-          // single ref. So this single ref can't be skipped completly
-          // Just skip it's motion mode search, still try it's simple
-          // transition mode.
-          skip_motion_mode = 1;
-          skip_ref = 0;
-          break;
-        }
+      if (is_ref_frame_used_by_compound_ref(ref_type, skip_ref_frame_mask)) {
+        // Found a not skipped compound ref mode which contains current
+        // single ref. So this single ref can't be skipped completely
+        // Just skip its motion mode search, still try its simple
+        // transition mode.
+        skip_motion_mode = 1;
+        skip_ref = 0;
       }
     }
+    // If we are reusing the prediction from cache, and the current frame is
+    // required by the cache, then we cannot prune it.
+    if (is_ref_frame_used_in_cache(ref_type, x->intermode_cache)) {
+      skip_ref = 0;
+      // If the cache only needs the current reference type for compound
+      // prediction, then we can skip motion mode search.
+      skip_motion_mode = (ref_type <= ALTREF_FRAME &&
+                          x->intermode_cache->ref_frame[1] > INTRA_FRAME);
+    }
     if (skip_ref) return 1;
   }
 
-  const SPEED_FEATURES *const sf = &cpi->sf;
   if (ref_frame[0] == INTRA_FRAME) {
     if (mode != DC_PRED) {
       // Disable intra modes other than DC_PRED for blocks with low variance
@@ -3778,10 +4176,6 @@
     }
   }
 
-  if (prune_ref_by_selective_ref_frame(cpi, x, ref_frame,
-                                       cm->cur_frame->ref_display_order_hint))
-    return 1;
-
   if (skip_motion_mode) return 2;
 
   return 0;
@@ -4011,8 +4405,8 @@
       int_mv single_mv;
       int_mv comp_mv;
       get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
-                  x->mbmi_ext);
-      get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, x->mbmi_ext);
+                  &x->mbmi_ext);
+      get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, &x->mbmi_ext);
       if (single_mv.as_int != comp_mv.as_int) {
         ref_mv_match[i] = 0;
         break;
@@ -4080,6 +4474,50 @@
   return 1;
 }
 
+// Update best single mode for the given reference frame based on simple rd.
+static INLINE void update_best_single_mode(InterModeSearchState *search_state,
+                                           const PREDICTION_MODE this_mode,
+                                           const MV_REFERENCE_FRAME ref_frame,
+                                           int64_t this_rd) {
+  if (this_rd < search_state->best_single_rd[ref_frame]) {
+    search_state->best_single_rd[ref_frame] = this_rd;
+    search_state->best_single_mode[ref_frame] = this_mode;
+  }
+}
+
+// Prune compound mode using best single mode for the same reference.
+static INLINE int skip_compound_using_best_single_mode_ref(
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames,
+    const PREDICTION_MODE *best_single_mode,
+    int prune_comp_using_best_single_mode_ref) {
+  // Exclude non-extended compound modes from pruning
+  if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+      this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+    return 0;
+
+  assert(this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV);
+  const PREDICTION_MODE comp_mode_ref0 = compound_ref0_mode(this_mode);
+  // Get ref frame direction corresponding to NEWMV
+  // 0 - NEWMV corresponding to forward direction
+  // 1 - NEWMV corresponding to backward direction
+  const int newmv_dir = comp_mode_ref0 != NEWMV;
+
+  // Avoid pruning the compound mode when ref frame corresponding to NEWMV
+  // have NEWMV as single mode winner.
+  // Example: For an extended-compound mode,
+  // {mode, {fwd_frame, bwd_frame}} = {NEAR_NEWMV, {LAST_FRAME, ALTREF_FRAME}}
+  // - Ref frame corresponding to NEWMV is ALTREF_FRAME
+  // - Avoid pruning this mode, if best single mode corresponding to ref frame
+  //   ALTREF_FRAME is NEWMV
+  const PREDICTION_MODE single_mode = best_single_mode[ref_frames[newmv_dir]];
+  if (single_mode == NEWMV) return 0;
+
+  // Avoid pruning the compound mode when best single mode is not available
+  if (prune_comp_using_best_single_mode_ref == 1)
+    if (single_mode == MB_MODE_COUNT) return 0;
+  return 1;
+}
+
 static int compare_int64(const void *a, const void *b) {
   int64_t a64 = *((int64_t *)a);
   int64_t b64 = *((int64_t *)b);
@@ -4099,28 +4537,31 @@
     THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) {
   const MACROBLOCKD *xd = &x->e_mbd;
   const MB_MODE_INFO *mbmi = xd->mi[0];
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int mode_is_intra =
-      (av1_mode_defs[new_best_mode].mode < INTRA_MODE_END);
-  const int skip = mbmi->skip && !mode_is_intra;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int skip_txfm =
+      mbmi->skip_txfm && !is_mode_intra(av1_mode_defs[new_best_mode].mode);
+  const TxfmSearchInfo *txfm_info = &x->txfm_search_info;
 
   search_state->best_rd = new_best_rd_stats->rdcost;
   search_state->best_mode_index = new_best_mode;
   *best_rd_stats_dst = *new_best_rd_stats;
   search_state->best_mbmode = *mbmi;
-  search_state->best_skip2 = skip;
-  search_state->best_mode_skippable = new_best_rd_stats->skip;
+  search_state->best_skip2 = skip_txfm;
+  search_state->best_mode_skippable = new_best_rd_stats->skip_txfm;
   // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y and
   // rate_uv because av1_txfm_search process is replaced by rd estimation.
-  // Therfore, we should avoid updating best_rate_y and best_rate_uv here.
+  // Therefore, we should avoid updating best_rate_y and best_rate_uv here.
   // These two values will be updated when av1_txfm_search is called.
   if (txfm_search_done) {
     search_state->best_rate_y =
         new_best_rd_stats_y->rate +
-        x->skip_cost[skip_ctx][new_best_rd_stats->skip || skip];
+        x->mode_costs.skip_txfm_cost[skip_ctx]
+                                    [new_best_rd_stats->skip_txfm || skip_txfm];
     search_state->best_rate_uv = new_best_rd_stats_uv->rate;
   }
-  memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+  search_state->best_y_rdcost = *new_best_rd_stats_y;
+  memcpy(ctx->blk_skip, txfm_info->blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
   av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
 }
 
@@ -4158,7 +4599,7 @@
     struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
     const motion_mode_best_st_candidate *const best_motion_mode_cands,
     int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd,
-    InterModeSearchState *const search_state) {
+    InterModeSearchState *const search_state, int64_t *yrd) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -4173,7 +4614,7 @@
     av1_init_rd_stats(&rd_stats);
     av1_init_rd_stats(&rd_stats_y);
     av1_init_rd_stats(&rd_stats_uv);
-    int disable_skip = 0, rate_mv;
+    int rate_mv;
 
     rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv;
     args->skip_motion_mode =
@@ -4185,9 +4626,7 @@
     // Continue if the best candidate is compound.
     if (!is_inter_singleref_mode(mbmi->mode)) continue;
 
-    x->force_skip = 0;
-    const int mode_index = get_prediction_mode_idx(
-        mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    x->txfm_search_info.skip_txfm = 0;
     struct macroblockd_plane *p = xd->plane;
     const BUFFER_SET orig_dst = {
       { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
@@ -4195,7 +4634,6 @@
     };
 
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    args->simple_rd_state = x->simple_rd_state[mode_index];
     // Initialize motion mode to simple translation
     // Calculation of switchable rate depends on it.
     mbmi->motion_mode = 0;
@@ -4207,10 +4645,11 @@
 
     int64_t skip_rd[2] = { search_state->best_skip_rd[0],
                            search_state->best_skip_rd[1] };
+    int64_t this_yrd = INT64_MAX;
     int64_t ret_value = motion_mode_rd(
-        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
-        &disable_skip, args, search_state->best_rd, skip_rd, &rate_mv,
-        &orig_dst, best_est_rd, do_tx_search, inter_modes_info, 1);
+        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, args,
+        search_state->best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd,
+        do_tx_search, inter_modes_info, 1, &this_yrd);
 
     if (ret_value != INT64_MAX) {
       rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
@@ -4220,8 +4659,9 @@
       store_winner_mode_stats(
           &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
           mode_enum, NULL, bsize, rd_stats.rdcost,
-          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, do_tx_search);
+          cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search);
       if (rd_stats.rdcost < search_state->best_rd) {
+        *yrd = this_yrd;
         update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
                             &rd_stats_uv, mode_enum, x, do_tx_search);
         if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0];
@@ -4230,6 +4670,7 @@
   }
 }
 
+/*!\cond */
 // Arguments for speed feature pruning of inter mode search
 typedef struct {
   int *skip_motion_mode;
@@ -4238,10 +4679,12 @@
   int skip_ref_frame_mask;
   int reach_first_comp_mode;
   int mode_thresh_mul_fact;
-  int *intra_mode_idx_ls;
-  int *intra_mode_num;
+  int intra_mode_idx_ls[INTRA_MODES];
+  int intra_mode_num;
+  int num_single_modes_processed;
   int prune_cpd_using_sr_stats_ready;
 } InterModeSFArgs;
+/*!\endcond */
 
 static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
                            int64_t *ref_frame_rd, int midx,
@@ -4257,17 +4700,6 @@
   const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
   const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
   const int comp_pred = second_ref_frame > INTRA_FRAME;
-  const int last_single_ref_mode_idx =
-      find_last_single_ref_mode_idx(av1_default_mode_order);
-
-  // After we done with single reference modes, find the 2nd best RD
-  // for a reference frame. Only search compound modes that have a reference
-  // frame at least as good as the 2nd best.
-  if (sf->inter_sf.prune_compound_using_single_ref &&
-      midx == last_single_ref_mode_idx + 1) {
-    find_top_ref(ref_frame_rd);
-    args->prune_cpd_using_sr_stats_ready = 1;
-  }
 
   // Check if this mode should be skipped because it is incompatible with the
   // current frame
@@ -4307,26 +4739,33 @@
 
   // Speed features to prune out INTRA frames
   if (ref_frame == INTRA_FRAME) {
-    if ((!cpi->oxcf.enable_smooth_intra || sf->intra_sf.disable_smooth_intra) &&
+    if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
+         sf->intra_sf.disable_smooth_intra) &&
         (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
          mbmi->mode == SMOOTH_V_PRED))
       return 1;
-    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) return 1;
-    if (sf->inter_sf.adaptive_mode_search > 1)
-      if ((x->source_variance << num_pels_log2_lookup[bsize]) >
-          args->search_state->best_pred_sse)
-        return 1;
+    if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra &&
+        mbmi->mode == PAETH_PRED)
+      return 1;
 
     // Intra modes will be handled in another loop later.
-    assert(*args->intra_mode_num < INTRA_MODES);
-    args->intra_mode_idx_ls[(*args->intra_mode_num)++] = mode_enum;
+    assert(args->intra_mode_num < INTRA_MODES);
+    args->intra_mode_idx_ls[args->intra_mode_num++] = mode_enum;
     return 1;
   }
 
-  if (sf->inter_sf.prune_compound_using_single_ref &&
-      args->prune_cpd_using_sr_stats_ready && comp_pred &&
-      !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame)) {
-    return 1;
+  if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) {
+    // After we done with single reference modes, find the 2nd best RD
+    // for a reference frame. Only search compound modes that have a reference
+    // frame at least as good as the 2nd best.
+    if (!args->prune_cpd_using_sr_stats_ready &&
+        args->num_single_modes_processed == NUM_SINGLE_REF_MODES) {
+      find_top_ref(ref_frame_rd);
+      args->prune_cpd_using_sr_stats_ready = 1;
+    }
+    if (args->prune_cpd_using_sr_stats_ready &&
+        !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame))
+      return 1;
   }
 
   if (sf->inter_sf.prune_compound_using_neighbors && comp_pred) {
@@ -4336,6 +4775,13 @@
       return 1;
   }
 
+  if (sf->inter_sf.prune_comp_using_best_single_mode_ref && comp_pred) {
+    if (skip_compound_using_best_single_mode_ref(
+            this_mode, ref_frames, args->search_state->best_single_mode,
+            sf->inter_sf.prune_comp_using_best_single_mode_ref))
+      return 1;
+  }
+
   return 0;
 }
 
@@ -4357,20 +4803,133 @@
   hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist);
 
   if (!comp_pred) {
-    if (single_rd <
-        search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE])
-      search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE] =
-          single_rd;
+    if (single_rd < search_state->best_pred_rd[SINGLE_REFERENCE])
+      search_state->best_pred_rd[SINGLE_REFERENCE] = single_rd;
   } else {
-    if (single_rd <
-        search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE])
-      search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE] =
-          single_rd;
+    if (single_rd < search_state->best_pred_rd[COMPOUND_REFERENCE])
+      search_state->best_pred_rd[COMPOUND_REFERENCE] = single_rd;
   }
-  if (hybrid_rd <
-      search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT])
-    search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT] =
-        hybrid_rd;
+  if (hybrid_rd < search_state->best_pred_rd[REFERENCE_MODE_SELECT])
+    search_state->best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+}
+
+// Does a transform search over a list of the best inter mode candidates.
+// This is called if the original mode search computed an RD estimate
+// for the transform search rather than doing a full search.
+static void tx_search_best_inter_candidates(
+    AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+    int64_t best_rd_so_far, BLOCK_SIZE bsize,
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int mi_row, int mi_col,
+    InterModeSearchState *search_state, RD_STATS *rd_cost,
+    PICK_MODE_CONTEXT *ctx, int64_t *yrd) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int num_planes = av1_num_planes(cm);
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  InterModesInfo *inter_modes_info = x->inter_modes_info;
+  inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+  search_state->best_rd = best_rd_so_far;
+  search_state->best_mode_index = THR_INVALID;
+  // Initialize best mode stats for winner mode processing
+  x->winner_mode_count = 0;
+  store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+                          NULL, bsize, best_rd_so_far,
+                          cpi->sf.winner_mode_sf.multi_winner_mode_type, 0);
+  inter_modes_info->num =
+      inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search
+          ? inter_modes_info->num
+          : cpi->sf.rt_sf.num_inter_modes_for_tx_search;
+  const int64_t top_est_rd =
+      inter_modes_info->num > 0
+          ? inter_modes_info
+                ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
+          : INT64_MAX;
+  *yrd = INT64_MAX;
+  int64_t best_rd_in_this_partition = INT64_MAX;
+  // Iterate over best inter mode candidates and perform tx search
+  for (int j = 0; j < inter_modes_info->num; ++j) {
+    const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+    *mbmi = inter_modes_info->mbmi_arr[data_idx];
+    int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+    if (curr_est_rd * 0.80 > top_est_rd) break;
+
+    txfm_info->skip_txfm = 0;
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+    // Select prediction reference frames.
+    const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+      if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
+
+    // Build the prediction for this mode
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                  av1_num_planes(cm) - 1);
+    if (mbmi->motion_mode == OBMC_CAUSAL) {
+      av1_build_obmc_inter_predictors_sb(cm, xd);
+    }
+
+    // Initialize RD stats
+    RD_STATS rd_stats;
+    RD_STATS rd_stats_y;
+    RD_STATS rd_stats_uv;
+    const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+    int64_t skip_rd = INT64_MAX;
+    if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+      // Check if the mode is good enough based on skip RD
+      int64_t curr_sse = inter_modes_info->sse_arr[data_idx];
+      skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse);
+      int eval_txfm =
+          check_txfm_eval(x, bsize, search_state->best_skip_rd[0], skip_rd,
+                          cpi->sf.inter_sf.txfm_rd_gate_level, 0);
+      if (!eval_txfm) continue;
+    }
+
+    int64_t this_yrd = INT64_MAX;
+    // Do the transform search
+    if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+                         mode_rate, search_state->best_rd)) {
+      continue;
+    } else {
+      const int y_rate =
+          rd_stats.skip_txfm
+              ? mode_costs->skip_txfm_cost[skip_ctx][1]
+              : (rd_stats_y.rate + mode_costs->skip_txfm_cost[skip_ctx][0]);
+      this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y.dist);
+
+      if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+        inter_mode_data_push(
+            tile_data, mbmi->bsize, rd_stats.sse, rd_stats.dist,
+            rd_stats_y.rate + rd_stats_uv.rate +
+                mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
+      }
+    }
+    rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+    if (rd_stats.rdcost < best_rd_in_this_partition) {
+      best_rd_in_this_partition = rd_stats.rdcost;
+      *yrd = this_yrd;
+    }
+
+    const THR_MODES mode_enum = get_prediction_mode_idx(
+        mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+    // Collect mode stats for multiwinner mode processing
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum,
+        NULL, bsize, rd_stats.rdcost,
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+
+    if (rd_stats.rdcost < search_state->best_rd) {
+      update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+                          &rd_stats_uv, mode_enum, x, txfm_search_done);
+      search_state->best_skip_rd[0] = skip_rd;
+    }
+  }
 }
 
 // Indicates number of winner simple translation modes to be used
@@ -4419,19 +4978,231 @@
   }
 }
 
-void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
-                               MACROBLOCK *x, RD_STATS *rd_cost,
-                               const BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                               int64_t best_rd_so_far) {
+/*!\brief Search intra modes in interframes
+ *
+ * \ingroup intra_mode_search
+ *
+ * This function searches for the best intra mode when the current frame is an
+ * interframe. The list of luma intra mode candidates to be searched are stored
+ * in InterModeSFArgs::intra_mode_idx_ls. This function however does *not*
+ * handle luma palette mode. Palette mode is currently handled by \ref
+ * av1_search_palette_mode.
+ *
+ * This function will first iterate through the luma mode candidates to find the
+ * best luma intra mode. Once the best luma mode it's found, it will then search
+ * for the best chroma mode. Because palette mode is currently not handled by
+ * here, a cache of uv mode is stored in
+ * InterModeSearchState::intra_search_state so it can be reused later by \ref
+ * av1_search_palette_mode.
+ *
+ * \return Returns the rdcost of the current intra-mode if it's available,
+ * otherwise returns INT64_MAX. The corresponding values in x->e_mbd.mi[0],
+ * rd_stats, rd_stats_y/uv, and best_intra_rd are also updated. Moreover, in the
+ * first evocation of the function, the chroma intra mode result is cached in
+ * intra_search_state to be used in subsequent calls. In the first evaluation
+ * with directional mode, a prune_mask computed with histogram of gradient is
+ * also stored in intra_search_state.
+ *
+ * \param[in,out] search_state      Struct keep track of the prediction mode
+ *                                  search state in interframe.
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     x                 Pointer to struct holding all the data for
+ *                                  the current prediction block.
+ * \param[out]    rd_cost           Stores the best rd_cost among all the
+ *                                  prediction modes searched.
+ * \param[in]     bsize             Current block size.
+ * \param[in,out] ctx               Structure to hold the number of 4x4 blks to
+ *                                  copy the tx_type and txfm_skip arrays.
+ *                                  for only the Y plane.
+ * \param[in,out] sf_args           Stores the list of intra mode candidates
+ *                                  to be searched.
+ * \param[in]     intra_ref_frame_cost  The entropy cost for signaling that the
+ *                                      current ref frame is an intra frame.
+ * \param[in]     yrd_threshold     The rdcost threshold for luma intra mode to
+ *                                  terminate chroma intra mode search.
+ *
+ * \return Returns INT64_MAX if the determined motion mode is invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * motion mode search is a success.
+ */
+static AOM_INLINE void search_intra_modes_in_interframe(
+    InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x,
+    RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+    InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost,
+    int64_t yrd_threshold) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  IntraModeSearchState *intra_search_state = &search_state->intra_search_state;
+
+  int is_best_y_mode_intra = 0;
+  RD_STATS best_intra_rd_stats_y;
+  int64_t best_rd_y = INT64_MAX;
+  int best_mode_cost_y = -1;
+  MB_MODE_INFO best_mbmi = *xd->mi[0];
+  THR_MODES best_mode_enum = THR_INVALID;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int num_4x4 = bsize_to_num_blk(bsize);
+
+  // Performs luma search
+  for (int j = 0; j < sf_args->intra_mode_num; ++j) {
+    if (sf->intra_sf.skip_intra_in_interframe &&
+        search_state->intra_search_state.skip_intra_modes)
+      break;
+    const THR_MODES mode_enum = sf_args->intra_mode_idx_ls[j];
+    const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+    const PREDICTION_MODE this_mode = mode_def->mode;
+
+    assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
+    assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
+    init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm);
+    x->txfm_search_info.skip_txfm = 0;
+
+    if (this_mode != DC_PRED) {
+      // Only search the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+          (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+        if (search_state->best_mode_index != THR_INVALID &&
+            search_state->best_mbmode.ref_frame[0] > INTRA_FRAME)
+          continue;
+      }
+      if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(
+                this_mode, search_state->intra_search_state.best_intra_mode))
+          continue;
+      }
+    }
+
+    RD_STATS intra_rd_stats_y;
+    int mode_cost_y;
+    int64_t intra_rd_y = INT64_MAX;
+    const int is_luma_result_valid = av1_handle_intra_y_mode(
+        intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx,
+        &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y);
+    if (is_luma_result_valid && intra_rd_y < yrd_threshold) {
+      is_best_y_mode_intra = 1;
+      if (intra_rd_y < best_rd_y) {
+        best_intra_rd_stats_y = intra_rd_stats_y;
+        best_mode_cost_y = mode_cost_y;
+        best_rd_y = intra_rd_y;
+        best_mbmi = *mbmi;
+        best_mode_enum = mode_enum;
+        memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+               sizeof(best_blk_skip[0]) * num_4x4);
+        av1_copy_array(best_tx_type_map, xd->tx_type_map, num_4x4);
+      }
+    }
+  }
+
+  if (!is_best_y_mode_intra) {
+    return;
+  }
+
+  assert(best_rd_y < INT64_MAX);
+
+  // Restores the best luma mode
+  *mbmi = best_mbmi;
+  memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+         sizeof(best_blk_skip[0]) * num_4x4);
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, num_4x4);
+
+  // Performs chroma search
+  RD_STATS intra_rd_stats, intra_rd_stats_uv;
+  av1_init_rd_stats(&intra_rd_stats);
+  av1_init_rd_stats(&intra_rd_stats_uv);
+  const int num_planes = av1_num_planes(cm);
+  if (num_planes > 1) {
+    const int intra_uv_mode_valid = av1_search_intra_uv_modes_in_interframe(
+        intra_search_state, cpi, x, bsize, &intra_rd_stats,
+        &best_intra_rd_stats_y, &intra_rd_stats_uv, search_state->best_rd);
+
+    if (!intra_uv_mode_valid) {
+      return;
+    }
+  }
+
+  // Merge the luma and chroma rd stats
+  assert(best_mode_cost_y >= 0);
+  intra_rd_stats.rate = best_intra_rd_stats_y.rate + best_mode_cost_y;
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+    // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+    // in the tokenonly rate, but for intra blocks, tx_size is always coded
+    // (prediction granularity), so we account for it in the full rate,
+    // not the tokenonly rate.
+    best_intra_rd_stats_y.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const PREDICTION_MODE mode = mbmi->mode;
+  if (num_planes > 1 && xd->is_chroma_ref) {
+    const int uv_mode_cost =
+        mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode];
+    intra_rd_stats.rate +=
+        intra_rd_stats_uv.rate +
+        intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+  }
+  if (mode != DC_PRED && mode != PAETH_PRED) {
+    const int intra_cost_penalty = av1_get_intra_cost_penalty(
+        cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
+        cm->seq_params.bit_depth);
+    intra_rd_stats.rate += intra_cost_penalty;
+  }
+
+  // Intra block is always coded as non-skip
+  intra_rd_stats.skip_txfm = 0;
+  intra_rd_stats.dist = best_intra_rd_stats_y.dist + intra_rd_stats_uv.dist;
+  // Add in the cost of the no skip flag.
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  intra_rd_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+  // Calculate the final RD estimate for this mode.
+  const int64_t this_rd =
+      RDCOST(x->rdmult, intra_rd_stats.rate, intra_rd_stats.dist);
+  // Keep record of best intra rd
+  if (this_rd < search_state->best_intra_rd) {
+    search_state->best_intra_rd = this_rd;
+    intra_search_state->best_intra_mode = mode;
+  }
+
+  for (int i = 0; i < REFERENCE_MODES; ++i) {
+    search_state->best_pred_rd[i] =
+        AOMMIN(search_state->best_pred_rd[i], this_rd);
+  }
+
+  intra_rd_stats.rdcost = this_rd;
+
+  // Collect mode stats for multiwinner mode processing
+  const int txfm_search_done = 1;
+  store_winner_mode_stats(
+      &cpi->common, x, mbmi, &intra_rd_stats, &best_intra_rd_stats_y,
+      &intra_rd_stats_uv, best_mode_enum, NULL, bsize, intra_rd_stats.rdcost,
+      cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+  if (intra_rd_stats.rdcost < search_state->best_rd) {
+    update_search_state(search_state, rd_cost, ctx, &intra_rd_stats,
+                        &best_intra_rd_stats_y, &intra_rd_stats_uv,
+                        best_mode_enum, x, txfm_search_done);
+  }
+}
+
+// TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb.
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+                            struct macroblock *x, struct RD_STATS *rd_cost,
+                            BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                            int64_t best_rd_so_far) {
   AV1_COMMON *const cm = &cpi->common;
   const FeatureFlags *const features = &cm->features;
   const int num_planes = av1_num_planes(cm);
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   int i;
+  const ModeCosts *mode_costs = &x->mode_costs;
   const int *comp_inter_cost =
-      x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+      mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
 
   InterModeSearchState search_state;
   init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far);
@@ -4453,10 +5224,13 @@
                                search_state.simple_rd,
                                0,
                                interintra_modes,
-                               1,
-                               NULL,
                                { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
-                               0 };
+                               0,
+                               -1,
+                               -1,
+                               -1,
+                               { 0 } };
+  for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1;
   // Indicates the appropriate number of simple translation winner modes for
   // exhaustive motion mode evaluation
   const int max_winner_motion_mode_cand =
@@ -4474,10 +5248,14 @@
 
   av1_invalid_rd_stats(rd_cost);
 
+  for (i = 0; i < REF_FRAMES; ++i) {
+    x->warp_sample_info[i].num = -1;
+  }
+
   // Ref frames that are selected by square partition blocks.
   int picked_ref_frames_mask = 0;
   if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions &&
-      mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+      mbmi->partition != PARTITION_NONE) {
     // prune_ref_frame_for_rect_partitions = 1 implies prune only extended
     // partition blocks. prune_ref_frame_for_rect_partitions >=2
     // implies prune for vert, horiz and extended partition blocks.
@@ -4513,11 +5291,8 @@
   InterModesInfo *inter_modes_info = x->inter_modes_info;
   inter_modes_info->num = 0;
 
-  int intra_mode_num = 0;
-  int intra_mode_idx_ls[INTRA_MODES];
-
   // Temporary buffers used by handle_inter_mode().
-  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
+  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
 
   // The best RD found for the reference frame, among single reference modes.
   // Note that the 0-th element will contain a cut-off that is later used
@@ -4525,7 +5300,6 @@
   int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX,
                                        INT64_MAX, INT64_MAX, INT64_MAX,
                                        INT64_MAX, INT64_MAX };
-  const int skip_ctx = av1_get_skip_context(xd);
 
   // Prepared stats used later to check if we could skip intra mode eval.
   int64_t inter_cost = -1;
@@ -4538,9 +5312,9 @@
   PruneInfoFromTpl inter_cost_info_from_tpl;
 #if !CONFIG_REALTIME_ONLY
   if (cpi->sf.inter_sf.prune_inter_modes_based_on_tpl) {
-    // x->search_ref_frame[id] = 1 => no pruning in
+    // x->tpl_keep_ref_frame[id] = 1 => no pruning in
     // prune_ref_by_selective_ref_frame()
-    // x->search_ref_frame[id] = 0  => ref frame can be pruned in
+    // x->tpl_keep_ref_frame[id] = 0  => ref frame can be pruned in
     // prune_ref_by_selective_ref_frame()
     // Populating valid_refs[idx] = 1 ensures that
     // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a
@@ -4549,7 +5323,7 @@
     for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
       const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME };
       valid_refs[frame - 1] =
-          x->search_ref_frame[frame] ||
+          x->tpl_keep_ref_frame[frame] ||
           !prune_ref_by_selective_ref_frame(
               cpi, x, refs, cm->cur_frame->ref_display_order_hint);
     }
@@ -4562,22 +5336,27 @@
       (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1;
   if (do_pruning && sf->intra_sf.skip_intra_in_interframe) {
     // Only consider full SB.
-    int len = tpl_blocks_in_sb(cm->seq_params.sb_size);
-    if (len == x->valid_cost_b) {
-      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+    const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+    const int tpl_bsize_1d = cpi->tpl_data.tpl_bsize_1d;
+    const int len = (block_size_wide[sb_size] / tpl_bsize_1d) *
+                    (block_size_high[sb_size] / tpl_bsize_1d);
+    SuperBlockEnc *sb_enc = &x->sb_enc;
+    if (sb_enc->tpl_data_count == len) {
+      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
+      const int tpl_stride = sb_enc->tpl_stride;
       const int tplw = mi_size_wide[tpl_bsize];
       const int tplh = mi_size_high[tpl_bsize];
       const int nw = mi_size_wide[bsize] / tplw;
       const int nh = mi_size_high[bsize] / tplh;
       if (nw >= 1 && nh >= 1) {
-        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
-        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
-        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
+        const int of_h = mi_row % mi_size_high[sb_size];
+        const int of_w = mi_col % mi_size_wide[sb_size];
+        const int start = of_h / tplh * tpl_stride + of_w / tplw;
 
         for (int k = 0; k < nh; k++) {
           for (int l = 0; l < nw; l++) {
-            inter_cost += x->inter_cost_b[start + k * x->cost_stride + l];
-            intra_cost += x->intra_cost_b[start + k * x->cost_stride + l];
+            inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l];
+            intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l];
           }
         }
         inter_cost /= nw * nh;
@@ -4589,10 +5368,9 @@
   // Initialize best mode stats for winner mode processing
   av1_zero(x->winner_mode_stats);
   x->winner_mode_count = 0;
-  store_winner_mode_stats(
-      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize,
-      best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-      0);
+  store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+                          NULL, bsize, best_rd_so_far,
+                          cpi->sf.winner_mode_sf.multi_winner_mode_type, 0);
 
   int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS);
   if (sf->inter_sf.prune_inter_modes_if_skippable) {
@@ -4607,10 +5385,14 @@
                               skip_ref_frame_mask,
                               0,
                               mode_thresh_mul_fact,
-                              intra_mode_idx_ls,
-                              &intra_mode_num,
+                              { 0 },
+                              0,
+                              0,
                               0 };
+  int64_t best_inter_yrd = INT64_MAX;
 
+  // This is the main loop of this function. It loops over all possible modes
+  // and calls handle_inter_mode() to compute the RD for each.
   // Here midx is just an iterator index that should not be used by itself
   // except to keep track of the number of modes searched. It should be used
   // with av1_default_mode_order to get the enum that defines the mode, which
@@ -4631,7 +5413,8 @@
 
     init_mbmi(mbmi, this_mode, ref_frames, cm);
 
-    x->force_skip = 0;
+    txfm_info->skip_txfm = 0;
+    sf_args.num_single_modes_processed += is_single_pred;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
     // Apply speed features to decide if this inter mode can be skipped
@@ -4649,7 +5432,6 @@
     mbmi->ref_mv_idx = 0;
 
     const int64_t ref_best_rd = search_state.best_rd;
-    int disable_skip = 0;
     RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
     av1_init_rd_stats(&rd_stats);
 
@@ -4657,7 +5439,7 @@
                                    ? ref_costs_comp[ref_frame][second_ref_frame]
                                    : ref_costs_single[ref_frame];
     const int compmode_cost =
-        is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
+        is_comp_ref_allowed(mbmi->bsize) ? comp_inter_cost[comp_pred] : 0;
     const int real_compmode_cost =
         cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
             ? compmode_cost
@@ -4668,26 +5450,33 @@
     args.single_newmv_valid = search_state.single_newmv_valid;
     args.single_comp_cost = real_compmode_cost;
     args.ref_frame_cost = ref_frame_cost;
-    if (is_single_pred) {
-      args.simple_rd_state = x->simple_rd_state[mode_enum];
-    }
 
     int64_t skip_rd[2] = { search_state.best_skip_rd[0],
                            search_state.best_skip_rd[1] };
+    int64_t this_yrd = INT64_MAX;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, handle_inter_mode_time);
+#endif
     int64_t this_rd = handle_inter_mode(
-        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
-        &disable_skip, &args, ref_best_rd, tmp_buf, &x->comp_rd_buffer,
-        &best_est_rd, do_tx_search, inter_modes_info, &motion_mode_cand,
-        skip_rd, &inter_cost_info_from_tpl);
-
+        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &args,
+        ref_best_rd, tmp_buf, &x->comp_rd_buffer, &best_est_rd, do_tx_search,
+        inter_modes_info, &motion_mode_cand, skip_rd, &inter_cost_info_from_tpl,
+        &this_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, handle_inter_mode_time);
+#endif
     if (sf->inter_sf.prune_comp_search_by_single_result > 0 &&
-        is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) {
+        is_inter_singleref_mode(this_mode)) {
       collect_single_states(x, &search_state, mbmi);
     }
 
+    if (sf->inter_sf.prune_comp_using_best_single_mode_ref > 0 &&
+        is_inter_singleref_mode(this_mode))
+      update_best_single_mode(&search_state, this_mode, ref_frame, this_rd);
+
     if (this_rd == INT64_MAX) continue;
 
-    if (mbmi->skip) {
+    if (mbmi->skip_txfm) {
       rd_stats_y.rate = 0;
       rd_stats_uv.rate = 0;
     }
@@ -4702,6 +5491,7 @@
       assert(IMPLIES(comp_pred,
                      cm->current_frame.reference_mode != SINGLE_REFERENCE));
       search_state.best_pred_sse = x->pred_sse[ref_frame];
+      best_inter_yrd = this_yrd;
       update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
                           &rd_stats_uv, mode_enum, x, do_tx_search);
       if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0];
@@ -4716,108 +5506,34 @@
     }
 
     /* keep record of best compound/single-only prediction */
-    if (!disable_skip) {
-      record_best_compound(cm->current_frame.reference_mode, &rd_stats,
-                           comp_pred, x->rdmult, &search_state, compmode_cost);
-    }
+    record_best_compound(cm->current_frame.reference_mode, &rd_stats, comp_pred,
+                         x->rdmult, &search_state, compmode_cost);
   }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
   if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
     // For the single ref winner candidates, evaluate other motion modes (non
     // simple translation).
     evaluate_motion_mode_for_winner_candidates(
         cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb,
         &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd,
-        &search_state);
+        &search_state, &best_inter_yrd);
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, do_tx_search_time);
 #endif
   if (do_tx_search != 1) {
-    inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
-    search_state.best_rd = best_rd_so_far;
-    search_state.best_mode_index = THR_INVALID;
-    // Initialize best mode stats for winner mode processing
-    x->winner_mode_count = 0;
-    store_winner_mode_stats(
-        &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize,
-        best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-        do_tx_search);
-    inter_modes_info->num =
-        inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search
-            ? inter_modes_info->num
-            : cpi->sf.rt_sf.num_inter_modes_for_tx_search;
-    const int64_t top_est_rd =
-        inter_modes_info->num > 0
-            ? inter_modes_info
-                  ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
-            : INT64_MAX;
-    for (int j = 0; j < inter_modes_info->num; ++j) {
-      const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
-      *mbmi = inter_modes_info->mbmi_arr[data_idx];
-      int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
-      if (curr_est_rd * 0.80 > top_est_rd) break;
-
-      x->force_skip = 0;
-      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-      // Select prediction reference frames.
-      const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
-      for (i = 0; i < num_planes; i++) {
-        xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
-        if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
-      }
-
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
-                                    av1_num_planes(cm) - 1);
-      if (mbmi->motion_mode == OBMC_CAUSAL) {
-        av1_build_obmc_inter_predictors_sb(cm, xd);
-      }
-
-      RD_STATS rd_stats;
-      RD_STATS rd_stats_y;
-      RD_STATS rd_stats_uv;
-      const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
-      int64_t skip_rd = INT64_MAX;
-      if (cpi->sf.inter_sf.txfm_rd_gate_level) {
-        // Check if the mode is good enough based on skip RD
-        int64_t curr_sse = inter_modes_info->sse_arr[data_idx];
-        skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse);
-        int eval_txfm =
-            check_txfm_eval(x, bsize, search_state.best_skip_rd[0], skip_rd,
-                            cpi->sf.inter_sf.txfm_rd_gate_level, 0);
-        if (!eval_txfm) continue;
-      }
-
-      if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
-                           mode_rate, search_state.best_rd)) {
-        continue;
-      } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
-        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
-                             rd_stats.dist,
-                             rd_stats_y.rate + rd_stats_uv.rate +
-                                 x->skip_cost[skip_ctx][mbmi->skip]);
-      }
-      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
-
-      const THR_MODES mode_enum = get_prediction_mode_idx(
-          mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-      // Collect mode stats for multiwinner mode processing
-      const int txfm_search_done = 1;
-      store_winner_mode_stats(
-          &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
-          mode_enum, NULL, bsize, rd_stats.rdcost,
-          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-          txfm_search_done);
-
-      if (rd_stats.rdcost < search_state.best_rd) {
-        update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
-                            &rd_stats_uv, mode_enum, x, txfm_search_done);
-        search_state.best_skip_rd[0] = skip_rd;
-      }
-    }
+    // A full tx search has not yet been done, do tx search for
+    // top mode candidates
+    tx_search_best_inter_candidates(cpi, tile_data, x, best_rd_so_far, bsize,
+                                    yv12_mb, mi_row, mi_col, &search_state,
+                                    rd_cost, ctx, &best_inter_yrd);
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, do_tx_search_time);
@@ -4826,7 +5542,6 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, handle_intra_mode_time);
 #endif
-
   // Gate intra mode evaluation if best of inter is skip except when source
   // variance is extremely low
   if (sf->intra_sf.skip_intra_in_interframe &&
@@ -4840,7 +5555,7 @@
       float scores[2] = { 0.0f };
       float probs[2] = { 0.0f };
 
-      nn_features[0] = (float)search_state.best_mbmode.skip;
+      nn_features[0] = (float)search_state.best_mbmode.skip_txfm;
       nn_features[1] = (float)mi_size_wide_log2[bsize];
       nn_features[2] = (float)mi_size_high_log2[bsize];
       nn_features[3] = (float)intra_cost;
@@ -4854,68 +5569,22 @@
       av1_nn_softmax(scores, probs, 2);
 
       if (probs[1] > 0.8) search_state.intra_search_state.skip_intra_modes = 1;
-    } else if ((search_state.best_mbmode.skip) &&
+    } else if ((search_state.best_mbmode.skip_txfm) &&
                (sf->intra_sf.skip_intra_in_interframe >= 2)) {
       search_state.intra_search_state.skip_intra_modes = 1;
     }
   }
 
-  const int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
-  for (int j = 0; j < intra_mode_num; ++j) {
-    if (sf->intra_sf.skip_intra_in_interframe &&
-        search_state.intra_search_state.skip_intra_modes)
-      break;
-    const THR_MODES mode_enum = intra_mode_idx_ls[j];
-    const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
-    const PREDICTION_MODE this_mode = mode_def->mode;
-
-    assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
-    assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
-    init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm);
-    x->force_skip = 0;
-
-    if (this_mode != DC_PRED) {
-      // Only search the oblique modes if the best so far is
-      // one of the neighboring directional modes
-      if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-          (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
-        if (search_state.best_mode_index != THR_INVALID &&
-            search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
-          continue;
-      }
-      if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-        if (conditional_skipintra(
-                this_mode, search_state.intra_search_state.best_intra_mode))
-          continue;
-      }
-    }
-
-    RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
-    intra_rd_stats.rdcost = av1_handle_intra_mode(
-        &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
-        ctx, 0, &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv,
-        search_state.best_rd, &search_state.best_intra_rd,
-        search_state.best_mbmode.skip);
-    // Collect mode stats for multiwinner mode processing
-    const int txfm_search_done = 1;
-    store_winner_mode_stats(
-        &cpi->common, x, mbmi, &intra_rd_stats, &intra_rd_stats_y,
-        &intra_rd_stats_uv, mode_enum, NULL, bsize, intra_rd_stats.rdcost,
-        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-        txfm_search_done);
-    if (intra_rd_stats.rdcost < search_state.best_rd) {
-      update_search_state(&search_state, rd_cost, ctx, &intra_rd_stats,
-                          &intra_rd_stats_y, &intra_rd_stats_uv, mode_enum, x,
-                          txfm_search_done);
-    }
-  }
+  const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
+  search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx,
+                                   &sf_args, intra_ref_frame_cost,
+                                   best_inter_yrd);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, handle_intra_mode_time);
 #endif
 
-  int winner_mode_count = cpi->sf.winner_mode_sf.enable_multiwinner_mode_process
-                              ? x->winner_mode_count
-                              : 1;
+  int winner_mode_count =
+      cpi->sf.winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1;
   // In effect only when fast tx search speed features are enabled.
   refine_winner_mode_tx(
       cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index,
@@ -4927,16 +5596,21 @@
 
   // Only try palette mode when the best mode so far is an intra mode.
   const int try_palette =
-      cpi->oxcf.enable_palette &&
-      av1_allow_palette(features->allow_screen_content_tools, mbmi->sb_type) &&
-      !is_inter_mode(search_state.best_mbmode.mode);
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      cpi->oxcf.tool_cfg.enable_palette &&
+      av1_allow_palette(features->allow_screen_content_tools, mbmi->bsize) &&
+      !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate != INT_MAX;
   RD_STATS this_rd_cost;
   int this_skippable = 0;
   if (try_palette) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_search_palette_mode_time);
+#endif
     this_skippable = av1_search_palette_mode(
-        cpi, x, &this_rd_cost, ctx, bsize, mbmi, pmi, ref_costs_single,
-        &search_state.intra_search_state, search_state.best_rd);
+        &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
+        ctx, &this_rd_cost, search_state.best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_search_palette_mode_time);
+#endif
     if (this_rd_cost.rdcost < search_state.best_rd) {
       search_state.best_mode_index = THR_DC;
       mbmi->mv[0].as_int = 0;
@@ -4947,8 +5621,8 @@
       search_state.best_mbmode = *mbmi;
       search_state.best_skip2 = 0;
       search_state.best_mode_skippable = this_skippable;
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, txfm_info->blk_skip,
+             sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
       av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
     }
   }
@@ -4997,7 +5671,7 @@
 
   // macroblock modes
   *mbmi = search_state.best_mbmode;
-  x->force_skip |= search_state.best_skip2;
+  txfm_info->skip_txfm |= search_state.best_skip2;
 
   // Note: this section is needed since the mode may have been forced to
   // GLOBALMV by the all-zero mode handling of ref-mv.
@@ -5012,16 +5686,15 @@
   }
 
   for (i = 0; i < REFERENCE_MODES; ++i) {
-    if (search_state.intra_search_state.best_pred_rd[i] == INT64_MAX) {
+    if (search_state.best_pred_rd[i] == INT64_MAX) {
       search_state.best_pred_diff[i] = INT_MIN;
     } else {
       search_state.best_pred_diff[i] =
-          search_state.best_rd -
-          search_state.intra_search_state.best_pred_rd[i];
+          search_state.best_rd - search_state.best_pred_rd[i];
     }
   }
 
-  x->force_skip |= search_state.best_mode_skippable;
+  txfm_info->skip_txfm |= search_state.best_mode_skippable;
 
   assert(search_state.best_mode_index != THR_INVALID);
 
@@ -5034,7 +5707,7 @@
                        search_state.best_mode_skippable);
 #endif  // CONFIG_INTERNAL_STATS
 
-  if (pmi->palette_size[1] > 0) {
+  if (mbmi->palette_mode_info.palette_size[1] > 0) {
     assert(try_palette);
     av1_restore_uv_color_map(cpi, x);
   }
@@ -5056,7 +5729,9 @@
   int64_t best_pred_diff[REFERENCE_MODES];
   unsigned int ref_costs_single[REF_FRAMES];
   unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
-  int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *comp_inter_cost =
+      mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
   InterpFilter best_filter = SWITCHABLE;
   int64_t this_rd = INT64_MAX;
   int rate2 = 0;
@@ -5067,7 +5742,7 @@
 
   av1_collect_neighbors_ref_counts(xd);
 
-  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+  estimate_ref_frame_costs(cm, xd, mode_costs, segment_id, ref_costs_single,
                            ref_costs_comp);
 
   for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
@@ -5094,7 +5769,7 @@
                            mi_row, features->cur_frame_force_integer_mv)
           .as_int;
   mbmi->tx_size = max_txsize_lookup[bsize];
-  x->force_skip = 1;
+  x->txfm_search_info.skip_txfm = 1;
 
   mbmi->ref_mv_idx = 0;
 
@@ -5104,9 +5779,10 @@
     int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
     mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
     // Select the samples according to motion vector difference
-    if (mbmi->num_proj_ref > 1)
+    if (mbmi->num_proj_ref > 1) {
       mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
                                              mbmi->num_proj_ref, bsize);
+    }
   }
 
   const InterpFilter interp_filter = features->interp_filter;
@@ -5116,14 +5792,13 @@
     best_filter = interp_filter;
   } else {
     best_filter = EIGHTTAP_REGULAR;
-    if (av1_is_interp_needed(xd) &&
-        x->source_variance >=
-            cpi->sf.interp_sf.disable_filter_search_var_thresh) {
+    if (av1_is_interp_needed(xd)) {
       int rs;
       int best_rs = INT_MAX;
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         mbmi->interp_filters = av1_broadcast_interp_filter(i);
-        rs = av1_get_switchable_rate(x, xd, interp_filter);
+        rs = av1_get_switchable_rate(x, xd, interp_filter,
+                                     cm->seq_params.enable_dual_filter);
         if (rs < best_rs) {
           best_rs = rs;
           best_filter = mbmi->interp_filters.as_filters.y_filter;
@@ -5133,7 +5808,8 @@
   }
   // Set the appropriate filter
   mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
-  rate2 += av1_get_switchable_rate(x, xd, interp_filter);
+  rate2 += av1_get_switchable_rate(x, xd, interp_filter,
+                                   cm->seq_params.enable_dual_filter);
 
   if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT)
     rate2 += comp_inter_cost[comp_pred];
@@ -5171,12 +5847,14 @@
 #endif  // CONFIG_INTERNAL_STATS
 }
 
+/*!\cond */
 struct calc_target_weighted_pred_ctxt {
-  const MACROBLOCK *x;
+  const OBMCBuffer *obmc_buffer;
   const uint8_t *tmp;
   int tmp_stride;
   int overlap;
 };
+/*!\endcond */
 
 static INLINE void calc_target_weighted_pred_above(
     MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
@@ -5192,8 +5870,8 @@
   const int bw = xd->width << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
-  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
-  int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
+  int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_col * MI_SIZE);
+  int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_col * MI_SIZE);
   const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
   const int is_hbd = is_cur_buf_hbd(xd);
 
@@ -5240,8 +5918,8 @@
   const int bw = xd->width << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
-  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
-  int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
+  int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_row * MI_SIZE * bw);
+  int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_row * MI_SIZE * bw);
   const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
   const int is_hbd = is_cur_buf_hbd(xd);
 
@@ -5318,11 +5996,12 @@
     const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
     const uint8_t *above, int above_stride, const uint8_t *left,
     int left_stride) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int bw = xd->width << MI_SIZE_LOG2;
   const int bh = xd->height << MI_SIZE_LOG2;
-  int32_t *mask_buf = x->mask_buf;
-  int32_t *wsrc_buf = x->wsrc_buf;
+  const OBMCBuffer *obmc_buffer = &x->obmc_buffer;
+  int32_t *mask_buf = obmc_buffer->mask;
+  int32_t *wsrc_buf = obmc_buffer->wsrc;
 
   const int is_hbd = is_cur_buf_hbd(xd);
   const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
@@ -5338,8 +6017,8 @@
   if (xd->up_available) {
     const int overlap =
         AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
-    struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
-                                                   overlap };
+    struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, above,
+                                                   above_stride, overlap };
     foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
                                   max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                   calc_target_weighted_pred_above, &ctxt);
@@ -5354,8 +6033,8 @@
   if (xd->left_available) {
     const int overlap =
         AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
-    struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
-                                                   overlap };
+    struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, left,
+                                                   left_stride, overlap };
     foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
                                  max_neighbor_obmc[mi_size_high_log2[bsize]],
                                  calc_target_weighted_pred_left, &ctxt);
@@ -5427,7 +6106,6 @@
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
   InterpFilterParams filter = { .filter_ptr = gauss_filter,
                                 .taps = 8,
-                                .subpel_shifts = 0,
                                 .interp_filter = EIGHTTAP_REGULAR };
   // Requirements from the vector-optimized implementations.
   assert(h % 4 == 0);
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index c7c99ac..df080b0 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -35,54 +35,137 @@
 struct macroblock;
 struct RD_STATS;
 
-// Returns the number of colors in 'src'.
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
-                     int *val_count);
-// Same as av1_count_colors(), but for high-bitdepth mode.
-int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth, int *val_count);
-
-static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
-                                    int plane, TX_SIZE tx_size) {
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const LV_MAP_COEFF_COST *const coeff_costs =
-      &x->coeff_costs[txs_ctx][plane_type];
-  return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
-}
-
+/*!\brief AV1 intra mode selection for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * Top level function for rd-based intra mode selection during intra frame
+ * encoding. This function will first search for the best luma prediction by
+ * calling av1_rd_pick_intra_sby_mode, then it searches for chroma prediction
+ * with av1_rd_pick_intra_sbuv_mode. If applicable, this function ends the
+ * search with an evaluation for intrabc.
+ *
+ * \param[in]    cpi            Top-level encoder structure.
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock.
+ * \param[in]    rd_cost        Struct to keep track of the RD information.
+ * \param[in]    bsize          Current block size.
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process.
+ * \param[in]    best_rd Best   RD seen for this block so far.
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
 void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
                                struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
-unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
-                                           const struct buf_2d *ref,
-                                           BLOCK_SIZE bs);
-unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
-                                                const struct buf_2d *ref,
-                                                BLOCK_SIZE bs, int bd);
+/*!\brief AV1 inter mode selection.
+ *
+ * \ingroup inter_mode_search
+ * \callgraph
+ * Top level function for inter mode selection. This function will loop over
+ * all possible inter modes and select the best one for the current block by
+ * computing the RD cost. The mode search and RD are computed in
+ * handle_inter_mode(), which is called from this function within the main
+ * loop.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+                                data/contexts/models for the tile during
+                                encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ * \param[in]    best_rd_so_far Best RD seen for this block so far
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+                            struct macroblock *x, struct RD_STATS *rd_cost,
+                            BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                            int64_t best_rd_so_far);
 
-void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
-                               struct TileDataEnc *tile_data,
-                               struct macroblock *x, struct RD_STATS *rd_cost,
-                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                               int64_t best_rd_so_far);
+/*!\brief AV1 intra mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Top level function for Non-RD optimized intra mode selection.
+ * This finction will loop over subset of intra modes and select the best one
+ * based on calculated modelled RD cost. Only 4 intra modes are checked as
+ * specified in \c intra_mode_list. When calculating RD cost Hadamard transform
+ * of residual is used to calculate rate. Estmation of RD cost is performed
+ * in \c estimate_block_intra which is called from this function
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
 
-void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
-                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
-
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This finction will loop over subset of inter modes and select the best one
+ * based on calculated modelled RD cost. While making decisions which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ *  need to be checked.
+ *  *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+                                data/contexts/models for the tile during
+                                encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
 void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
                                   struct TileDataEnc *tile_data,
                                   struct macroblock *x,
                                   struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
-                                  PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far);
+                                  PICK_MODE_CONTEXT *ctx);
 
 void av1_rd_pick_inter_mode_sb_seg_skip(
     const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
     struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
     BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
 
+// TODO(any): The defs below could potentially be moved to rdopt_utils.h instead
+// because they are not the main rdopt functions.
+/*!\cond */
 // The best edge strength seen in the block, as well as the best x and y
 // components of edge strength seen.
 typedef struct {
@@ -90,6 +173,7 @@
   uint16_t x;
   uint16_t y;
 } EdgeInfo;
+/*!\endcond */
 
 /** Returns an integer indicating the strength of the edge.
  * 0 means no edge found, 556 is the strength of a solid black/white edge,
@@ -106,11 +190,13 @@
 void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
                        uint8_t *dst, bool high_bd, int bd);
 
+/*!\cond */
 /* Applies standard 3x3 Sobel matrix. */
 typedef struct {
   int16_t x;
   int16_t y;
 } sobel_xy;
+/*!\endcond */
 
 sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
                    bool high_bd);
@@ -118,16 +204,11 @@
 void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
 
-#if !CONFIG_REALTIME_ONLY
 static INLINE int coded_to_superres_mi(int mi_col, int denom) {
   return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
 }
-#endif
 
-static INLINE int av1_encoder_get_relative_dist(const OrderHintInfo *oh, int a,
-                                                int b) {
-  if (!oh->enable_order_hint) return 0;
-
+static INLINE int av1_encoder_get_relative_dist(int a, int b) {
   assert(a >= 0 && b >= 0);
   return (a - b);
 }
@@ -160,7 +241,6 @@
 // This function prunes the mode if either of the reference frame falls in the
 // pruning list
 static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
-                            const OrderHintInfo *const order_hint_info,
                             const unsigned int *const ref_display_order_hint,
                             const unsigned int frame_display_order_hint,
                             const int *ref_frame_list) {
@@ -170,7 +250,6 @@
     if (ref_frame[0] == ref_frame_list[i] ||
         ref_frame[1] == ref_frame_list[i]) {
       if (av1_encoder_get_relative_dist(
-              order_hint_info,
               ref_display_order_hint[ref_frame_list[i] - LAST_FRAME],
               frame_display_order_hint) < 0)
         return 1;
@@ -186,8 +265,6 @@
   const SPEED_FEATURES *const sf = &cpi->sf;
   if (!sf->inter_sf.selective_ref_frame) return 0;
 
-  const AV1_COMMON *const cm = &cpi->common;
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
   const int comp_pred = ref_frame[1] > INTRA_FRAME;
 
   if (sf->inter_sf.selective_ref_frame >= 2 ||
@@ -195,11 +272,11 @@
     int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME };
 
     if (x != NULL) {
-      if (x->search_ref_frame[LAST3_FRAME]) ref_frame_list[0] = NONE_FRAME;
-      if (x->search_ref_frame[LAST2_FRAME]) ref_frame_list[1] = NONE_FRAME;
+      if (x->tpl_keep_ref_frame[LAST3_FRAME]) ref_frame_list[0] = NONE_FRAME;
+      if (x->tpl_keep_ref_frame[LAST2_FRAME]) ref_frame_list[1] = NONE_FRAME;
     }
 
-    if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint,
+    if (prune_ref(ref_frame, ref_display_order_hint,
                   ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME],
                   ref_frame_list))
       return 1;
@@ -209,11 +286,11 @@
     int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME };
 
     if (x != NULL) {
-      if (x->search_ref_frame[ALTREF2_FRAME]) ref_frame_list[0] = NONE_FRAME;
-      if (x->search_ref_frame[BWDREF_FRAME]) ref_frame_list[1] = NONE_FRAME;
+      if (x->tpl_keep_ref_frame[ALTREF2_FRAME]) ref_frame_list[0] = NONE_FRAME;
+      if (x->tpl_keep_ref_frame[BWDREF_FRAME]) ref_frame_list[1] = NONE_FRAME;
     }
 
-    if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint,
+    if (prune_ref(ref_frame, ref_display_order_hint,
                   ref_display_order_hint[LAST_FRAME - LAST_FRAME],
                   ref_frame_list))
       return 1;
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 53b410a..4063889 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -86,132 +86,132 @@
   { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
 
-  { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
-
-  { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
-
-  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-
   { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
 
+  { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
   { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
   { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
   { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
 
   { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
 
   // intra modes
@@ -334,18 +334,21 @@
   // Derive aggressiveness factor for gating the transform search
   // Lower value indicates more aggressiveness. Be more conservative (high
   // value) for (i) low quantizers (ii) regions where prediction is poor
-  const int scale[5] = { INT_MAX, 4, 3, 3, 2 };
+  const int scale[6] = { INT_MAX, 4, 3, 3, 2, 2 };
   const int qslope = 2 * (!is_luma_only);
+  const int level_to_qindex_map[6] = { 0, 0, 0, 0, 80, 100 };
   int aggr_factor = 1;
-  if (!is_luma_only) {
+  const int pred_qindex_thresh = level_to_qindex_map[level];
+  if (!is_luma_only && level <= 3) {
     aggr_factor = AOMMAX(
         1, ((MAXQ - x->qindex) * qslope + QINDEX_RANGE / 2) >> QINDEX_BITS);
   }
-  if (best_skip_rd >
-      (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS)))
+  if ((best_skip_rd >
+       (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS))) &&
+      (x->qindex >= pred_qindex_thresh))
     aggr_factor *= scale[level];
-  // For level setting 1, be more conservative for luma only case even when
-  // prediction is good
+  // For level setting 1, be more conservative for non-luma-only case even when
+  // prediction is good.
   else if ((level <= 1) && !is_luma_only)
     aggr_factor *= 2;
 
@@ -353,7 +356,7 @@
   // since best_skip_rd is computed after and skip_rd is computed (with 8-bit
   // prediction signals blended for WEDGE/DIFFWTD rather than 16-bit) before
   // interpolation filter search
-  const int luma_mul[5] = { INT_MAX, 32, 29, 20, 17 };
+  const int luma_mul[6] = { INT_MAX, 32, 29, 20, 17, 17 };
   int mul_factor = is_luma_only ? luma_mul[level] : 16;
   int64_t rd_thresh =
       (best_skip_rd == INT64_MAX)
@@ -384,11 +387,12 @@
   if (is_inter_block(mbmi)) {
     if (is_inter_mode(best_mode) &&
         sf->tx_sf.tx_type_search.fast_inter_tx_type_search &&
-        !cpi->oxcf.use_inter_dct_only)
+        !cpi->oxcf.txfm_cfg.use_inter_dct_only)
       return 1;
   } else {
     if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search &&
-        !cpi->oxcf.use_intra_default_tx_only && !cpi->oxcf.use_intra_dct_only)
+        !cpi->oxcf.txfm_cfg.use_intra_default_tx_only &&
+        !cpi->oxcf.txfm_cfg.use_intra_dct_only)
       return 1;
   }
 
@@ -404,55 +408,57 @@
 
 static INLINE void set_tx_size_search_method(
     const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params,
-    MACROBLOCK *x, int enable_winner_mode_for_tx_size_srch,
+    TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch,
     int is_winner_mode) {
   // Populate transform size search method/transform mode appropriately
-  x->tx_size_search_method =
+  txfm_params->tx_size_search_method =
       winner_mode_params->tx_size_search_methods[DEFAULT_EVAL];
   if (enable_winner_mode_for_tx_size_srch) {
     if (is_winner_mode)
-      x->tx_size_search_method =
+      txfm_params->tx_size_search_method =
           winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL];
     else
-      x->tx_size_search_method =
+      txfm_params->tx_size_search_method =
           winner_mode_params->tx_size_search_methods[MODE_EVAL];
   }
-  x->tx_mode_search_type = select_tx_mode(cm, x->tx_size_search_method);
+  txfm_params->tx_mode_search_type =
+      select_tx_mode(cm, txfm_params->tx_size_search_method);
 }
 
-static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, MACROBLOCK *x,
-                                     int enable_winner_mode_tx_type_pruning,
+static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf,
+                                     TxfmSearchParams *txfm_params,
+                                     int winner_mode_tx_type_pruning,
                                      int is_winner_mode) {
   // Populate prune transform mode appropriately
-  x->prune_mode = sf->tx_sf.tx_type_search.prune_mode;
-  if (enable_winner_mode_tx_type_pruning) {
-    if (is_winner_mode)
-      x->prune_mode = NO_PRUNE;
-    else
-      x->prune_mode = PRUNE_2D_AGGRESSIVE;
-  }
+  txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode;
+  if (!winner_mode_tx_type_pruning) return;
+
+  const int prune_mode[2][2] = { { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 },
+                                 { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 } };
+  txfm_params->prune_2d_txfm_mode =
+      prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode];
 }
 
 static INLINE void set_tx_domain_dist_params(
-    const WinnerModeParams *winner_mode_params, MACROBLOCK *x,
+    const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params,
     int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) {
   if (!enable_winner_mode_for_tx_domain_dist) {
-    x->use_transform_domain_distortion =
+    txfm_params->use_transform_domain_distortion =
         winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL];
-    x->tx_domain_dist_threshold =
+    txfm_params->tx_domain_dist_threshold =
         winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL];
     return;
   }
 
   if (is_winner_mode) {
-    x->use_transform_domain_distortion =
+    txfm_params->use_transform_domain_distortion =
         winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL];
-    x->tx_domain_dist_threshold =
+    txfm_params->tx_domain_dist_threshold =
         winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL];
   } else {
-    x->use_transform_domain_distortion =
+    txfm_params->use_transform_domain_distortion =
         winner_mode_params->use_transform_domain_distortion[MODE_EVAL];
-    x->tx_domain_dist_threshold =
+    txfm_params->tx_domain_dist_threshold =
         winner_mode_params->tx_domain_dist_threshold[MODE_EVAL];
   }
 }
@@ -464,82 +470,101 @@
   const AV1_COMMON *cm = &cpi->common;
   const SPEED_FEATURES *sf = &cpi->sf;
   const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params;
+  TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
 
   switch (mode_eval_type) {
     case DEFAULT_EVAL:
-      x->use_default_inter_tx_type = 0;
-      x->use_default_intra_tx_type = 0;
-      x->predict_skip_level =
-          winner_mode_params->predict_skip_level[DEFAULT_EVAL];
+      txfm_params->use_default_inter_tx_type = 0;
+      txfm_params->use_default_intra_tx_type = 0;
+      txfm_params->skip_txfm_level =
+          winner_mode_params->skip_txfm_level[DEFAULT_EVAL];
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[DEFAULT_EVAL];
       // Set default transform domain distortion type
-      set_tx_domain_dist_params(winner_mode_params, x, 0, 0);
+      set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0);
 
       // Get default threshold for R-D optimization of coefficients
-      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
+      txfm_params->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
           winner_mode_params->coeff_opt_dist_threshold, 0, 0);
+      txfm_params->coeff_opt_satd_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_satd_threshold, 0, 0);
+
       // Set default transform size search method
-      set_tx_size_search_method(cm, winner_mode_params, x, 0, 0);
+      set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0);
       // Set default transform type prune
-      set_tx_type_prune(sf, x, 0, 0);
+      set_tx_type_prune(sf, txfm_params, 0, 0);
       break;
     case MODE_EVAL:
-      x->use_default_intra_tx_type =
+      txfm_params->use_default_intra_tx_type =
           (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search ||
-           cpi->oxcf.use_intra_default_tx_only);
-      x->use_default_inter_tx_type =
+           cpi->oxcf.txfm_cfg.use_intra_default_tx_only);
+      txfm_params->use_default_inter_tx_type =
           cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_search;
-      x->predict_skip_level = winner_mode_params->predict_skip_level[MODE_EVAL];
-
+      txfm_params->skip_txfm_level =
+          winner_mode_params->skip_txfm_level[MODE_EVAL];
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[MODE_EVAL];
       // Set transform domain distortion type for mode evaluation
       set_tx_domain_dist_params(
-          winner_mode_params, x,
+          winner_mode_params, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0);
 
       // Get threshold for R-D optimization of coefficients during mode
       // evaluation
-      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
+      txfm_params->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
           winner_mode_params->coeff_opt_dist_threshold,
           sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+      txfm_params->coeff_opt_satd_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_satd_threshold,
+          sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+
       // Set the transform size search method for mode evaluation
       set_tx_size_search_method(
-          cm, winner_mode_params, x,
+          cm, winner_mode_params, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0);
       // Set transform type prune for mode evaluation
-      set_tx_type_prune(
-          sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning,
-          0);
+      set_tx_type_prune(sf, txfm_params,
+                        sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+                        0);
       break;
     case WINNER_MODE_EVAL:
-      x->use_default_inter_tx_type = 0;
-      x->use_default_intra_tx_type = 0;
-      x->predict_skip_level =
-          winner_mode_params->predict_skip_level[WINNER_MODE_EVAL];
+      txfm_params->use_default_inter_tx_type = 0;
+      txfm_params->use_default_intra_tx_type = 0;
+      txfm_params->skip_txfm_level =
+          winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL];
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[WINNER_MODE_EVAL];
 
       // Set transform domain distortion type for winner mode evaluation
       set_tx_domain_dist_params(
-          winner_mode_params, x,
+          winner_mode_params, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1);
 
       // Get threshold for R-D optimization of coefficients for winner mode
       // evaluation
-      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
+      txfm_params->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
           winner_mode_params->coeff_opt_dist_threshold,
           sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+      txfm_params->coeff_opt_satd_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_satd_threshold,
+          sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+
       // Set the transform size search method for winner mode evaluation
       set_tx_size_search_method(
-          cm, winner_mode_params, x,
+          cm, winner_mode_params, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
       // Set default transform type prune mode for winner mode evaluation
-      set_tx_type_prune(
-          sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning,
-          1);
+      set_tx_type_prune(sf, txfm_params,
+                        sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+                        1);
 
       // Reset hash state for winner mode processing. Winner mode and subsequent
       // transform/mode evaluations (palette/IntraBC) cann't reuse old data as
       // the decisions would have been sub-optimal
       // TODO(any): Move the evaluation of palette/IntraBC modes before winner
       // mode is processed and clean-up the code below
-      reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash);
+      reset_hash_records(txfm_info, cpi->sf.tx_sf.use_inter_txb_hash);
 
       break;
     default: assert(0);
@@ -575,24 +600,27 @@
 
 // Store best mode stats for winner mode processing
 static INLINE void store_winner_mode_stats(
-    const AV1_COMMON *const cm, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi,
     RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv,
     THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd,
-    int enable_multiwinner_mode_process, int txfm_search_done) {
+    int multi_winner_mode_type, int txfm_search_done) {
   WinnerModeStats *winner_mode_stats = x->winner_mode_stats;
   int mode_idx = 0;
   int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0;
   // Mode stat is not required when multiwinner mode processing is disabled
-  if (!enable_multiwinner_mode_process) return;
+  if (multi_winner_mode_type == MULTI_WINNER_MODE_OFF) return;
   // Ignore mode with maximum rd
   if (this_rd == INT64_MAX) return;
   // TODO(any): Winner mode processing is currently not applicable for palette
   // mode in Inter frames. Clean-up the following code, once support is added
   if (!frame_is_intra_only(cm) && is_palette_mode) return;
 
-  const int max_winner_mode_count = frame_is_intra_only(cm)
-                                        ? MAX_WINNER_MODE_COUNT_INTRA
-                                        : MAX_WINNER_MODE_COUNT_INTER;
+  int max_winner_mode_count = frame_is_intra_only(cm)
+                                  ? MAX_WINNER_MODE_COUNT_INTRA
+                                  : MAX_WINNER_MODE_COUNT_INTER;
+  max_winner_mode_count = (multi_winner_mode_type == MULTI_WINNER_MODE_FAST)
+                              ? AOMMIN(max_winner_mode_count, 2)
+                              : max_winner_mode_count;
   assert(x->winner_mode_count >= 0 &&
          x->winner_mode_count <= max_winner_mode_count);
 
@@ -619,14 +647,16 @@
   // Update rd stats required for inter frame
   if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) {
     const MACROBLOCKD *xd = &x->e_mbd;
-    const int skip_ctx = av1_get_skip_context(xd);
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
     const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END;
-    const int skip = mbmi->skip && !is_intra_mode;
+    const int skip_txfm = mbmi->skip_txfm && !is_intra_mode;
 
     winner_mode_stats[mode_idx].rd_cost = *rd_cost;
     if (txfm_search_done) {
       winner_mode_stats[mode_idx].rate_y =
-          rd_cost_y->rate + x->skip_cost[skip_ctx][rd_cost->skip || skip];
+          rd_cost_y->rate +
+          x->mode_costs
+              .skip_txfm_cost[skip_ctx][rd_cost->skip_txfm || skip_txfm];
       winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate;
     }
   }
@@ -645,6 +675,18 @@
       AOMMIN(x->winner_mode_count + 1, max_winner_mode_count);
 }
 
+unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs);
+
+unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd);
+
+static INLINE int is_mode_intra(PREDICTION_MODE mode) {
+  return mode < INTRA_MODE_END;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c
index 231b020..20da822 100644
--- a/av1/encoder/reconinter_enc.c
+++ b/av1/encoder/reconinter_enc.c
@@ -31,7 +31,8 @@
 static void enc_calc_subpel_params(const MV *const src_mv,
                                    InterPredParams *const inter_pred_params,
                                    MACROBLOCKD *xd, int mi_x, int mi_y, int ref,
-                                   uint8_t **pre, SubpelParams *subpel_params,
+                                   uint8_t **mc_buf, uint8_t **pre,
+                                   SubpelParams *subpel_params,
                                    int *src_stride) {
   // These are part of the function signature to use this function through a
   // function pointer. See typedef of 'CalcSubpelParamsFunc'.
@@ -39,6 +40,7 @@
   (void)mi_x;
   (void)mi_y;
   (void)ref;
+  (void)mc_buf;
 
   const struct scale_factors *sf = inter_pred_params->scale_factors;
 
@@ -73,16 +75,17 @@
 void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
                                        const MV *src_mv,
                                        InterPredParams *inter_pred_params) {
-  av1_build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params,
-                                NULL /* xd */, 0 /* mi_x */, 0 /* mi_y */,
-                                0 /* ref */, enc_calc_subpel_params);
+  av1_build_one_inter_predictor(
+      dst, dst_stride, src_mv, inter_pred_params, NULL /* xd */, 0 /* mi_x */,
+      0 /* mi_y */, 0 /* ref */, NULL /* mc_buf */, enc_calc_subpel_params);
 }
 
 static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                        int plane, const MB_MODE_INFO *mi,
                                        int bw, int bh, int mi_x, int mi_y) {
   av1_build_inter_predictors(cm, xd, plane, mi, 0 /* build_for_obmc */, bw, bh,
-                             mi_x, mi_y, enc_calc_subpel_params);
+                             mi_x, mi_y, NULL /* mc_buf */,
+                             enc_calc_subpel_params);
 }
 
 void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
@@ -136,18 +139,49 @@
   }
 }
 
+static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+                                   int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+                                   struct build_prediction_ctxt *ctxt,
+                                   const int num_planes) {
+  const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->bsize);
+  const int ref_mi_row = xd->mi_row + mi_row_offset;
+  const int ref_mi_col = xd->mi_col + mi_col_offset;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
+                     ctxt->tmp_width[plane], ctxt->tmp_height[plane],
+                     ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+
+  const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
+
+  const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+  const struct scale_factors *const sf =
+      get_ref_scale_factors_const(ctxt->cm, frame);
+
+  xd->block_ref_scale_factors[0] = sf;
+  if ((!av1_is_valid_scale(sf)))
+    aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Reference frame has invalid dimensions");
+
+  av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
+                       num_planes);
+}
+
 static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row,
                                          int rel_mi_col, uint8_t op_mi_size,
                                          int dir, MB_MODE_INFO *above_mbmi,
                                          void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  av1_setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt,
-                             num_planes);
+  setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt,
+                         num_planes);
 
   const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2;
   const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2;
 
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 
   InterPredParams inter_pred_params;
 
@@ -190,10 +224,10 @@
                                          int tmp_height[MAX_MB_PLANE],
                                          int tmp_stride[MAX_MB_PLANE]) {
   if (!xd->up_available) return;
-  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_right_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct build_prediction_ctxt ctxt = {
+    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, NULL
+  };
+  BLOCK_SIZE bsize = xd->mi[0]->bsize;
   foreach_overlappable_nb_above(cm, xd,
                                 max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                 build_obmc_prediction, &ctxt);
@@ -205,10 +239,10 @@
                                         int tmp_height[MAX_MB_PLANE],
                                         int tmp_stride[MAX_MB_PLANE]) {
   if (!xd->left_available) return;
-  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_bottom_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct build_prediction_ctxt ctxt = {
+    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, NULL
+  };
+  BLOCK_SIZE bsize = xd->mi[0]->bsize;
   foreach_overlappable_nb_left(cm, xd,
                                max_neighbor_obmc[mi_size_high_log2[bsize]],
                                build_obmc_prediction, &ctxt);
@@ -224,26 +258,7 @@
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-  if (is_cur_buf_hbd(xd)) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
-    dst_buf1[1] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
-    dst_buf1[2] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
-    dst_buf2[1] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
-    dst_buf2[2] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
-  } else {
-    dst_buf1[0] = xd->tmp_obmc_bufs[0];
-    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
-    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
-    dst_buf2[0] = xd->tmp_obmc_bufs[1];
-    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
-    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
-  }
+  av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
 
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
@@ -251,8 +266,8 @@
                                       dst_stride1);
   av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
                                      dst_stride2);
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf,
-                       mi_row, mi_col, 0, num_planes);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
+                       mi_col, 0, num_planes);
   av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
                                   dst_stride2);
 }
@@ -358,31 +373,28 @@
     if (is_hbd) {
       build_masked_compound_highbd(
           dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
-          mbmi->sb_type, h, w, xd->bd);
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->bsize,
+          h, w, xd->bd);
     } else {
       build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
-                            ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+                            ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize,
                             h, w);
     }
 #else
     build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
-                          ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
-                          h, w);
+                          ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h,
+                          w);
 #endif
   } else {
 #if CONFIG_AV1_HIGHBITDEPTH
     if (is_hbd) {
-      aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
-                               xd->bd);
+      aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0,
+                               CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h);
     } else {
-      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
-                        0, NULL, 0, w, h);
+      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
     }
 #else
-    aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, 0,
-                      NULL, 0, w, h);
+    aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
 #endif
   }
 }
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index 0c029c0..de17d57 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -62,7 +62,7 @@
 
   // Temporal prediction not allowed on key frames
   if (cm->current_frame.frame_type != KEY_FRAME) {
-    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+    const BLOCK_SIZE bsize = xd->mi[0]->bsize;
     // Test to see if the segment id matches the predicted value.
     const int pred_segment_id =
         cm->last_frame_seg_map
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index e03faec..206ac52 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -73,14 +73,21 @@
 // Index 2: Winner mode evaluation.
 // Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
 // feature is ON
-// There are 6 levels with increasing speed, mapping to vertical indices.
-static unsigned int coeff_opt_dist_thresholds[6][MODE_EVAL_TYPES] = {
+// There are 7 levels with increasing speed, mapping to vertical indices.
+static unsigned int coeff_opt_dist_thresholds[7][MODE_EVAL_TYPES] = {
   { UINT_MAX, UINT_MAX, UINT_MAX },
   { 3200, 250, UINT_MAX },
   { 1728, 142, UINT_MAX },
   { 864, 142, UINT_MAX },
   { 432, 86, UINT_MAX },
-  { 216, 86, UINT_MAX }
+  { 216, 86, UINT_MAX },
+  { 216, 0, UINT_MAX }
+};
+
+static unsigned int coeff_opt_satd_thresholds[3][MODE_EVAL_TYPES] = {
+  { UINT_MAX, UINT_MAX, UINT_MAX },
+  { 97, 16, UINT_MAX },
+  { 25, 10, UINT_MAX },
 };
 
 // Transform size to be used for default, mode and winner mode evaluation
@@ -105,30 +112,55 @@
                                                                 { 1, 1, 1 },
                                                                 { 1, 2, 1 } };
 
+// Predict DC block levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+// Values indicate the aggressiveness of skip flag prediction.
+// 0 : no early DC block prediction
+// 1 : Early DC block prediction based on error variance
+static unsigned int predict_dc_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 },
+                                                              { 1, 1, 0 },
+                                                              { 1, 1, 1 } };
+
+// This table holds the maximum number of reference frames for global motion.
+// The table is indexed as per the speed feature 'gm_search_type'.
+// 0 : All reference frames are allowed.
+// 1 : All reference frames except L2 and L3 are allowed.
+// 2 : All reference frames except L2, L3 and ARF2 are allowed.
+// 3 : No reference frame is allowed.
+static int gm_available_reference_frames[GM_DISABLE_SEARCH + 1] = {
+  INTER_REFS_PER_FRAME, INTER_REFS_PER_FRAME - 2, INTER_REFS_PER_FRAME - 3, 0
+};
+
+// Qindex threshold levels used for selecting full-pel motion search.
+// ms_qthresh[i][j][k] indicates the qindex boundary value for 'k'th qindex band
+// for resolution index 'j' for aggressiveness level 'i'.
+// Aggressiveness increases from i = 0 to 2.
+// j = 0: lower than 720p resolution, j = 1: 720p or larger resolution.
+// Currently invoked only for speed 0, 1 and 2.
+static int ms_qindex_thresh[3][2][2] = { { { 200, 70 }, { MAXQ, 200 } },
+                                         { { 170, 50 }, { MAXQ, 200 } },
+                                         { { 170, 40 }, { 200, 40 } } };
+
+// Full-pel search methods for aggressive search based on qindex.
+// Index 0 is for resolutions lower than 720p, index 1 for 720p or larger
+// resolutions. Currently invoked only for speed 1 and 2.
+static SEARCH_METHODS motion_search_method[2] = { CLAMPED_DIAMOND, DIAMOND };
+
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
 static int frame_is_boosted(const AV1_COMP *cpi) {
   return frame_is_kf_gf_arf(cpi);
 }
 
-static BLOCK_SIZE dim_to_size(int dim) {
-  switch (dim) {
-    case 4: return BLOCK_4X4;
-    case 8: return BLOCK_8X8;
-    case 16: return BLOCK_16X16;
-    case 32: return BLOCK_32X32;
-    case 64: return BLOCK_64X64;
-    case 128: return BLOCK_128X128;
-    default: assert(0); return 0;
-  }
-}
-
 static void set_good_speed_feature_framesize_dependent(
     const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
   const AV1_COMMON *const cm = &cpi->common;
-  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
   const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+  const bool use_hbd = cpi->oxcf.use_highbitdepth;
 
   if (is_480p_or_larger) {
     sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
@@ -139,6 +171,7 @@
   } else {
     sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
     sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+    if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
   }
 
   if (is_4k_or_larger) {
@@ -155,6 +188,12 @@
     sf->part_sf.ml_early_term_after_part_split_level = 1;
   }
 
+  if (is_720p_or_larger) {
+    // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+    // current block's vertical texture instead of hardcoded with resolution
+    sf->mv_sf.use_downsampled_sad = 1;
+  }
+
   if (speed >= 1) {
     if (is_720p_or_larger) {
       sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
@@ -199,6 +238,9 @@
 
     if (is_480p_or_larger) {
       sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+      if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+    } else {
+      if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
     }
   }
 
@@ -213,6 +255,7 @@
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
       sf->part_sf.partition_search_breakout_rate_thr = 120;
     }
+    if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
   }
 
   if (speed >= 4) {
@@ -236,6 +279,41 @@
       sf->inter_sf.prune_warped_prob_thresh = 8;
     }
   }
+
+  if (speed >= 6) {
+    if (is_720p_or_larger) {
+      sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+    } else if (is_480p_or_larger) {
+      sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+    }
+
+    if (is_1080p_or_larger) {
+      sf->part_sf.default_min_partition_size = BLOCK_8X8;
+    }
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.disable_masked_comp = 1;
+    }
+
+    if (!is_720p_or_larger) {
+      sf->inter_sf.mv_cost_upd_level = 2;
+    }
+
+    // TODO(yunqing): use BLOCK_32X32 for >= 4k.
+    if (is_4k_or_larger) {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+    } else if (is_720p_or_larger) {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+    } else {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+    }
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.prune_ref_mv_idx_search = 2;
+    } else {
+      sf->inter_sf.prune_ref_mv_idx_search = 1;
+    }
+  }
 }
 
 static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
@@ -254,10 +332,21 @@
       sf->rt_sf.use_modeled_non_rd_cost = 0;
       sf->rt_sf.use_nonrd_filter_search = 0;
     }
-  }
-  if (is_360p_or_larger) {
-    if (speed >= 7) {
-      sf->interp_sf.disable_filter_search_var_thresh = 0;
+    if (speed >= 9) {
+      sf->rt_sf.use_modeled_non_rd_cost = 1;
+      sf->rt_sf.nonrd_agressive_skip = 1;
+// TODO(kyslov) Re-enable when AV1 models are trained
+#if 0
+      if (!frame_is_intra_only(cm)) {
+        sf->part_sf.partition_search_type = ML_BASED_PARTITION;
+        sf->rt_sf.reuse_inter_pred_nonrd = 0;
+      }
+#endif
+    }
+  } else {
+    if (speed == 8 && !cpi->use_svc) {
+      sf->rt_sf.short_circuit_low_temp_var = 0;
+      sf->rt_sf.use_nonrd_altref_frame = 1;
     }
   }
   if (!is_480p_or_larger) {
@@ -266,9 +355,12 @@
     }
     if (speed >= 8) {
       sf->mv_sf.subpel_search_method = SUBPEL_TREE;
-
       sf->rt_sf.estimate_motion_for_var_based_partition = 1;
     }
+    if (speed >= 9) {
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+      sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    }
   }
 }
 
@@ -281,7 +373,8 @@
       boosted || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
   const int allow_screen_content_tools =
       cm->features.allow_screen_content_tools;
-  if (!cpi->oxcf.large_scale_tile) {
+  const int use_hbd = cpi->oxcf.use_highbitdepth;
+  if (!cpi->oxcf.tile_cfg.enable_large_scale_tile) {
     sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA;
   }
 
@@ -295,47 +388,47 @@
   sf->part_sf.ml_prune_rect_partition = 1;
   sf->part_sf.prune_ext_partition_types_search_level = 1;
   sf->part_sf.simple_motion_search_prune_rect = 1;
+  sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
 
-  sf->inter_sf.disable_wedge_search_edge_thresh = 0;
-  sf->inter_sf.disable_wedge_search_var_thresh = 0;
   // TODO(debargha): Test, tweak and turn on either 1 or 2
   sf->inter_sf.inter_mode_rd_model_estimation = 1;
   sf->inter_sf.model_based_post_interp_filter_breakout = 1;
   sf->inter_sf.prune_compound_using_single_ref = 1;
   sf->inter_sf.prune_mode_search_simple_translation = 1;
-  sf->inter_sf.prune_motion_mode_level = 1;
   sf->inter_sf.prune_ref_frame_for_rect_partitions =
       (boosted || (allow_screen_content_tools))
           ? 0
           : (is_boosted_arf2_bwd_type ? 1 : 2);
   sf->inter_sf.prune_wedge_pred_diff_based = 1;
-  sf->inter_sf.reduce_inter_modes = 1;
+  sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2;
   sf->inter_sf.selective_ref_frame = 1;
   sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
 
-  sf->interp_sf.cb_pred_filter_search = 0;
   sf->interp_sf.use_fast_interpolation_filter_search = 1;
 
   sf->intra_sf.intra_pruning_with_hog = 1;
-  sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f;
 
   sf->tx_sf.adaptive_txb_search_level = 1;
   sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
   sf->tx_sf.model_based_prune_tx_search_level = 1;
   sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
 
+  sf->tpl_sf.search_method = NSTEP_8PT;
+
   sf->rt_sf.use_nonrd_pick_mode = 0;
   sf->rt_sf.use_real_time_ref_set = 0;
 
-  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
-    sf->mv_sf.exhaustive_searches_thresh = (1 << 24);
-  else
+  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
+      cpi->use_screen_content_tools) {
+    sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+  } else {
     sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+  }
 
   sf->rd_sf.perform_coeff_opt = 1;
+  sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
 
   if (speed >= 1) {
-    sf->gm_sf.disable_adaptive_warp_error_thresh = 0;
     sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
     sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1;
 
@@ -345,6 +438,7 @@
     // simple_motion_search_split in partition search function and set the
     // speed feature accordingly
     sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+    sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
 
     sf->mv_sf.exhaustive_searches_thresh <<= 1;
     sf->mv_sf.obmc_full_pixel_search_level = 1;
@@ -354,18 +448,17 @@
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1;
     sf->inter_sf.prune_comp_type_by_comp_avg = 1;
     sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1;
-    sf->inter_sf.prune_motion_mode_level = 2;
     sf->inter_sf.prune_ref_frame_for_rect_partitions =
         (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools))
             ? 0
             : (boosted ? 1 : 2);
-    sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2;
+    sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
     sf->inter_sf.reuse_inter_intra_mode = 1;
     sf->inter_sf.selective_ref_frame = 2;
     sf->inter_sf.skip_repeated_newmv = 1;
 
-    sf->interp_sf.cb_pred_filter_search = 0;
     sf->interp_sf.use_interp_filter = 1;
+
     sf->intra_sf.prune_palette_search_level = 1;
 
     sf->tx_sf.adaptive_txb_search_level = 2;
@@ -374,7 +467,7 @@
     sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
     sf->tx_sf.model_based_prune_tx_search_level = 0;
     sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
-    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
     sf->tx_sf.tx_type_search.skip_tx_search = 1;
     sf->tx_sf.use_intra_txb_hash = 1;
 
@@ -391,8 +484,6 @@
   }
 
   if (speed >= 2) {
-    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2;
-
     sf->part_sf.allow_partition_search_skip = 1;
 
     sf->mv_sf.auto_mv_step_size = 1;
@@ -404,30 +495,29 @@
     // bit more closely to figure out why.
     sf->inter_sf.adaptive_rd_thresh = 1;
     sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
-    sf->inter_sf.disable_interinter_wedge_newmv_search = 1;
-    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
     sf->inter_sf.disable_wedge_search_var_thresh = 100;
     sf->inter_sf.fast_interintra_wedge_search = 1;
-    sf->inter_sf.fast_wedge_sign_estimate = 1;
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1;
     sf->inter_sf.prune_compound_using_neighbors = 1;
     sf->inter_sf.prune_comp_type_by_comp_avg = 2;
-    sf->inter_sf.prune_warp_using_wmtype = 1;
+    sf->inter_sf.reuse_best_prediction_for_part_ab = 1;
     sf->inter_sf.selective_ref_frame = 3;
     sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
 
     // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3
     sf->interp_sf.adaptive_interp_filter_search = 1;
     sf->interp_sf.disable_dual_filter = 1;
-    sf->interp_sf.disable_filter_search_var_thresh = 100;
 
     sf->intra_sf.disable_smooth_intra =
         !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1);
+    sf->intra_sf.intra_pruning_with_hog = 2;
 
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4;
 
     sf->lpf_sf.prune_wiener_based_on_src_var = 1;
-    sf->lpf_sf.prune_sgr_based_on_wiener = !allow_screen_content_tools;
+    sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+
+    sf->tpl_sf.prune_ref_frames_in_tpl = 1;
   }
 
   if (speed >= 3) {
@@ -438,47 +528,50 @@
 
     sf->part_sf.less_rectangular_check_level = 2;
     sf->part_sf.simple_motion_search_prune_agg = 1;
-    sf->part_sf.prune_4_partition_using_split_info =
-        !allow_screen_content_tools;
+    sf->part_sf.prune_4_partition_using_split_info = 1;
 
     // adaptive_motion_search breaks encoder multi-thread tests.
     // The values in x->pred_mv[] differ for single and multi-thread cases.
     // See aomedia:1778.
     // sf->mv_sf.adaptive_motion_search = 1;
     sf->mv_sf.full_pixel_search_level = 1;
+    sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
-    sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
     sf->mv_sf.search_method = DIAMOND;
 
-    sf->inter_sf.disable_sb_level_mv_cost_upd = 1;
+    sf->inter_sf.mv_cost_upd_level = 1;
     // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
     // it with cpi->sf.disable_wedge_search_var_thresh.
     sf->inter_sf.disable_wedge_interintra_search = 1;
+    sf->inter_sf.disable_smooth_interintra = boosted ? 0 : 1;
     // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
     // and clean-up the speed feature
     sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1;
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1;
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2;
-    sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3;
-    sf->inter_sf.selective_ref_frame = 4;
+    sf->inter_sf.selective_ref_frame = 5;
     sf->inter_sf.skip_repeated_ref_mv = 1;
     sf->inter_sf.skip_repeated_full_newmv = 1;
-    if (cpi->oxcf.enable_smooth_interintra)
-      sf->inter_sf.disable_smooth_interintra = boosted ? 0 : 1;
     sf->inter_sf.reuse_compound_type_decision = 1;
-    sf->inter_sf.txfm_rd_gate_level = (boosted || allow_screen_content_tools)
-                                          ? 0
-                                          : (is_boosted_arf2_bwd_type ? 1 : 2);
+    sf->inter_sf.txfm_rd_gate_level =
+        boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
 
+    // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+    // inherited directly from luma hog with some minor tweaking. Eventually we
+    // should run this with a bayesian optimizer to find the Pareto frontier.
+    sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+    sf->intra_sf.intra_pruning_with_hog = 3;
     sf->intra_sf.prune_palette_search_level = 2;
 
     sf->tpl_sf.skip_alike_starting_mv = 2;
     sf->tpl_sf.prune_intra_modes = 1;
+    sf->tpl_sf.prune_starting_mv = 1;
     sf->tpl_sf.reduce_first_step_size = 6;
+    sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
+    sf->tpl_sf.search_method = DIAMOND;
 
     sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
-    sf->tx_sf.tx_type_search.use_skip_flag_prediction =
-        allow_screen_content_tools ? 1 : 2;
+    sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
 
     // TODO(any): Refactor the code related to following winner mode speed
     // features
@@ -486,8 +579,7 @@
     // TODO(any): Experiment with this speed feature by enabling for key frames
     sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch =
         frame_is_intra_only(&cpi->common) ? 0 : 1;
-    sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist =
-        !allow_screen_content_tools;
+    sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
     sf->winner_mode_sf.motion_mode_for_winner_cand =
         boosted
             ? 0
@@ -495,31 +587,32 @@
                                                                          : 2;
 
     // TODO(any): evaluate if these lpf features can be moved to speed 2.
-    sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 0 : 2;
-    sf->lpf_sf.disable_loop_restoration_chroma =
-        (boosted || allow_screen_content_tools) ? 0 : 1;
+    // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality
+    // loss.
+    sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+    sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1;
     sf->lpf_sf.reduce_wiener_window_size = !boosted;
     sf->lpf_sf.prune_wiener_based_on_src_var = 2;
-
-    sf->hl_sf.second_alt_ref_filtering = 0;
   }
 
   if (speed >= 4) {
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
 
     sf->part_sf.simple_motion_search_prune_agg = 2;
-    sf->part_sf.prune_ab_partition_using_split_info =
-        !allow_screen_content_tools;
+    sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+    sf->part_sf.prune_ab_partition_using_split_info = 1;
+    sf->part_sf.early_term_after_none_split = 1;
+    sf->part_sf.ml_predict_breakout_level = 3;
 
-    sf->inter_sf.adaptive_mode_search = 1;
     sf->inter_sf.alt_ref_search_fp = 1;
-    sf->inter_sf.prune_ref_mv_idx_search = 1;
-    sf->inter_sf.txfm_rd_gate_level =
-        (boosted || allow_screen_content_tools) ? 0 : 3;
+    sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 4;
 
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
     sf->inter_sf.prune_compound_using_neighbors = 2;
+    sf->inter_sf.prune_comp_using_best_single_mode_ref = 2;
     sf->inter_sf.disable_smooth_interintra = 1;
+    sf->inter_sf.disable_obmc = 1;
+    sf->inter_sf.disable_onesided_comp = 1;
 
     sf->interp_sf.cb_pred_filter_search = 1;
     sf->interp_sf.skip_sharp_interp_filter_search = 1;
@@ -536,43 +629,35 @@
     // presets as well
     sf->intra_sf.skip_intra_in_interframe = 2;
 
-    sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning = 1;
+    sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+
+    sf->tpl_sf.prune_starting_mv = 2;
+    sf->tpl_sf.subpel_force_stop = HALF_PEL;
+    sf->tpl_sf.search_method = FAST_BIGDIA;
+
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
-    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
     sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
     // TODO(any): Experiment with enabling of this speed feature as hash state
     // is reset during winner mode processing
     sf->tx_sf.use_intra_txb_hash = 0;
 
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 5;
+    sf->rd_sf.perform_coeff_opt_based_on_satd =
+        is_boosted_arf2_bwd_type ? 1 : 2;
     sf->rd_sf.tx_domain_dist_thres_level = 2;
 
     // TODO(any): Extend multi-winner mode processing support for inter frames
-    sf->winner_mode_sf.enable_multiwinner_mode_process =
-        frame_is_intra_only(&cpi->common) ? 1 : 0;
+    sf->winner_mode_sf.multi_winner_mode_type =
+        frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_DEFAULT
+                                          : MULTI_WINNER_MODE_OFF;
     sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
 
-    sf->lpf_sf.cdef_pick_method = allow_screen_content_tools
-                                      ? CDEF_FAST_SEARCH_LVL1
-                                      : CDEF_FAST_SEARCH_LVL2;
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
 
-    // TODO(any): The following features have no impact on quality and speed,
-    // and are disabled.
-    // sf->part_sf.partition_search_breakout_rate_thr = 300;
-    // sf->interp_sf.disable_filter_search_var_thresh = 200;
-    // sf->rd_sf.use_fast_coef_costing = 1;
-
-    // TODO(any): The following features give really bad quality/speed trade
-    // off. Needs to be re-worked.
-    // sf->mv_sf.search_method = BIGDIA;
-    // sf->inter_sf.adaptive_rd_thresh = 4;
-    // sf->rd_sf.tx_domain_dist_level = 2;
-    // sf->rt_sf.mode_search_skip_flags =
-    //     (cm->current_frame.frame_type == KEY_FRAME)
-    //     ? 0
-    //     : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
-    //     FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
-    //     FLAG_EARLY_TERMINATE;
+    sf->mv_sf.reduce_search_range = 1;
   }
 
   if (speed >= 5) {
@@ -580,26 +665,69 @@
     sf->part_sf.ext_partition_eval_thresh =
         allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
 
-    sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
     sf->inter_sf.disable_interinter_wedge = 1;
-    sf->inter_sf.disable_obmc = 1;
-    sf->inter_sf.disable_onesided_comp = 1;
-    sf->inter_sf.txfm_rd_gate_level =
-        (boosted || allow_screen_content_tools) ? 0 : 4;
     sf->inter_sf.prune_inter_modes_if_skippable = 1;
+    sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 5;
 
-    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+    sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+    // TODO(any): Extend multi-winner mode processing support for inter frames
+    sf->winner_mode_sf.multi_winner_mode_type =
+        frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST
+                                          : MULTI_WINNER_MODE_OFF;
+
+    sf->lpf_sf.use_coarse_filter_level_search =
+        frame_is_intra_only(&cpi->common) ? 0 : 1;
     sf->lpf_sf.disable_lr_filter = 1;
 
-    sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL;
     sf->mv_sf.prune_mesh_search = 1;
-    sf->mv_sf.reduce_search_range = 1;
 
-    sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
+    sf->tpl_sf.prune_starting_mv = 3;
+
+    sf->winner_mode_sf.dc_blk_pred_level = 1;
   }
 
   if (speed >= 6) {
+    sf->hl_sf.disable_extra_sc_testing = 1;
+    sf->hl_sf.second_alt_ref_filtering = 0;
+    sf->hl_sf.recode_tolerance = 55;
+
+    sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
+    sf->inter_sf.prune_nearmv_using_neighbors = 1;
+
+    sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+    sf->intra_sf.intra_pruning_with_hog = 4;
+
+    sf->part_sf.prune_rectangular_split_based_on_qidx =
+        boosted || allow_screen_content_tools ? 0 : 1;
+    sf->part_sf.prune_sub_8x8_partition_level =
+        allow_screen_content_tools ? 0
+                                   : frame_is_intra_only(&cpi->common) ? 1 : 2;
+
+    sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
+    sf->mv_sf.use_bsize_dependent_search_method = 1;
+
+    sf->tpl_sf.disable_gop_length_decision = 1;
+    sf->tpl_sf.subpel_force_stop = FULL_PEL;
+    sf->tpl_sf.disable_filtered_key_tpl = 1;
+
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+    sf->tx_sf.use_intra_txb_hash = 1;
+    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0;
+
+    sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 4 : 6;
+
+    sf->winner_mode_sf.dc_blk_pred_level = 2;
+    sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
+
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
   }
+
+  // Intra txb hash is currently not compatible with multi-winner mode as the
+  // hashes got reset during multi-winner mode processing.
+  assert(IMPLIES(
+      sf->winner_mode_sf.multi_winner_mode_type != MULTI_WINNER_MODE_OFF,
+      !sf->tx_sf.use_intra_txb_hash));
 }
 
 // TODO(kyslov): now this is very similar to
@@ -624,23 +752,19 @@
 
   // TODO(debargha): Test, tweak and turn on either 1 or 2
   sf->inter_sf.inter_mode_rd_model_estimation = 0;
-  sf->inter_sf.disable_wedge_search_edge_thresh = 0;
   sf->inter_sf.disable_wedge_search_var_thresh = 0;
   sf->inter_sf.model_based_post_interp_filter_breakout = 1;
   sf->inter_sf.prune_compound_using_single_ref = 0;
   sf->inter_sf.prune_mode_search_simple_translation = 1;
-  sf->inter_sf.prune_motion_mode_level = 1;
   sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
   sf->inter_sf.prune_wedge_pred_diff_based = 1;
   sf->inter_sf.reduce_inter_modes = 1;
   sf->inter_sf.selective_ref_frame = 1;
   sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
 
-  sf->interp_sf.cb_pred_filter_search = 0;
   sf->interp_sf.use_fast_interpolation_filter_search = 1;
 
   sf->intra_sf.intra_pruning_with_hog = 1;
-  sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f;
 
   sf->mv_sf.full_pixel_search_level = 1;
   sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
@@ -654,13 +778,18 @@
   sf->rt_sf.use_nonrd_filter_search = 1;
   sf->rt_sf.use_nonrd_pick_mode = 0;
   sf->rt_sf.use_real_time_ref_set = 0;
+  sf->rt_sf.check_scene_detection = 0;
+  sf->rt_sf.overshoot_detection_cbr = NO_DETECTION;
   sf->tx_sf.adaptive_txb_search_level = 1;
   sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
   sf->tx_sf.model_based_prune_tx_search_level = 1;
   sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+  sf->rt_sf.fullpel_search_step_param = 0;
+  sf->rt_sf.skip_loopfilter_non_reference = 0;
+
+  sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO;
 
   if (speed >= 1) {
-    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_1;
     sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
 
     sf->part_sf.prune_ext_partition_types_search_level = 2;
@@ -674,10 +803,7 @@
     sf->inter_sf.selective_ref_frame = 2;
     sf->inter_sf.skip_repeated_newmv = 1;
     sf->inter_sf.disable_wedge_search_var_thresh = 0;
-    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
     sf->inter_sf.prune_comp_type_by_comp_avg = 1;
-    sf->inter_sf.prune_motion_mode_level = 2;
-    sf->inter_sf.prune_single_motion_modes_by_simple_trans = 1;
 
     sf->interp_sf.cb_pred_filter_search = 1;
     sf->interp_sf.use_interp_filter = 1;
@@ -689,7 +815,6 @@
     sf->tx_sf.tx_type_search.skip_tx_search = 1;
     sf->tx_sf.use_intra_txb_hash = 1;
 
-    sf->rd_sf.optimize_b_precheck = 1;
     sf->rd_sf.tx_domain_dist_level = boosted ? 0 : 1;
     sf->rd_sf.tx_domain_dist_thres_level = 1;
 
@@ -697,8 +822,6 @@
   }
 
   if (speed >= 2) {
-    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2;
-
     sf->part_sf.allow_partition_search_skip = 1;
     sf->part_sf.partition_search_breakout_rate_thr = 80;
 
@@ -707,7 +830,6 @@
 
     sf->inter_sf.adaptive_rd_thresh = 1;
     sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
-    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
     sf->inter_sf.disable_wedge_search_var_thresh = 100;
     sf->inter_sf.fast_wedge_sign_estimate = 1;
     sf->inter_sf.prune_comp_type_by_comp_avg = 2;
@@ -717,7 +839,6 @@
     sf->interp_sf.adaptive_interp_filter_search = 1;
     sf->interp_sf.cb_pred_filter_search = 0;
     sf->interp_sf.disable_dual_filter = 1;
-    sf->interp_sf.disable_filter_search_var_thresh = 100;
 
     sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
     sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
@@ -740,16 +861,14 @@
     // sf->mv_sf.adaptive_motion_search = 1;
 
     sf->inter_sf.adaptive_rd_thresh = 2;
-    sf->inter_sf.disable_sb_level_mv_cost_upd = 1;
+    sf->inter_sf.mv_cost_upd_level = 1;
     // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
     // it with cpi->sf.disable_wedge_search_var_thresh.
     sf->inter_sf.disable_wedge_interintra_search = 1;
     sf->inter_sf.prune_comp_search_by_single_result = 2;
-    sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3;
-    sf->inter_sf.prune_warp_using_wmtype = 1;
     sf->inter_sf.selective_ref_frame = 4;
 
-    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
 
     sf->rd_sf.tx_domain_dist_level = 1;
 
@@ -759,7 +878,6 @@
   if (speed >= 4) {
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
 
-    sf->inter_sf.adaptive_mode_search = 1;
     sf->inter_sf.alt_ref_search_fp = 1;
 
     sf->interp_sf.skip_sharp_interp_filter_search = 1;
@@ -774,12 +892,8 @@
   }
 
   if (speed >= 5) {
-    sf->hl_sf.recode_loop = ALLOW_RECODE_KFMAXBW;
-
     sf->inter_sf.adaptive_rd_thresh = 4;
-    sf->interp_sf.disable_filter_search_var_thresh = 200;
 
-    sf->rd_sf.use_fast_coef_costing = 1;
     sf->rd_sf.tx_domain_dist_level = 2;
     sf->rd_sf.tx_domain_dist_thres_level = 2;
     sf->winner_mode_sf.tx_size_search_level = 1;
@@ -804,7 +918,6 @@
     sf->mv_sf.use_fullpel_costlist = 1;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
 
-    sf->inter_sf.adaptive_mode_search = 2;
     sf->inter_sf.inter_mode_rd_model_estimation = 2;
 
     for (int i = 0; i < TX_SIZES; ++i) {
@@ -812,14 +925,14 @@
       sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
     }
 
-    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
     sf->tx_sf.use_inter_txb_hash = 0;
     sf->tx_sf.refine_fast_tx_search_results = 0;
 
     sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
     sf->rd_sf.simple_model_rd_from_var = 1;
 
-    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
 
     sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
@@ -828,15 +941,22 @@
     sf->rt_sf.use_comp_ref_nonrd = 0;
     sf->rt_sf.use_real_time_ref_set = 1;
     sf->rt_sf.use_simple_rd_model = 1;
+
+    sf->rt_sf.check_scene_detection = 1;
+    if (cm->current_frame.frame_type != KEY_FRAME &&
+        cpi->oxcf.rc_cfg.mode == AOM_CBR)
+      sf->rt_sf.overshoot_detection_cbr = FAST_DETECTION_MAXQ;
+    // Enable noise estimation only for high resolutions for now.
+    if (cm->width * cm->height > 640 * 480)
+      sf->rt_sf.use_temporal_noise_estimate = 1;
   }
 
   if (speed >= 6) {
     sf->part_sf.adjust_var_based_rd_partitioning = 1;
+    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
   }
 
   if (speed >= 7) {
-    sf->hl_sf.frame_parameter_update = 0;
-
     sf->part_sf.default_max_partition_size = BLOCK_128X128;
     sf->part_sf.default_min_partition_size = BLOCK_8X8;
     sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
@@ -847,7 +967,6 @@
 
     sf->inter_sf.inter_mode_rd_model_estimation = 2;
 
-    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
 
     sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
@@ -856,15 +975,50 @@
     sf->rt_sf.short_circuit_low_temp_var = 0;
     sf->rt_sf.skip_interp_filter_search = 0;
     sf->rt_sf.use_comp_ref_nonrd = 0;
-    sf->rt_sf.use_nonrd_altref_frame = 1;
+    // For spatial layers, only LAST and GOLDEN are currently used in the SVC
+    // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the
+    // get_ref_frame_flags() for some patterns, so disable it here for
+    // spatial layers.
+    sf->rt_sf.use_nonrd_altref_frame =
+        (cpi->svc.number_spatial_layers > 1) ? 0 : 1;
     sf->rt_sf.use_nonrd_pick_mode = 1;
     sf->rt_sf.nonrd_check_partition_merge_mode = 1;
     sf->rt_sf.nonrd_check_partition_split = 0;
     sf->rt_sf.hybrid_intra_pickmode = 1;
+    sf->rt_sf.skip_intra_pred_if_tx_skip = 1;
+    // For SVC: use better mv search on base temporal layer, and only
+    // on base spatial layer if highest resolution is above 640x360.
+    if (cpi->svc.number_temporal_layers > 1) {
+      if (cpi->svc.temporal_layer_id == 0 &&
+          (cpi->svc.spatial_layer_id == 0 ||
+           cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+               640 * 360)) {
+        sf->mv_sf.search_method = NSTEP;
+        sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+        sf->rt_sf.fullpel_search_step_param = 6;
+      } else if (cpi->svc.non_reference_frame) {
+        sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+        sf->rt_sf.fullpel_search_step_param = 10;
+      }
+    }
+    // TODO(marpan): Look into why enabling skip_loopfilter_non_reference is
+    // not bitexact on rtc testset, its very close (< ~0.01 bdrate), but not
+    // always bitexact.
+    if (cpi->use_svc && cpi->svc.non_reference_frame &&
+        sf->lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q &&
+        sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q)
+      sf->rt_sf.skip_loopfilter_non_reference = 1;
+    // Set mask for intra modes.
+    for (int i = 0; i < BLOCK_SIZES; ++i)
+      if (i >= BLOCK_32X32)
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+      else
+        // Use DC, H, V intra mode for block sizes < 32X32.
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
   }
 
   if (speed >= 8) {
-    sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    sf->rt_sf.estimate_motion_for_var_based_partition = 1;
     sf->rt_sf.short_circuit_low_temp_var = 1;
     sf->rt_sf.reuse_inter_pred_nonrd = 1;
     sf->rt_sf.use_nonrd_altref_frame = 0;
@@ -873,32 +1027,42 @@
     sf->rt_sf.nonrd_check_partition_split = 0;
     sf->rt_sf.use_modeled_non_rd_cost = 1;
     sf->rt_sf.source_metrics_sb_nonrd = 1;
+    sf->rt_sf.skip_intra_pred_if_tx_skip = 0;
     sf->interp_sf.cb_pred_filter_search = 1;
   }
+  if (speed >= 9) {
+    sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    sf->rt_sf.force_large_partition_blocks = 1;
+    for (int i = 0; i < BLOCK_SIZES; ++i)
+      sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+  }
 }
 
 static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
   // best quality defaults
   hl_sf->frame_parameter_update = 1;
   hl_sf->recode_loop = ALLOW_RECODE;
-  hl_sf->disable_overlay_frames = 0;
-  hl_sf->adaptive_overlay_encoding = 1;
   // Recode loop tolerance %.
   hl_sf->recode_tolerance = 25;
   hl_sf->high_precision_mv_usage = CURRENT_Q;
+  hl_sf->superres_auto_search_type = SUPERRES_AUTO_ALL;
+  hl_sf->disable_extra_sc_testing = 0;
   hl_sf->second_alt_ref_filtering = 1;
 }
 
 static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) {
+  tpl_sf->disable_gop_length_decision = 0;
   tpl_sf->prune_intra_modes = 0;
+  tpl_sf->prune_starting_mv = 0;
   tpl_sf->reduce_first_step_size = 0;
   tpl_sf->skip_alike_starting_mv = 0;
   tpl_sf->subpel_force_stop = EIGHTH_PEL;
+  tpl_sf->search_method = NSTEP;
+  tpl_sf->disable_filtered_key_tpl = 0;
+  tpl_sf->prune_ref_frames_in_tpl = 0;
 }
 
 static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
-  gm_sf->gm_erroradv_type = GM_ERRORADV_TR_0;
-  gm_sf->disable_adaptive_warp_error_thresh = 1;
   gm_sf->selective_ref_gm = 1;
   gm_sf->gm_search_type = GM_FULL_SEARCH;
   gm_sf->gm_disable_recode = 0;
@@ -918,7 +1082,7 @@
   part_sf->max_intra_bsize = BLOCK_LARGEST;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
-  part_sf->always_this_block_size = BLOCK_16X16;
+  part_sf->fixed_partition_size = BLOCK_16X16;
   // Recode loop tolerance %.
   part_sf->partition_search_breakout_dist_thr = 0;
   part_sf->partition_search_breakout_rate_thr = 0;
@@ -935,15 +1099,19 @@
   part_sf->simple_motion_search_split = 0;
   part_sf->simple_motion_search_prune_rect = 0;
   part_sf->simple_motion_search_early_term_none = 0;
+  part_sf->simple_motion_search_reduce_search_steps = 0;
   part_sf->intra_cnn_split = 0;
   part_sf->ext_partition_eval_thresh = BLOCK_8X8;
   part_sf->prune_4_partition_using_split_info = 0;
   part_sf->prune_ab_partition_using_split_info = 0;
+  part_sf->prune_rectangular_split_based_on_qidx = 0;
+  part_sf->early_term_after_none_split = 0;
+  part_sf->ml_predict_breakout_level = 0;
+  part_sf->prune_sub_8x8_partition_level = 0;
 }
 
 static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
   mv_sf->full_pixel_search_level = 0;
-  mv_sf->adaptive_motion_search = 0;
   mv_sf->auto_mv_step_size = 0;
   mv_sf->exhaustive_searches_thresh = 0;
   mv_sf->obmc_full_pixel_search_level = 0;
@@ -955,7 +1123,9 @@
   mv_sf->subpel_iters_per_step = 2;
   mv_sf->subpel_search_method = SUBPEL_TREE;
   mv_sf->use_accurate_subpel_search = USE_8_TAPS;
+  mv_sf->use_bsize_dependent_search_method = 0;
   mv_sf->use_fullpel_costlist = 0;
+  mv_sf->use_downsampled_sad = 0;
 }
 
 static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
@@ -963,35 +1133,32 @@
   inter_sf->adaptive_rd_thresh = 0;
   inter_sf->model_based_post_interp_filter_breakout = 0;
   inter_sf->reduce_inter_modes = 0;
-  inter_sf->adaptive_mode_search = 0;
   inter_sf->alt_ref_search_fp = 0;
   inter_sf->selective_ref_frame = 0;
   inter_sf->prune_ref_frame_for_rect_partitions = 0;
-  inter_sf->disable_wedge_search_edge_thresh = 0;
   inter_sf->disable_wedge_search_var_thresh = 0;
   inter_sf->fast_wedge_sign_estimate = 0;
   inter_sf->prune_wedge_pred_diff_based = 0;
   inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
   inter_sf->reuse_inter_intra_mode = 0;
   inter_sf->disable_sb_level_coeff_cost_upd = 0;
-  inter_sf->disable_sb_level_mv_cost_upd = 0;
+  inter_sf->mv_cost_upd_level = 0;
   inter_sf->prune_inter_modes_based_on_tpl = 0;
+  inter_sf->prune_nearmv_using_neighbors = 0;
   inter_sf->prune_comp_search_by_single_result = 0;
   inter_sf->skip_repeated_ref_mv = 0;
   inter_sf->skip_repeated_newmv = 0;
   inter_sf->skip_repeated_full_newmv = 0;
-  inter_sf->prune_single_motion_modes_by_simple_trans = 0;
   inter_sf->inter_mode_rd_model_estimation = 0;
   inter_sf->prune_compound_using_single_ref = 0;
   inter_sf->prune_compound_using_neighbors = 0;
+  inter_sf->prune_comp_using_best_single_mode_ref = 0;
   inter_sf->disable_onesided_comp = 0;
   inter_sf->prune_mode_search_simple_translation = 0;
   inter_sf->prune_comp_type_by_comp_avg = 0;
   inter_sf->disable_interinter_wedge_newmv_search = 0;
   inter_sf->enable_interinter_diffwtd_newmv_search = 0;
   inter_sf->disable_smooth_interintra = 0;
-  inter_sf->prune_motion_mode_level = 0;
-  inter_sf->prune_warp_using_wmtype = 0;
   inter_sf->disable_wedge_interintra_search = 0;
   inter_sf->fast_interintra_wedge_search = 0;
   inter_sf->prune_comp_type_by_model_rd = 0;
@@ -1004,18 +1171,21 @@
   inter_sf->reuse_compound_type_decision = 0;
   inter_sf->txfm_rd_gate_level = 0;
   inter_sf->prune_inter_modes_if_skippable = 0;
+  inter_sf->disable_masked_comp = 0;
+  inter_sf->reuse_best_prediction_for_part_ab = 0;
 }
 
 static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
-  interp_sf->disable_filter_search_var_thresh = 0;
   interp_sf->adaptive_interp_filter_search = 0;
-  interp_sf->use_fast_interpolation_filter_search = 0;
+  interp_sf->cb_pred_filter_search = 0;
   interp_sf->disable_dual_filter = 0;
-  interp_sf->use_interp_filter = 0;
   interp_sf->skip_sharp_interp_filter_search = 0;
+  interp_sf->use_fast_interpolation_filter_search = 0;
+  interp_sf->use_interp_filter = 0;
 }
 
 static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
+  intra_sf->chroma_intra_pruning_with_hog = 0;
   intra_sf->skip_intra_in_interframe = 1;
   intra_sf->intra_pruning_with_hog = 0;
   intra_sf->src_var_thresh_intra_skip = 1;
@@ -1035,7 +1205,7 @@
   tx_sf->intra_tx_size_search_init_depth_sqr = 0;
   tx_sf->tx_size_search_lgr_block = 0;
   tx_sf->model_based_prune_tx_search_level = 0;
-  tx_sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
+  tx_sf->tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_1;
   tx_sf->tx_type_search.ml_tx_split_thresh = 8500;
   tx_sf->tx_type_search.use_skip_flag_prediction = 1;
   tx_sf->tx_type_search.use_reduced_intra_txset = 0;
@@ -1044,45 +1214,43 @@
   tx_sf->tx_type_search.skip_tx_search = 0;
   tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
   tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
-  tx_sf->tx_type_search.enable_winner_mode_tx_type_pruning = 0;
+  tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0;
   tx_sf->txb_split_cap = 1;
   tx_sf->adaptive_txb_search_level = 0;
   tx_sf->use_intra_txb_hash = 0;
   tx_sf->use_inter_txb_hash = 1;
   tx_sf->refine_fast_tx_search_results = 1;
+  tx_sf->prune_tx_size_level = 0;
 }
 
 static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf,
-                                  const AV1_COMP *cpi) {
-  if (cpi->oxcf.disable_trellis_quant == 3) {
-    rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+                                  const AV1EncoderConfig *oxcf) {
+  const int disable_trellis_quant = oxcf->algo_cfg.disable_trellis_quant;
+  if (disable_trellis_quant == 3) {
+    rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg)
                                        ? NO_ESTIMATE_YRD_TRELLIS_OPT
                                        : NO_TRELLIS_OPT;
-  } else if (cpi->oxcf.disable_trellis_quant == 2) {
-    rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+  } else if (disable_trellis_quant == 2) {
+    rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg)
                                        ? FINAL_PASS_TRELLIS_OPT
                                        : NO_TRELLIS_OPT;
-  } else if (cpi->oxcf.disable_trellis_quant == 0) {
-    if (is_lossless_requested(&cpi->oxcf)) {
+  } else if (disable_trellis_quant == 0) {
+    if (is_lossless_requested(&oxcf->rc_cfg)) {
       rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
     } else {
       rd_sf->optimize_coefficients = FULL_TRELLIS_OPT;
     }
-  } else if (cpi->oxcf.disable_trellis_quant == 1) {
+  } else if (disable_trellis_quant == 1) {
     rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
   } else {
     assert(0 && "Invalid disable_trellis_quant value");
   }
-  // TODO(sarahparker) Pair this with a speed setting once experiments are done
-  rd_sf->trellis_eob_fast = 0;
   rd_sf->use_mb_rd_hash = 1;
-  rd_sf->optimize_b_precheck = 0;
-  rd_sf->use_fast_coef_costing = 0;
   rd_sf->simple_model_rd_from_var = 0;
   rd_sf->tx_domain_dist_level = 0;
   rd_sf->tx_domain_dist_thres_level = 0;
-  rd_sf->use_hash_based_trellis = 0;
   rd_sf->perform_coeff_opt = 0;
+  rd_sf->perform_coeff_opt_based_on_satd = 0;
 }
 
 static AOM_INLINE void init_winner_mode_sf(
@@ -1093,7 +1261,8 @@
   winner_mode_sf->enable_winner_mode_for_coeff_opt = 0;
   winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
   winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
-  winner_mode_sf->enable_multiwinner_mode_process = 0;
+  winner_mode_sf->multi_winner_mode_type = 0;
+  winner_mode_sf->dc_blk_pred_level = 0;
 }
 
 static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
@@ -1103,6 +1272,7 @@
   lpf_sf->enable_sgr_ep_pruning = 0;
   lpf_sf->reduce_wiener_window_size = 0;
   lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+  lpf_sf->use_coarse_filter_level_search = 0;
   lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH;
   // Set decoder side speed feature to use less dual sgr modes
   lpf_sf->dual_sgr_penalty_level = 0;
@@ -1117,6 +1287,7 @@
   rt_sf->use_simple_rd_model = 0;
   rt_sf->nonrd_check_partition_merge_mode = 0;
   rt_sf->nonrd_check_partition_split = 0;
+  rt_sf->skip_intra_pred_if_tx_skip = 0;
 }
 
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
@@ -1130,25 +1301,21 @@
   }
 
   // This is only used in motion vector unit test.
-  if (cpi->oxcf.motion_vector_unit_test == 1)
+  if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 1)
     cpi->mv_search_params.find_fractional_mv_step = av1_return_max_sub_pixel_mv;
-  else if (cpi->oxcf.motion_vector_unit_test == 2)
+  else if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 2)
     cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 
-  MACROBLOCK *const x = &cpi->td.mb;
-  AV1_COMMON *const cm = &cpi->common;
-  x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size,
-                                 dim_to_size(cpi->oxcf.min_partition_size));
-  x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size,
-                                 dim_to_size(cpi->oxcf.max_partition_size));
-  x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
-  x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
+  if ((cpi->oxcf.row_mt == 1) && (cpi->oxcf.max_threads > 1)) {
+    if (sf->inter_sf.mv_cost_upd_level > 1) {
+      // Set mv_cost_upd_level to use row level update.
+      sf->inter_sf.mv_cost_upd_level = 1;
+    }
+  }
 }
 
 void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
-  AV1_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
-  MACROBLOCK *const x = &cpi->td.mb;
   WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   int i;
@@ -1162,7 +1329,7 @@
   init_interp_sf(&sf->interp_sf);
   init_intra_sf(&sf->intra_sf);
   init_tx_sf(&sf->tx_sf);
-  init_rd_sf(&sf->rd_sf, cpi);
+  init_rd_sf(&sf->rd_sf, oxcf);
   init_winner_mode_sf(&sf->winner_mode_sf);
   init_lpf_sf(&sf->lpf_sf);
   init_rt_sf(&sf->rt_sf);
@@ -1176,6 +1343,11 @@
     cpi->common.seq_params.enable_dual_filter &=
         !sf->interp_sf.disable_dual_filter;
     cpi->common.seq_params.enable_restoration &= !sf->lpf_sf.disable_lr_filter;
+
+    cpi->common.seq_params.enable_masked_compound &=
+        !sf->inter_sf.disable_masked_comp;
+    cpi->common.seq_params.enable_interintra_compound &=
+        !sf->inter_sf.disable_wedge_interintra_search;
   }
 
   // sf->part_sf.partition_search_breakout_dist_thr is set assuming max 64x64
@@ -1209,7 +1381,8 @@
     sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
 
   // No recode or trellis for 1 pass.
-  if (oxcf->pass == 0) sf->hl_sf.recode_loop = DISALLOW_RECODE;
+  if (oxcf->pass == 0 && has_no_stats_stage(cpi))
+    sf->hl_sf.recode_loop = DISALLOW_RECODE;
 
   MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
   if (sf->mv_sf.subpel_search_method == SUBPEL_TREE) {
@@ -1220,22 +1393,12 @@
   } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
     mv_search_params->find_fractional_mv_step =
         av1_find_best_sub_pixel_tree_pruned_more;
-  } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
-    mv_search_params->find_fractional_mv_step =
-        av1_find_best_sub_pixel_tree_pruned_evenmore;
   }
 
-  x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size,
-                                 dim_to_size(cpi->oxcf.min_partition_size));
-  x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size,
-                                 dim_to_size(cpi->oxcf.max_partition_size));
-  x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
-  x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
-
   // This is only used in motion vector unit test.
-  if (cpi->oxcf.motion_vector_unit_test == 1)
+  if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 1)
     mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
-  else if (cpi->oxcf.motion_vector_unit_test == 2)
+  else if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 2)
     mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 
   // assert ensures that tx_domain_dist_level is accessed correctly
@@ -1253,18 +1416,26 @@
 
   // assert ensures that coeff_opt_dist_thresholds is accessed correctly
   assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 &&
-         cpi->sf.rd_sf.perform_coeff_opt < 6);
+         cpi->sf.rd_sf.perform_coeff_opt < 7);
   memcpy(winner_mode_params->coeff_opt_dist_threshold,
          coeff_opt_dist_thresholds[cpi->sf.rd_sf.perform_coeff_opt],
          sizeof(winner_mode_params->coeff_opt_dist_threshold));
 
+  // assert ensures that coeff_opt_satd_thresholds is accessed correctly
+  assert(cpi->sf.rd_sf.perform_coeff_opt_based_on_satd >= 0 &&
+         cpi->sf.rd_sf.perform_coeff_opt_based_on_satd < 3);
+  memcpy(
+      winner_mode_params->coeff_opt_satd_threshold,
+      coeff_opt_satd_thresholds[cpi->sf.rd_sf.perform_coeff_opt_based_on_satd],
+      sizeof(winner_mode_params->coeff_opt_satd_threshold));
+
   // assert ensures that predict_skip_levels is accessed correctly
   assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 &&
          cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3);
-  memcpy(winner_mode_params->predict_skip_level,
+  memcpy(winner_mode_params->skip_txfm_level,
          predict_skip_levels[cpi->sf.tx_sf.tx_type_search
                                  .use_skip_flag_prediction],
-         sizeof(winner_mode_params->predict_skip_level));
+         sizeof(winner_mode_params->skip_txfm_level));
 
   // assert ensures that tx_size_search_level is accessed correctly
   assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 &&
@@ -1272,12 +1443,23 @@
   memcpy(winner_mode_params->tx_size_search_methods,
          tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level],
          sizeof(winner_mode_params->tx_size_search_methods));
+  memcpy(winner_mode_params->predict_dc_level,
+         predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level],
+         sizeof(winner_mode_params->predict_dc_level));
 
   if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) {
     if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
       // Revert to type 2
       sf->inter_sf.inter_mode_rd_model_estimation = 2;
     }
+
+    // Disable the speed feature 'prune_ref_frame_for_gm_search' to achieve
+    // better parallelism when number of threads available are greater than or
+    // equal to maximum number of reference frames allowed for global motion.
+    if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH &&
+        (cpi->oxcf.max_threads >=
+         gm_available_reference_frames[sf->gm_sf.gm_search_type]))
+      sf->gm_sf.prune_ref_frame_for_gm_search = 0;
   }
 }
 
@@ -1288,9 +1470,23 @@
   WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
   const int boosted = frame_is_boosted(cpi);
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
-  if (is_720p_or_larger && cpi->oxcf.mode == GOOD && speed == 0) {
-    if (cm->quant_params.base_qindex <= 80) {
-      sf->rd_sf.perform_coeff_opt = 2;
+  const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+  const int is_arf2_bwd_type =
+      cpi->gf_group.update_type[cpi->gf_group.index] == INTNL_ARF_UPDATE;
+
+  if (cpi->oxcf.mode == GOOD && speed == 0) {
+    // qindex_thresh for resolution < 720p
+    const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140);
+    if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) {
+      sf->inter_sf.skip_repeated_newmv = 1;
+      sf->part_sf.simple_motion_search_split =
+          cm->features.allow_screen_content_tools ? 1 : 2;
+      sf->part_sf.simple_motion_search_early_term_none = 1;
+      sf->tx_sf.model_based_prune_tx_search_level = 0;
+    }
+
+    if (is_720p_or_larger && cm->quant_params.base_qindex <= 128) {
+      sf->rd_sf.perform_coeff_opt = 2 + is_1080p_or_larger;
       memcpy(winner_mode_params->coeff_opt_dist_threshold,
              coeff_opt_dist_thresholds[sf->rd_sf.perform_coeff_opt],
              sizeof(winner_mode_params->coeff_opt_dist_threshold));
@@ -1299,13 +1495,28 @@
       sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
       sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
       sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+      sf->inter_sf.skip_repeated_newmv = 1;
+      sf->tx_sf.model_based_prune_tx_search_level = 0;
+
+      if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) {
+        sf->inter_sf.selective_ref_frame = 2;
+        sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+        sf->rd_sf.tx_domain_dist_thres_level = 1;
+        sf->part_sf.simple_motion_search_early_term_none = 1;
+        sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+        sf->interp_sf.cb_pred_filter_search = 0;
+        sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+        sf->tx_sf.tx_type_search.skip_tx_search = 1;
+        sf->tx_sf.use_intra_txb_hash = 1;
+      }
     }
   }
 
   if (cpi->oxcf.mode == GOOD && speed >= 3) {
     // Disable extended partitions for lower quantizers
-    if (cm->quant_params.base_qindex <= 100 &&
-        !cm->features.allow_screen_content_tools && !boosted) {
+    const int qindex_thresh =
+        cm->features.allow_screen_content_tools ? 50 : 100;
+    if (cm->quant_params.base_qindex <= qindex_thresh && !boosted) {
       sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
     }
   }
@@ -1314,9 +1525,35 @@
     // Disable extended partitions for lower quantizers
     const int qindex_thresh = boosted ? 80 : 120;
     if (cm->quant_params.base_qindex <= qindex_thresh &&
-        !cm->features.allow_screen_content_tools &&
         !frame_is_intra_only(&cpi->common)) {
       sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
     }
   }
+
+  if (cpi->oxcf.mode == GOOD && speed >= 5) {
+    const int qindex_thresh = boosted ? 100 : 160;
+    if (cm->quant_params.base_qindex <= qindex_thresh &&
+        !frame_is_intra_only(&cpi->common)) {
+      sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+    }
+  }
+
+  if (cpi->oxcf.mode == GOOD && (speed <= 2)) {
+    if (!is_stat_generation_stage(cpi)) {
+      // Use faster full-pel motion search for high quantizers.
+      // Also use reduced total search range for low resolutions at high
+      // quantizers.
+      const int aggr = speed;
+      const int qindex_thresh1 = ms_qindex_thresh[aggr][is_720p_or_larger][0];
+      const int qindex_thresh2 = ms_qindex_thresh[aggr][is_720p_or_larger][1];
+      const SEARCH_METHODS search_method =
+          motion_search_method[is_720p_or_larger];
+      if (cm->quant_params.base_qindex > qindex_thresh1) {
+        sf->mv_sf.search_method = search_method;
+        sf->tpl_sf.search_method = search_method;
+      } else if (cm->quant_params.base_qindex > qindex_thresh2) {
+        sf->mv_sf.search_method = NSTEP_8PT;
+      }
+    }
+  }
 }
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index d12c3c0..da522b7 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -13,11 +13,17 @@
 #define AOM_AV1_ENCODER_SPEED_FEATURES_H_
 
 #include "av1/common/enums.h"
+#include "av1/encoder/enc_enums.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encodemb.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/*! @file */
+
+/*!\cond */
 #define MAX_MESH_STEP 4
 
 typedef struct MESH_PATTERN {
@@ -33,18 +39,6 @@
 } UENUM1BYTE(GM_SEARCH_TYPE);
 
 enum {
-  GM_ERRORADV_TR_0,
-  GM_ERRORADV_TR_1,
-  GM_ERRORADV_TR_2,
-  GM_ERRORADV_TR_TYPES,
-} UENUM1BYTE(GM_ERRORADV_TYPE);
-
-enum {
-  FULL_TXFM_RD,
-  LOW_TXFM_RD,
-} UENUM1BYTE(TXFM_RD_MODEL);
-
-enum {
   DIST_WTD_COMP_ENABLED,
   DIST_WTD_COMP_SKIP_MV_SEARCH,
   DIST_WTD_COMP_DISABLED,
@@ -77,6 +71,8 @@
   INTRA_DC = (1 << DC_PRED),
   INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED),
   INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_H_V_SMOOTH =
+      (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << SMOOTH_PRED),
   INTRA_DC_PAETH_H_V =
       (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED)
 };
@@ -116,32 +112,31 @@
   RESERVE_3_SF = 128,
 } UENUM1BYTE(DEV_SPEED_FEATURES);
 
+/* This enumeration defines when the rate control recode loop will be
+ * enabled.
+ */
 enum {
-  // No recode.
+  /*
+   * No recodes allowed
+   */
   DISALLOW_RECODE = 0,
-  // Allow recode for KF and exceeding maximum frame bandwidth.
-  ALLOW_RECODE_KFMAXBW = 1,
-  // Allow recode only for KF/ARF/GF frames.
-  ALLOW_RECODE_KFARFGF = 2,
-  // Allow recode for all frames based on bitrate constraints.
-  ALLOW_RECODE = 3,
+  /*
+   * Allow recode only for KF/ARF/GF frames
+   */
+  ALLOW_RECODE_KFARFGF = 1,
+  /*
+   * Allow recode for all frame types based on bitrate constraints.
+   */
+  ALLOW_RECODE = 2,
 } UENUM1BYTE(RECODE_LOOP_TYPE);
 
 enum {
   SUBPEL_TREE = 0,
-  SUBPEL_TREE_PRUNED = 1,           // Prunes 1/2-pel searches
-  SUBPEL_TREE_PRUNED_MORE = 2,      // Prunes 1/2-pel searches more aggressively
-  SUBPEL_TREE_PRUNED_EVENMORE = 3,  // Prunes 1/2- and 1/4-pel searches
-  // Other methods to come
+  SUBPEL_TREE_PRUNED = 1,       // Prunes 1/2-pel searches
+  SUBPEL_TREE_PRUNED_MORE = 2,  // Prunes 1/2-pel searches more aggressively
 } UENUM1BYTE(SUBPEL_SEARCH_METHODS);
 
 enum {
-  USE_FULL_RD = 0,
-  USE_FAST_RD,
-  USE_LARGESTALL,
-} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
-
-enum {
   // Try the full image with different values.
   LPF_PICK_FROM_FULL_IMAGE,
   // Try the full image filter search with non-dual filter only.
@@ -153,15 +148,23 @@
   // Pick 0 to disable LPF if LPF was enabled last frame
   LPF_PICK_MINIMAL_LPF
 } UENUM1BYTE(LPF_PICK_METHOD);
+/*!\endcond */
 
-enum {
-  CDEF_FULL_SEARCH,
-  CDEF_FAST_SEARCH_LVL1,  // Search among a subset of all possible filters.
-  CDEF_FAST_SEARCH_LVL2,  // Search reduced subset of filters than Level 1.
-  CDEF_PICK_FROM_Q,       // Estimate filter strength based on quantizer.
+/*!\enum CDEF_PICK_METHOD
+ * \brief This enumeration defines a variety of CDEF pick methods
+ */
+typedef enum {
+  CDEF_FULL_SEARCH,      /**< Full search */
+  CDEF_FAST_SEARCH_LVL1, /**< Search among a subset of all possible filters. */
+  CDEF_FAST_SEARCH_LVL2, /**< Search reduced subset of filters than Level 1. */
+  CDEF_FAST_SEARCH_LVL3, /**< Search reduced subset of secondary filters than
+                              Level 2. */
+  CDEF_FAST_SEARCH_LVL4, /**< Search reduced subset of filters than Level 3. */
+  CDEF_PICK_FROM_Q,      /**< Estimate filter strength based on quantizer. */
   CDEF_PICK_METHODS
-} UENUM1BYTE(CDEF_PICK_METHOD);
+} CDEF_PICK_METHOD;
 
+/*!\cond */
 enum {
   // Terminate search early based on distortion so far compared to
   // qp step, distortion in the neighborhood of the frame, etc.
@@ -182,19 +185,44 @@
 } UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC);
 
 enum {
-  NO_PRUNE = 0,
+  // No tx type pruning
+  TX_TYPE_PRUNE_0 = 0,
   // adaptively prunes the least perspective tx types out of all 16
   // (tuned to provide negligible quality loss)
-  PRUNE_2D_ACCURATE = 1,
+  TX_TYPE_PRUNE_1 = 1,
   // similar, but applies much more aggressive pruning to get better speed-up
-  PRUNE_2D_FAST = 2,
-  PRUNE_2D_MORE = 3,
+  TX_TYPE_PRUNE_2 = 2,
+  TX_TYPE_PRUNE_3 = 3,
   // More aggressive pruning based on tx type score and allowed tx count
-  PRUNE_2D_AGGRESSIVE = 4,
+  TX_TYPE_PRUNE_4 = 4,
+  TX_TYPE_PRUNE_5 = 5,
 } UENUM1BYTE(TX_TYPE_PRUNE_MODE);
 
+enum {
+  // No reaction to rate control on a detected slide/scene change.
+  NO_DETECTION = 0,
+
+  // Set to larger Q based only on the detected slide/scene change and
+  // current/past Q.
+  FAST_DETECTION_MAXQ = 1,
+} UENUM1BYTE(OVERSHOOT_DETECTION_CBR);
+
+enum {
+  // Turns off multi-winner mode. So we will do txfm search on either all modes
+  // if winner mode is off, or we will only on txfm search on a single winner
+  // mode.
+  MULTI_WINNER_MODE_OFF = 0,
+
+  // Limits the number of winner modes to at most 2
+  MULTI_WINNER_MODE_FAST = 1,
+
+  // Uses the default number of winner modes, which is 3 for intra mode, and 1
+  // for inter mode.
+  MULTI_WINNER_MODE_DEFAULT = 2,
+} UENUM1BYTE(MULTI_WINNER_MODE_TYPE);
+
 typedef struct {
-  TX_TYPE_PRUNE_MODE prune_mode;
+  TX_TYPE_PRUNE_MODE prune_2d_txfm_mode;
   int fast_intra_tx_type_search;
   int fast_inter_tx_type_search;
 
@@ -221,7 +249,7 @@
   // inter blocks. It enables further tx type mode pruning based on ML model for
   // mode evaluation and disables tx type mode pruning for winner mode
   // processing.
-  int enable_winner_mode_tx_type_pruning;
+  int winner_mode_tx_type_pruning;
 } TX_TYPE_SEARCH;
 
 enum {
@@ -231,9 +259,11 @@
   // Always use a fixed size partition
   FIXED_PARTITION,
 
-  REFERENCE_PARTITION,
+  // Partition using source variance
+  VAR_BASED_PARTITION,
 
-  VAR_BASED_PARTITION
+  // Partition using ML model
+  ML_BASED_PARTITION
 } UENUM1BYTE(PARTITION_SEARCH_TYPE);
 
 enum {
@@ -249,30 +279,40 @@
   QTR_ONLY,
 } UENUM1BYTE(MV_PREC_LOGIC);
 
+enum {
+  SUPERRES_AUTO_ALL,   // Tries all possible superres ratios
+  SUPERRES_AUTO_DUAL,  // Tries no superres and q-based superres ratios
+  SUPERRES_AUTO_SOLO,  // Only apply the q-based superres ratio
+} UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE);
+
+/*!\endcond */
+/*!
+ * \brief Sequence/frame level speed vs quality features
+ */
 typedef struct HIGH_LEVEL_SPEED_FEATURES {
+  /*!\cond */
   // Frame level coding parameter update
   int frame_parameter_update;
 
+  /*!\endcond */
+  /*!
+   * Cases and frame types for which the recode loop is enabled.
+   */
   RECODE_LOOP_TYPE recode_loop;
 
-  // This feature controls the tolerence vs target used in deciding whether to
-  // recode a frame. It has no meaning if recode is disabled.
+  /*!
+   * Controls the tolerance vs target rate used in deciding whether to
+   * recode a frame. It has no meaning if recode is disabled.
+   */
   int recode_tolerance;
 
+  /*!\cond */
   // Determine how motion vector precision is chosen. The possibilities are:
   // LAST_MV_DATA: use the mv data from the last coded frame
   // CURRENT_Q: use the current q as a threshold
   // QTR_ONLY: use quarter pel precision only.
   MV_PREC_LOGIC high_precision_mv_usage;
 
-  // Whether to disable overlay frames for filtered Altref frames,
-  // overiding oxcf->enable_overlay flag set as 1.
-  int disable_overlay_frames;
-
-  // Enable/disable adaptively deciding whether or not to encode ALTREF overlay
-  // frame.
-  int adaptive_overlay_encoding;
-
   // Always set to 0. If on it enables 0 cost background transmission
   // (except for the initial transmission of the segmentation). The feature is
   // disabled because the addition of very large block sizes make the
@@ -280,11 +320,27 @@
   // adds overhead.
   int static_segmentation;
 
-  // Enable/disable second_alt_ref temporal filtering.
+  /*!
+   * Superres-auto mode search type:
+   */
+  SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type;
+  /*!\endcond */
+
+  /*!
+   * Enable/disable extra screen content test by encoding key frame twice.
+   */
+  int disable_extra_sc_testing;
+
+  /*!
+   * Enable/disable second_alt_ref temporal filtering.
+   */
   int second_alt_ref_filtering;
 } HIGH_LEVEL_SPEED_FEATURES;
 
+/*!\cond */
 typedef struct TPL_SPEED_FEATURES {
+  // Enable/disable GOP length adaptive decision.
+  int disable_gop_length_decision;
   // Prune the intra modes search by tpl.
   // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
   // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
@@ -301,15 +357,21 @@
 
   // When to stop subpel search.
   SUBPEL_FORCE_STOP subpel_force_stop;
+
+  // Which search method to use.
+  SEARCH_METHODS search_method;
+
+  // Prune starting mvs in TPL based on sad scores.
+  int prune_starting_mv;
+
+  // Not run TPL for filtered Key frame.
+  int disable_filtered_key_tpl;
+
+  // Prune reference frames in TPL.
+  int prune_ref_frames_in_tpl;
 } TPL_SPEED_FEATURES;
 
 typedef struct GLOBAL_MOTION_SPEED_FEATURES {
-  // Global motion warp error threshold
-  GM_ERRORADV_TYPE gm_erroradv_type;
-
-  // Disable adaptive threshold for global motion warp error
-  int disable_adaptive_warp_error_thresh;
-
   // Do not compute the global motion parameters for a LAST2_FRAME or
   // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity
   // global model.
@@ -329,8 +391,8 @@
 typedef struct PARTITION_SPEED_FEATURES {
   PARTITION_SEARCH_TYPE partition_search_type;
 
-  // Used if partition_search_type = FIXED_SIZE_PARTITION
-  BLOCK_SIZE always_this_block_size;
+  // Used if partition_search_type = FIXED_PARTITION
+  BLOCK_SIZE fixed_partition_size;
 
   // Prune extended partition types search
   // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
@@ -370,8 +432,8 @@
   BLOCK_SIZE default_min_partition_size;
   BLOCK_SIZE default_max_partition_size;
 
-  // Sets level of adjustmet of variace-based partitioning during
-  // rd_use_partition 0 - no partition adjusment, 1 - try to merge partitions
+  // Sets level of adjustment of variance-based partitioning during
+  // rd_use_partition 0 - no partition adjustment, 1 - try to merge partitions
   // for small blocks and high QP, 2 - always try to merge leaf partitions, 3 -
   // try to merge and split leaf partitions
   int adjust_var_based_rd_partitioning;
@@ -386,7 +448,7 @@
   // Allow skipping partition search for still image frame
   int allow_partition_search_skip;
 
-  // The aggresiveness of pruning with simple_motion_search.
+  // The aggressiveness of pruning with simple_motion_search.
   // Currently 0 is the lowest, and 2 the highest.
   int simple_motion_search_prune_agg;
 
@@ -405,6 +467,12 @@
   // partition after PARTITION_NONE
   int simple_motion_search_early_term_none;
 
+  // Controls whether to reduce the number of motion search steps. If this is 0,
+  // then simple_motion_search has the same number of steps as
+  // single_motion_search (assuming no other speed features). Otherwise, reduce
+  // the number of steps by the value contained in this variable.
+  int simple_motion_search_reduce_search_steps;
+
   // This variable controls the maximum block size where intra blocks can be
   // used in inter frames.
   // TODO(aconverse): Fold this into one of the other many mode skips
@@ -422,12 +490,37 @@
 
   // Prune AB partition search using split and HORZ/VERT info
   int prune_ab_partition_using_split_info;
+
+  // Prunt rectangular, AB and 4-way partition based on q index and block size
+  int prune_rectangular_split_based_on_qidx;
+
+  // Terminate partition search for child partition,
+  // when NONE and SPLIT partition rd_costs are INT64_MAX.
+  int early_term_after_none_split;
+
+  // Level used to adjust threshold for av1_ml_predict_breakout(). At lower
+  // levels, more conservative threshold is used, and value of 0 indicates
+  // av1_ml_predict_breakout() is disabled. Value of 3 corresponds to default
+  // case with no adjustment to lbd thresholds.
+  int ml_predict_breakout_level;
+
+  // Prune sub_8x8 (BLOCK_4X4, BLOCK_4X8 and BLOCK_8X4) partitions.
+  // 0 : no pruning
+  // 1 : pruning based on neighbour block information
+  // 2 : prune always
+  int prune_sub_8x8_partition_level;
 } PARTITION_SPEED_FEATURES;
 
 typedef struct MV_SPEED_FEATURES {
   // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
   SEARCH_METHODS search_method;
 
+  // Enable the use of faster, less accurate mv search method on bsize >=
+  // BLOCK_32X32.
+  // TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into
+  // account.
+  int use_bsize_dependent_search_method;
+
   // If this is set to 1, we limit the motion search range to 2 times the
   // largest motion vector found in the last frame.
   int auto_mv_step_size;
@@ -451,11 +544,6 @@
   // encoding and decoding; otherwise, it uses bilinear interpolation.
   SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
 
-  // TODO(jingning): combine the related motion search speed features
-  // This allows us to use motion search at other sizes as a starting
-  // point for this motion search and limits the search range around it.
-  int adaptive_motion_search;
-
   // Threshold for allowing exhaustive motion search.
   int exhaustive_searches_thresh;
 
@@ -481,6 +569,10 @@
 
   // Accurate full pixel motion search based on TPL stats.
   int full_pixel_search_level;
+
+  // Whether to downsample the rows in sad calculation during motion search.
+  // This is only active when there are at least 16 rows.
+  int use_downsampled_sad;
 } MV_SPEED_FEATURES;
 
 typedef struct INTER_MODE_SPEED_FEATURES {
@@ -498,9 +590,6 @@
   // Limit the inter mode tested in the RD loop
   int reduce_inter_modes;
 
-  // Adaptive prediction mode search
-  int adaptive_mode_search;
-
   // This variable is used to cap the maximum number of times we skip testing a
   // mode to be evaluated. A high value means we will be faster.
   int adaptive_rd_thresh;
@@ -509,8 +598,8 @@
   int prune_inter_modes_if_skippable;
 
   // Drop less likely to be picked reference frames in the RD search.
-  // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more
-  // aggressively than lower ones. (0 means no pruning).
+  // Has six levels for now: 0, 1, 2, 3, 4 and 5, where higher levels prune
+  // more aggressively than lower ones. (0 means no pruning).
   int selective_ref_frame;
 
   // Prune reference frames for rectangular partitions.
@@ -541,10 +630,6 @@
   // Flag used to control the ref_best_rd based gating for chroma
   int perform_best_rd_based_gating_for_chroma;
 
-  // Skip certain motion modes (OBMC, warped, interintra) for single reference
-  // motion search, using the results of single ref SIMPLE_TRANSLATION
-  int prune_single_motion_modes_by_simple_trans;
-
   // Reuse the inter_intra_mode search result from NEARESTMV mode to other
   // single ref modes
   int reuse_inter_intra_mode;
@@ -588,28 +673,25 @@
   // 2 : prune extended compound mode (high aggressiveness)
   int prune_compound_using_neighbors;
 
+  // Skip extended compound mode when ref frame corresponding to NEWMV does not
+  // have NEWMV as single mode winner.
+  // 0 : no pruning
+  // 1 : prune extended compound mode (less aggressiveness)
+  // 2 : prune extended compound mode (high aggressiveness)
+  int prune_comp_using_best_single_mode_ref;
+
   // Based on previous ref_mv_idx search result, prune the following search.
   int prune_ref_mv_idx_search;
 
   // Disable one sided compound modes.
   int disable_onesided_comp;
 
-  // Prune/gate motion mode evaluation based on token based rd
-  // during transform search for inter blocks
-  // Values are 0 (not used) , 1 - 3 with progressively increasing
-  // aggressiveness
-  int prune_motion_mode_level;
-
   // Prune obmc search using previous frame stats.
   int prune_obmc_prob_thresh;
 
   // Disable obmc.
   int disable_obmc;
 
-  // Gate warp evaluation for motions of type IDENTITY,
-  // TRANSLATION and AFFINE(based on number of warp neighbors)
-  int prune_warp_using_wmtype;
-
   // Prune warped motion search using previous frame stats.
   int prune_warped_prob_thresh;
 
@@ -619,10 +701,6 @@
   // De-couple wedge and mode search during interintra RDO.
   int fast_interintra_wedge_search;
 
-  // Only enable wedge search if the edge strength is greater than
-  // this threshold. A value of 0 signals that this check is disabled.
-  unsigned int disable_wedge_search_edge_thresh;
-
   // Only enable wedge search if the variance is above this threshold.
   unsigned int disable_wedge_search_var_thresh;
 
@@ -649,18 +727,24 @@
   DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
 
   // Whether to override and disable sb level coeff cost updates, if
-  // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level)
+  // cpi->oxcf.cost_upd_freq.coeff = COST_UPD_SB (i.e. set at SB level)
   int disable_sb_level_coeff_cost_upd;
 
-  // Whether to override and disable sb level mv cost updates, if
-  // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level)
-  int disable_sb_level_mv_cost_upd;
-
+  // To skip cost update for mv.
+  // mv_cost_upd_level indicates the aggressiveness of skipping.
+  // 0: update happens at each sb level.
+  // 1: update happens once for each sb row.
+  // 2: update happens once for a set of rows.
+  int mv_cost_upd_level;
   // Prune inter modes based on tpl stats
   // 0 : no pruning
   // 1 - 3 indicate increasing aggressiveness in order.
   int prune_inter_modes_based_on_tpl;
 
+  // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left
+  // neighbor blocks and qindex.
+  int prune_nearmv_using_neighbors;
+
   // Model based breakout after interpolation filter search
   // 0: no breakout
   // 1: use model based rd breakout
@@ -670,13 +754,16 @@
   // 0: No reuse
   // 1: Reuse the compound type decision
   int reuse_compound_type_decision;
+
+  // Enable/disable masked compound.
+  int disable_masked_comp;
+
+  // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT
+  // when encoding PARTITION_AB.
+  int reuse_best_prediction_for_part_ab;
 } INTER_MODE_SPEED_FEATURES;
 
 typedef struct INTERP_FILTER_SPEED_FEATURES {
-  // A source variance threshold below which filter search is disabled
-  // Choose a very large value (UINT_MAX) to use 8-tap always
-  unsigned int disable_filter_search_var_thresh;
-
   // Do limited interpolation filter search for dual filters, since best choice
   // usually includes EIGHTTAP_REGULAR.
   int use_fast_interpolation_filter_search;
@@ -713,10 +800,12 @@
   unsigned int src_var_thresh_intra_skip;
 
   // Prune intra mode candidates based on source block histogram of gradient.
+  // Applies to luma plane only.
   int intra_pruning_with_hog;
 
-  // TODO(anyone): tune intra_pruning_with_hog_thresh for various speeds.
-  float intra_pruning_with_hog_thresh;
+  // Prune intra mode candidates based on source block histogram of gradient.
+  // Applies to chroma plane only.
+  int chroma_intra_pruning_with_hog;
 
   // Enable/disable smooth intra modes.
   int disable_smooth_intra;
@@ -762,7 +851,9 @@
   int model_based_prune_tx_search_level;
 
   // Use hash table to store intra(keyframe only) txb transform search results
-  // to avoid repeated search on the same residue signal.
+  // to avoid repeated search on the same residue signal. This is currently not
+  // compatible with multi-winner mode as the hash states are reset during
+  // winner mode processing.
   int use_intra_txb_hash;
 
   // Use hash table to store inter txb transform search results
@@ -771,13 +862,14 @@
 
   // Refine TX type after fast TX search.
   int refine_fast_tx_search_results;
+
+  // Prune transform split/no_split eval based on residual properties. A value
+  // of 0 indicates no pruning, and the aggressiveness of pruning progressively
+  // increases from levels 1 to 3.
+  int prune_tx_size_level;
 } TX_SPEED_FEATURES;
 
 typedef struct RD_CALC_SPEED_FEATURES {
-  // This feature controls whether we do the expensive context update and
-  // calculation in the rd coefficient costing loop.
-  int use_fast_coef_costing;
-
   // Fast approximation of av1_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
 
@@ -795,22 +887,17 @@
   // Trellis (dynamic programming) optimization of quantized values
   TRELLIS_OPT_TYPE optimize_coefficients;
 
-  // Use a hash table to store previously computed optimized qcoeffs from
-  // expensive calls to optimize_txb.
-  int use_hash_based_trellis;
-
   // Use hash table to store macroblock RD search results
   // to avoid repeated search on the same residue signal.
   int use_mb_rd_hash;
 
-  // Flag used to control the speed of the eob selection in trellis.
-  int trellis_eob_fast;
-
-  // Calculate RD cost before doing optimize_b, and skip if the cost is large.
-  int optimize_b_precheck;
-
   // Flag used to control the extent of coeff R-D optimization
   int perform_coeff_opt;
+
+  // Enable coeff R-D optimization based on SATD values.
+  // 0    : Do not disable coeff R-D opt.
+  // 1, 2 : Disable coeff R-D opt with progressively increasing aggressiveness.
+  int perform_coeff_opt_based_on_satd;
 } RD_CALC_SPEED_FEATURES;
 
 typedef struct WINNER_MODE_SPEED_FEATURES {
@@ -834,18 +921,27 @@
   int enable_winner_mode_for_use_tx_domain_dist;
 
   // Flag used to enable processing of multiple winner modes
-  int enable_multiwinner_mode_process;
+  MULTI_WINNER_MODE_TYPE multi_winner_mode_type;
 
   // Motion mode for winner candidates:
   // 0: speed feature OFF
   // 1 / 2 : Use configured number of winner candidates
   int motion_mode_for_winner_cand;
+
+  // Early DC only txfm block prediction
+  // 0: speed feature OFF
+  // 1 / 2 : Use the configured level for different modes
+  int dc_blk_pred_level;
 } WINNER_MODE_SPEED_FEATURES;
 
 typedef struct LOOP_FILTER_SPEED_FEATURES {
   // This feature controls how the loop filter level is determined.
   LPF_PICK_METHOD lpf_pick;
 
+  // Skip some final iterations in the determination of the best loop filter
+  // level.
+  int use_coarse_filter_level_search;
+
   // Control how the CDEF strength is determined.
   CDEF_PICK_METHOD cdef_pick_method;
 
@@ -884,6 +980,9 @@
   // check intra prediction for non-RD mode.
   int check_intra_pred_nonrd;
 
+  // skip checking intra prediction if TX is skipped
+  int skip_intra_pred_if_tx_skip;
+
   // Perform coarse ME before calculating variance in variance-based partition
   int estimate_motion_for_var_based_partition;
 
@@ -950,81 +1049,152 @@
 
   // Compute variance/sse on source difference, prior to encoding superblock.
   int source_metrics_sb_nonrd;
+
+  // Flag to indicate process for handling overshoot on slide/scene change,
+  // for real-time CBR mode.
+  OVERSHOOT_DETECTION_CBR overshoot_detection_cbr;
+
+  // Check for scene/content change detection on every frame before encoding.
+  int check_scene_detection;
+
+  // Forces larger partition blocks in variance based partitioning
+  int force_large_partition_blocks;
+
+  // uses results of temporal noise estimate
+  int use_temporal_noise_estimate;
+
+  // Parameter indicating initial search window to be used in full-pixel search
+  // for nonrd_pickmode. Range [0, MAX_MVSEARCH_STEPS - 1]. Lower value
+  // indicates larger window. If set to 0, step_param is set based on internal
+  // logic in set_mv_search_params().
+  int fullpel_search_step_param;
+
+  // Skip loopfilter (and cdef) in svc real-time mode for
+  // non_reference/droppable frames.
+  int skip_loopfilter_non_reference;
+
+  // Bit mask to enable or disable intra modes for each prediction block size
+  // separately, for nonrd pickmode.
+  int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES];
+
+  // Skips mode checks more agressively in nonRD mode
+  int nonrd_agressive_skip;
 } REAL_TIME_SPEED_FEATURES;
 
+/*!\endcond */
+
+/*!
+ * \brief Top level speed vs quality trade off data struture.
+ */
 typedef struct SPEED_FEATURES {
-  /*
+  /*!
    * Sequence/frame level speed features:
    */
   HIGH_LEVEL_SPEED_FEATURES hl_sf;
 
-  /*
+  /*!
    * Speed features related to how tpl's searches are done.
    */
   TPL_SPEED_FEATURES tpl_sf;
 
-  /*
+  /*!
    * Global motion speed features:
    */
   GLOBAL_MOTION_SPEED_FEATURES gm_sf;
 
-  /*
+  /*!
    * Partition search speed features:
    */
   PARTITION_SPEED_FEATURES part_sf;
 
-  /*
+  /*!
    * Motion search speed features:
    */
   MV_SPEED_FEATURES mv_sf;
 
-  /*
+  /*!
    * Inter mode search speed features:
    */
   INTER_MODE_SPEED_FEATURES inter_sf;
 
-  /*
+  /*!
    * Interpolation filter search speed features:
    */
   INTERP_FILTER_SPEED_FEATURES interp_sf;
 
-  /*
+  /*!
    * Intra mode search speed features:
    */
   INTRA_MODE_SPEED_FEATURES intra_sf;
 
-  /*
+  /*!
    * Transform size/type search speed features:
    */
   TX_SPEED_FEATURES tx_sf;
 
-  /*
+  /*!
    * RD calculation speed features:
    */
   RD_CALC_SPEED_FEATURES rd_sf;
 
-  /*
+  /*!
    * Two-pass mode evaluation features:
    */
   WINNER_MODE_SPEED_FEATURES winner_mode_sf;
 
-  /*
+  /*!
    * In-loop filter speed features:
    */
   LOOP_FILTER_SPEED_FEATURES lpf_sf;
 
-  /*
+  /*!
    * Real-time mode speed features:
    */
   REAL_TIME_SPEED_FEATURES rt_sf;
 } SPEED_FEATURES;
+/*!\cond */
 
 struct AV1_COMP;
 
+/*!\endcond */
+/*!\brief Frame size independent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top - level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command  line
+ *
+ * \return No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting. (Higher speed gives lower
+ *         quality)
+ */
 void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
                                                   int speed);
+
+/*!\brief Frame size dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top - level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command  line
+ *
+ * \return No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting and frame size. (Higher speed
+ *         corresponds to lower quality)
+ */
 void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
                                                 int speed);
+/*!\brief Q index dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top - level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command  line
+ *
+ * \return No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting and current frame's Q index.
+ *         (Higher speed corresponds to lower quality)
+ */
 void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed);
 
 #ifdef __cplusplus
diff --git a/av1/encoder/superres_scale.c b/av1/encoder/superres_scale.c
new file mode 100644
index 0000000..bcd3fef
--- /dev/null
+++ b/av1/encoder/superres_scale.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/random.h"
+
+// Compute the horizontal frequency components' energy in a frame
+// by calculuating the 16x4 Horizontal DCT. This is to be used to
+// decide the superresolution parameters.
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
+  uint64_t freq_energy[16] = { 0 };
+  const YV12_BUFFER_CONFIG *buf = cpi->source;
+  const int bd = cpi->td.mb.e_mbd.bd;
+  const int width = buf->y_crop_width;
+  const int height = buf->y_crop_height;
+  DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
+  int n = 0;
+  memset(freq_energy, 0, sizeof(freq_energy));
+  if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
+    for (int i = 0; i < height - 4; i += 4) {
+      for (int j = 0; j < width - 16; j += 16) {
+        av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
+                            H_DCT, bd);
+        for (int k = 1; k < 16; ++k) {
+          const uint64_t this_energy =
+              ((int64_t)coeff[k] * coeff[k]) +
+              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+              ((int64_t)coeff[k + 48] * coeff[k + 48]);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
+        }
+        n++;
+      }
+    }
+  } else {
+    assert(bd == 8);
+    DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
+    for (int i = 0; i < height - 4; i += 4) {
+      for (int j = 0; j < width - 16; j += 16) {
+        for (int ii = 0; ii < 4; ++ii)
+          for (int jj = 0; jj < 16; ++jj)
+            src16[ii * 16 + jj] =
+                buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
+        av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
+        for (int k = 1; k < 16; ++k) {
+          const uint64_t this_energy =
+              ((int64_t)coeff[k] * coeff[k]) +
+              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+              ((int64_t)coeff[k + 48] * coeff[k + 48]);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
+        }
+        n++;
+      }
+    }
+  }
+  if (n) {
+    for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
+    // Convert to cumulative energy
+    for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
+  } else {
+    for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
+  }
+}
+
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Choose an arbitrary random number
+  static unsigned int seed = 56789;
+  const ResizeCfg *resize_cfg = &cpi->oxcf.resize_cfg;
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+  uint8_t new_denom = SCALE_NUMERATOR;
+
+  if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
+  switch (resize_cfg->resize_mode) {
+    case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
+    case RESIZE_FIXED:
+      if (cpi->common.current_frame.frame_type == KEY_FRAME)
+        new_denom = resize_cfg->resize_kf_scale_denominator;
+      else
+        new_denom = resize_cfg->resize_scale_denominator;
+      break;
+    case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+    default: assert(0);
+  }
+  return new_denom;
+}
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  // Empirically found to not be beneficial for image coding.
+  return oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO &&
+         cpi->sf.hl_sf.superres_auto_search_type != SUPERRES_AUTO_SOLO &&
+         cpi->rc.frames_to_key > 1;
+}
+
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
+#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
+#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
+
+static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
+                                      const RATE_CONTROL *rc) {
+  // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
+  // level.
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+    return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME;
+  } else if (gf_group->update_type[gf_group->index] == KF_UPDATE) {
+    if (rc->frames_to_key <= 1)
+      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO;
+    else
+      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME;
+  } else {
+    assert(0);
+  }
+  return 0;
+}
+
+static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
+                                                     double threshq,
+                                                     double threshp) {
+  const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
+  const double tq = threshq * q * q;
+  const double tp = threshp * energy[1];
+  const double thresh = AOMMIN(tq, tp);
+  int k;
+  for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) {
+    if (energy[k - 1] > thresh) break;
+  }
+  return 3 * SCALE_NUMERATOR - k;
+}
+
+static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
+                                             int sr_kf, int sr_arf) {
+  // Use superres for Key-frames and Alt-ref frames only.
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  if (gf_group->update_type[gf_group->index] != KF_UPDATE &&
+      gf_group->update_type[gf_group->index] != ARF_UPDATE) {
+    return SCALE_NUMERATOR;
+  }
+  if (gf_group->update_type[gf_group->index] == KF_UPDATE && !sr_kf) {
+    return SCALE_NUMERATOR;
+  }
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE && !sr_arf) {
+    return SCALE_NUMERATOR;
+  }
+
+  double energy[16];
+  analyze_hor_freq(cpi, energy);
+
+  const double energy_by_q2_thresh =
+      get_energy_by_q2_thresh(gf_group, &cpi->rc);
+  int denom = get_superres_denom_from_qindex_energy(
+      qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH);
+  /*
+  printf("\nenergy = [");
+  for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
+  printf("]\n");
+  printf("boost = %d\n",
+         (gf_group->update_type[gf_group->index] == KF_UPDATE)
+             ? cpi->rc.kf_boost
+             : cpi->rc.gfu_boost);
+  printf("denom = %d\n", denom);
+  */
+  if (av1_superres_in_recode_allowed(cpi)) {
+    assert(cpi->superres_mode != AOM_SUPERRES_NONE);
+    // Force superres to be tried in the recode loop, as full-res is also going
+    // to be tried anyway.
+    denom = AOMMAX(denom, SCALE_NUMERATOR + 1);
+  }
+  return denom;
+}
+
+static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
+  // Choose an arbitrary random number
+  static unsigned int seed = 34567;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+  uint8_t new_denom = SCALE_NUMERATOR;
+
+  // Make sure that superres mode of the frame is consistent with the
+  // sequence-level flag.
+  assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE,
+                 cpi->common.seq_params.enable_superres));
+  assert(IMPLIES(!cpi->common.seq_params.enable_superres,
+                 superres_cfg->superres_mode == AOM_SUPERRES_NONE));
+  // Make sure that superres mode for current encoding is consistent with user
+  // provided superres mode.
+  assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_AUTO,
+                 cpi->superres_mode == superres_cfg->superres_mode));
+
+  // Note: we must look at the current superres_mode to be tried in 'cpi' here,
+  // not the user given mode in 'oxcf'.
+  switch (cpi->superres_mode) {
+    case AOM_SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
+    case AOM_SUPERRES_FIXED:
+      if (cpi->common.current_frame.frame_type == KEY_FRAME)
+        new_denom = superres_cfg->superres_kf_scale_denominator;
+      else
+        new_denom = superres_cfg->superres_scale_denominator;
+      break;
+    case AOM_SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+    case AOM_SUPERRES_QTHRESH: {
+      // Do not use superres when screen content tools are used.
+      if (cpi->common.features.allow_screen_content_tools) break;
+      if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+        av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+      // Now decide the use of superres based on 'q'.
+      int bottom_index, top_index;
+      const int q = av1_rc_pick_q_and_bounds(
+          cpi, &cpi->rc, frm_dim_cfg->width, frm_dim_cfg->height,
+          cpi->gf_group.index, &bottom_index, &top_index);
+
+      const int qthresh = (frame_is_intra_only(&cpi->common))
+                              ? superres_cfg->superres_kf_qthresh
+                              : superres_cfg->superres_qthresh;
+      if (q <= qthresh) {
+        new_denom = SCALE_NUMERATOR;
+      } else {
+        new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+      }
+      break;
+    }
+    case AOM_SUPERRES_AUTO: {
+      if (cpi->common.features.allow_screen_content_tools) break;
+      if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+        av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+      // Now decide the use of superres based on 'q'.
+      int bottom_index, top_index;
+      const int q = av1_rc_pick_q_and_bounds(
+          cpi, &cpi->rc, frm_dim_cfg->width, frm_dim_cfg->height,
+          cpi->gf_group.index, &bottom_index, &top_index);
+
+      const SUPERRES_AUTO_SEARCH_TYPE sr_search_type =
+          cpi->sf.hl_sf.superres_auto_search_type;
+      const int qthresh = (sr_search_type == SUPERRES_AUTO_SOLO) ? 128 : 0;
+      if (q <= qthresh) {
+        new_denom = SCALE_NUMERATOR;  // Don't use superres.
+      } else {
+        if (sr_search_type == SUPERRES_AUTO_ALL) {
+          if (cpi->common.current_frame.frame_type == KEY_FRAME)
+            new_denom = superres_cfg->superres_kf_scale_denominator;
+          else
+            new_denom = superres_cfg->superres_scale_denominator;
+        } else {
+          new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+        }
+      }
+      break;
+    }
+    default: assert(0);
+  }
+  return new_denom;
+}
+
+static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
+  return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
+}
+
+static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
+  // Only need to check the width, as scaling is horizontal only.
+  (void)oheight;
+  return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom);
+}
+
+static int validate_size_scales(RESIZE_MODE resize_mode,
+                                aom_superres_mode superres_mode, int owidth,
+                                int oheight, size_params_type *rsz) {
+  if (dimensions_are_ok(owidth, oheight, rsz)) {  // Nothing to do.
+    return 1;
+  }
+
+  // Calculate current resize scale.
+  int resize_denom =
+      AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
+             DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
+
+  if (resize_mode != RESIZE_RANDOM && superres_mode == AOM_SUPERRES_RANDOM) {
+    // Alter superres scale as needed to enforce conformity.
+    rsz->superres_denom =
+        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
+    if (!dimensions_are_ok(owidth, oheight, rsz)) {
+      if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
+    }
+  } else if (resize_mode == RESIZE_RANDOM &&
+             superres_mode != AOM_SUPERRES_RANDOM) {
+    // Alter resize scale as needed to enforce conformity.
+    resize_denom =
+        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
+    rsz->resize_width = owidth;
+    rsz->resize_height = oheight;
+    av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                              resize_denom);
+    if (!dimensions_are_ok(owidth, oheight, rsz)) {
+      if (resize_denom > SCALE_NUMERATOR) {
+        --resize_denom;
+        rsz->resize_width = owidth;
+        rsz->resize_height = oheight;
+        av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                                  resize_denom);
+      }
+    }
+  } else if (resize_mode == RESIZE_RANDOM &&
+             superres_mode == AOM_SUPERRES_RANDOM) {
+    // Alter both resize and superres scales as needed to enforce conformity.
+    do {
+      if (resize_denom > rsz->superres_denom)
+        --resize_denom;
+      else
+        --rsz->superres_denom;
+      rsz->resize_width = owidth;
+      rsz->resize_height = oheight;
+      av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                                resize_denom);
+    } while (!dimensions_are_ok(owidth, oheight, rsz) &&
+             (resize_denom > SCALE_NUMERATOR ||
+              rsz->superres_denom > SCALE_NUMERATOR));
+  } else {  // We are allowed to alter neither resize scale nor superres
+            // scale.
+    return 0;
+  }
+  return dimensions_are_ok(owidth, oheight, rsz);
+}
+
+// Calculates resize and superres params for next frame
+static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height,
+                           SCALE_NUMERATOR };
+  int resize_denom = SCALE_NUMERATOR;
+  if (has_no_stats_stage(cpi) && cpi->use_svc &&
+      cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) {
+    rsz.resize_width = cpi->common.width;
+    rsz.resize_height = cpi->common.height;
+    return rsz;
+  }
+  if (is_stat_generation_stage(cpi)) return rsz;
+  if (resize_pending_params->width && resize_pending_params->height) {
+    rsz.resize_width = resize_pending_params->width;
+    rsz.resize_height = resize_pending_params->height;
+    resize_pending_params->width = resize_pending_params->height = 0;
+    if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE) return rsz;
+  } else {
+    resize_denom = calculate_next_resize_scale(cpi);
+    rsz.resize_width = frm_dim_cfg->width;
+    rsz.resize_height = frm_dim_cfg->height;
+    av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
+                              resize_denom);
+  }
+  rsz.superres_denom = calculate_next_superres_scale(cpi);
+  if (!validate_size_scales(oxcf->resize_cfg.resize_mode, cpi->superres_mode,
+                            frm_dim_cfg->width, frm_dim_cfg->height, &rsz))
+    assert(0 && "Invalid scale parameters");
+  return rsz;
+}
+
+static void setup_frame_size_from_params(AV1_COMP *cpi,
+                                         const size_params_type *rsz) {
+  int encode_width = rsz->resize_width;
+  int encode_height = rsz->resize_height;
+
+  AV1_COMMON *cm = &cpi->common;
+  cm->superres_upscaled_width = encode_width;
+  cm->superres_upscaled_height = encode_height;
+  cm->superres_scale_denominator = rsz->superres_denom;
+  av1_calculate_scaled_superres_size(&encode_width, &encode_height,
+                                     rsz->superres_denom);
+  av1_set_frame_size(cpi, encode_width, encode_height);
+}
+
+void av1_setup_frame_size(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  // Reset superres params from previous frame.
+  cm->superres_scale_denominator = SCALE_NUMERATOR;
+  const size_params_type rsz = calculate_next_size_params(cpi);
+  setup_frame_size_from_params(cpi, &rsz);
+
+  assert(av1_is_min_tile_width_satisfied(cm));
+}
+
+void av1_superres_post_encode(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  if (!av1_superres_scaled(cm)) return;
+
+  assert(cpi->oxcf.superres_cfg.enable_superres);
+  assert(!is_lossless_requested(&cpi->oxcf.rc_cfg));
+  assert(!cm->features.all_lossless);
+
+  av1_superres_upscale(cm, NULL);
+
+  // If regular resizing is occurring the source will need to be downscaled to
+  // match the upscaled superres resolution. Otherwise the original source is
+  // used.
+  if (!av1_resize_scaled(cm)) {
+    cpi->source = cpi->unscaled_source;
+    if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
+  } else {
+    assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
+    assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
+    // Do downscale. cm->(width|height) has been updated by
+    // av1_superres_upscale
+    cpi->source = realloc_and_scale_source(cpi, cm->superres_upscaled_width,
+                                           cm->superres_upscaled_height);
+  }
+}
diff --git a/av1/encoder/superres_scale.h b/av1/encoder/superres_scale.h
new file mode 100644
index 0000000..450a4ed9
--- /dev/null
+++ b/av1/encoder/superres_scale.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+#define AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi);
+void av1_superres_post_encode(AV1_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_SUPERRES_SCALE_H_
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index b72d8aa..c43edf9 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -28,13 +28,15 @@
   int mi_cols = cpi->common.mi_params.mi_cols;
   svc->base_framerate = 30.0;
   svc->current_superframe = 0;
+  svc->force_zero_mode_spatial_ref = 1;
+  svc->num_encoded_top_layer = 0;
 
   for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
     for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
       int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       RATE_CONTROL *const lrc = &lc->rc;
-      lrc->ni_av_qi = oxcf->worst_allowed_q;
+      lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q;
       lrc->total_actual_bits = 0;
       lrc->total_target_vs_actual = 0;
       lrc->ni_tot_qi = 0;
@@ -53,7 +55,7 @@
       lrc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
       lrc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
       lrc->buffer_level =
-          oxcf->starting_buffer_level_ms * lc->target_bandwidth / 1000;
+          oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000;
       lrc->bits_off_target = lrc->buffer_level;
       // Initialize the cyclic refresh parameters. If spatial layers are used
       // (i.e., ss_number_layers > 1), these need to be updated per spatial
@@ -64,17 +66,24 @@
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
         lc->counter_encode_maxq_scene_change = 0;
+        if (lc->map) aom_free(lc->map);
         CHECK_MEM_ERROR(cm, lc->map,
                         aom_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
         memset(lc->map, 0, mi_rows * mi_cols);
         last_coded_q_map_size =
             mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
+        if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
         CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
                         aom_malloc(last_coded_q_map_size));
         assert(MAXQ <= 255);
         memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
       }
     }
+    svc->downsample_filter_type[sl] = BILINEAR;
+    svc->downsample_filter_phase[sl] = 8;
+  }
+  if (svc->number_spatial_layers == 3) {
+    svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH;
   }
 }
 
@@ -118,6 +127,13 @@
   }
 }
 
+/*!\brief Return layer context for current layer.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi   Top level encoder structure
+ *
+ * \return LAYER_CONTEXT for current layer.
+ */
 static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) {
   return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
                                      cpi->svc.number_temporal_layers +
@@ -151,20 +167,24 @@
 void av1_restore_layer_context(AV1_COMP *const cpi) {
   GF_GROUP *const gf_group = &cpi->gf_group;
   SVC *const svc = &cpi->svc;
+  const AV1_COMMON *const cm = &cpi->common;
   LAYER_CONTEXT *const lc = get_layer_context(cpi);
   const int old_frame_since_key = cpi->rc.frames_since_key;
   const int old_frame_to_key = cpi->rc.frames_to_key;
   // Restore layer rate control.
   cpi->rc = lc->rc;
-  cpi->oxcf.target_bandwidth = lc->target_bandwidth;
-  gf_group->index = lc->group_index;
+  cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth;
+  gf_group->index = 0;
+  cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude;
+  if (cpi->mv_search_params.max_mv_magnitude == 0)
+    cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height);
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
   cpi->rc.frames_since_key = old_frame_since_key;
   cpi->rc.frames_to_key = old_frame_to_key;
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
   // for the base temporal layer.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
       svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     swap_ptr(&cr->map, &lc->map);
@@ -178,15 +198,15 @@
   // For each reference (LAST/GOLDEN) set the skip_nonzero_last/gf frame flags.
   // This is to skip testing nonzero-mv for that reference if it was last
   // refreshed (i.e., buffer slot holding that reference was refreshed) on the
-  // previous spatial layer at the same time (current_superframe).
-  if (svc->external_ref_frame_config) {
+  // previous spatial layer(s) at the same time (current_superframe).
+  if (svc->external_ref_frame_config && svc->force_zero_mode_spatial_ref) {
     int ref_frame_idx = svc->ref_idx[LAST_FRAME - 1];
     if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
-        svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1)
+        svc->buffer_spatial_layer[ref_frame_idx] <= svc->spatial_layer_id - 1)
       svc->skip_nonzeromv_last = 1;
     ref_frame_idx = svc->ref_idx[GOLDEN_FRAME - 1];
     if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
-        svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1)
+        svc->buffer_spatial_layer[ref_frame_idx] <= svc->spatial_layer_id - 1)
       svc->skip_nonzeromv_gf = 1;
   }
 }
@@ -194,14 +214,16 @@
 void av1_save_layer_context(AV1_COMP *const cpi) {
   GF_GROUP *const gf_group = &cpi->gf_group;
   SVC *const svc = &cpi->svc;
+  const AV1_COMMON *const cm = &cpi->common;
   LAYER_CONTEXT *lc = get_layer_context(cpi);
   lc->rc = cpi->rc;
-  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
+  lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth;
   lc->group_index = gf_group->index;
+  lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude;
   if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate;
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
   // for the base temporal layer.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
       cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     signed char *temp = lc->map;
@@ -231,10 +253,40 @@
       }
     }
   }
+  for (unsigned int i = 0; i < REF_FRAMES; i++) {
+    if (frame_is_intra_only(cm) ||
+        cm->current_frame.refresh_frame_flags & (1 << i)) {
+      svc->spatial_layer_fb[i] = svc->spatial_layer_id;
+      svc->temporal_layer_fb[i] = svc->temporal_layer_id;
+    }
+  }
   if (svc->spatial_layer_id == svc->number_spatial_layers - 1)
     svc->current_superframe++;
 }
 
+int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) {
+  const SVC *const svc = &cpi->svc;
+  const AV1_COMMON *const cm = &cpi->common;
+  int wanted_fb = -1;
+  int primary_ref_frame = PRIMARY_REF_NONE;
+  for (unsigned int i = 0; i < REF_FRAMES; i++) {
+    if (svc->spatial_layer_fb[i] == svc->spatial_layer_id &&
+        svc->temporal_layer_fb[i] == svc->temporal_layer_id) {
+      wanted_fb = i;
+      break;
+    }
+  }
+  if (wanted_fb != -1) {
+    for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+      if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) {
+        primary_ref_frame = ref_frame - LAST_FRAME;
+        break;
+      }
+    }
+  }
+  return primary_ref_frame;
+}
+
 void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
@@ -247,7 +299,6 @@
   }
 }
 
-// Reset on key frame: reset counters, references and buffer updates.
 void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) {
   SVC *const svc = &cpi->svc;
   LAYER_CONTEXT *lc = NULL;
@@ -261,6 +312,18 @@
   av1_restore_layer_context(cpi);
 }
 
+/*!\brief Get resolution for current layer.
+ *
+ * \ingroup rate_control
+ * \param[in]       width_org    Original width, unscaled
+ * \param[in]       height_org   Original height, unscaled
+ * \param[in]       num          Numerator for the scale ratio
+ * \param[in]       den          Denominator for the scale ratio
+ * \param[in]       width_out    Output width, scaled for current layer
+ * \param[in]       height_out   Output height, scaled for current layer
+ *
+ * \return Nothing is returned. Instead the scaled width and height are set.
+ */
 static void get_layer_resolution(const int width_org, const int height_org,
                                  const int num, const int den, int *width_out,
                                  int *height_out) {
@@ -281,8 +344,8 @@
   int width = 0, height = 0;
   lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
                            svc->temporal_layer_id];
-  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
-                       lc->scaling_factor_num, lc->scaling_factor_den, &width,
-                       &height);
+  get_layer_resolution(cpi->oxcf.frm_dim_cfg.width,
+                       cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num,
+                       lc->scaling_factor_den, &width, &height);
   av1_set_size_literal(cpi, width, height);
 }
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index 7cb85a3..3c7ae67 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -19,7 +19,12 @@
 extern "C" {
 #endif
 
+/*!
+ * \brief The stucture of quantities related to each spatial and temporal layer.
+ * \ingroup SVC
+ */
 typedef struct {
+  /*!\cond */
   RATE_CONTROL rc;
   int framerate_factor;
   int64_t layer_target_bitrate;
@@ -32,27 +37,73 @@
   int max_q;
   int min_q;
   int frames_from_key_frame;
-  // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+  /*!\endcond */
+
+  /*!
+   * Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+   */
   int sb_index;
+  /*!
+   * Segmentation map
+   */
   int8_t *map;
+  /*!
+   * Segmentation map for last coded quantization paramters.
+   */
   uint8_t *last_coded_q_map;
+
+  /*!
+   * Number of blocks on segment 1
+   */
   int actual_num_seg1_blocks;
+
+  /*!
+   * Number of blocks on segment 2
+   */
   int actual_num_seg2_blocks;
+  /*!
+   * Counter used to detect scene change.
+   */
   int counter_encode_maxq_scene_change;
+
+  /*!
+   * Speed settings for each layer.
+   */
   uint8_t speed;
+  /*!
+   * GF group index.
+   */
   unsigned char group_index;
+  /*!
+   * If current layer is key frame.
+   */
+  int is_key_frame;
+  /*!
+   * Maximum motion magnitude of previous encoded layer.
+   */
+  int max_mv_magnitude;
 } LAYER_CONTEXT;
 
+/*!
+ * \brief The stucture of SVC.
+ * \ingroup SVC
+ */
 typedef struct SVC {
+  /*!\cond */
   int spatial_layer_id;
   int temporal_layer_id;
   int number_spatial_layers;
   int number_temporal_layers;
   int external_ref_frame_config;
   int non_reference_frame;
-  // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
-  // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  /*!\endcond */
+
+  /*!
+   * LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+   * BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+   */
   int reference[INTER_REFS_PER_FRAME];
+  /*!\cond */
   int ref_idx[INTER_REFS_PER_FRAME];
   int refresh[REF_FRAMES];
   double base_framerate;
@@ -61,37 +112,149 @@
   unsigned char buffer_spatial_layer[REF_FRAMES];
   int skip_nonzeromv_last;
   int skip_nonzeromv_gf;
-  // Layer context used for rate control in one pass temporal CBR mode or
-  // two pass spatial mode.
+  int spatial_layer_fb[REF_FRAMES];
+  int temporal_layer_fb[REF_FRAMES];
+  int num_encoded_top_layer;
+  /*!\endcond */
+
+  /*!
+   * Layer context used for rate control in CBR mode.
+   */
   LAYER_CONTEXT layer_context[AOM_MAX_LAYERS];
+
+  /*!
+   * EIGHTTAP_SMOOTH or BILINEAR
+   */
+  InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS];
+
+  /*!
+   * Downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
+   * = 8 will center the target pixel and get a symmetric averaging filter.
+   */
+  int downsample_filter_phase[AOM_MAX_SS_LAYERS];
+
+  /*!
+   * Force zero-mv in mode search for the spatial/inter-layer reference.
+   */
+  int force_zero_mode_spatial_ref;
 } SVC;
 
 struct AV1_COMP;
 
-// Initialize layer context data from init_config().
+/*!\brief Initialize layer context data from init_config().
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned. Set cpi->svc.
+ */
 void av1_init_layer_context(struct AV1_COMP *const cpi);
 
-// Update the layer context from a change_config() call.
+/*!\brief Update the layer context from a change_config() call.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ * \param[in]       target_bandwidth  Total target bandwidth
+ *
+ * \return  Nothing returned. Buffer level for each layer is set.
+ */
 void av1_update_layer_context_change_config(struct AV1_COMP *const cpi,
                                             const int64_t target_bandwidth);
 
-// Prior to encoding the frame, update framerate-related quantities
-// for the current temporal layer.
+/*!\brief Prior to encoding the frame, update framerate-related quantities
+          for the current temporal layer.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned. Frame related quantities for current temporal
+ layer are updated.
+ */
 void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi);
 
-// Prior to encoding the frame, set the layer context, for the current layer
-// to be encoded, to the cpi struct.
+/*!\brief Prior to encoding the frame, set the layer context, for the current
+ layer to be encoded, to the cpi struct.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned. Layer context for current layer is set.
+ */
 void av1_restore_layer_context(struct AV1_COMP *const cpi);
 
-// Save the layer context after encoding the frame.
+/*!\brief Save the layer context after encoding the frame.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned.
+ */
 void av1_save_layer_context(struct AV1_COMP *const cpi);
 
+/*!\brief Free the memory used for cyclic refresh in layer context.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned.
+ */
 void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi);
 
+/*!\brief Reset on key frame: reset counters, references and buffer updates.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ * \param[in]       is_key  Whether current layer is key frame
+ *
+ * \return  Nothing returned.
+ */
 void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key);
 
+/*!\brief Before encoding, set resolutions and allocate compressor data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned.
+ */
 void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi);
 
+/*!\brief Get primary reference frame for current layer
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  The primary reference frame for current layer.
+ */
+int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index a637df5..0b6f4d7 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -20,7 +20,9 @@
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
 #include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/mcomp.h"
@@ -35,34 +37,56 @@
 #include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
 
+/*!\cond */
+
 // NOTE: All `tf` in this file means `temporal filtering`.
 
-// Does motion search for blocks in temporal filtering. This is the first step
-// for temporal filtering. More specifically, given a frame to be filtered and
-// another frame as reference, this function searches the reference frame to
-// find out the most alike block as that from the frame to be filtered. This
-// found block will be further used for weighted averaging.
-// NOTE: Besides doing motion search for the entire block, this function will
-// also do motion search for each 1/4 sub-block to get more precise prediction.
-// Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   frame_to_filter: Pointer to the frame to be filtered.
-//   ref_frame: Pointer to the reference frame.
-//   block_size: Block size used for motion search.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   ref_mv: Reference motion vector, which is commonly inherited from the
-//           motion search result of previous frame.
-//   subblock_mvs: Pointer to the result motion vectors for 4 sub-blocks.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-// Returns:
-//   Search error (MSE) of the entire block.
-static int tf_motion_search(AV1_COMP *cpi,
-                            const YV12_BUFFER_CONFIG *frame_to_filter,
-                            const YV12_BUFFER_CONFIG *ref_frame,
-                            const BLOCK_SIZE block_size, const int mb_row,
-                            const int mb_col, MV *ref_mv, MV *subblock_mvs,
-                            int *subblock_mses) {
+// Forward Declaration.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         MV *subblock_mvs, int *subblock_mses);
+
+/*!\endcond */
+/*!\brief Does motion search for blocks in temporal filtering. This is
+ *  the first step for temporal filtering. More specifically, given a frame to
+ * be filtered and another frame as reference, this function searches the
+ * reference frame to find out the most similar block as that from the frame
+ * to be filtered. This found block will be further used for weighted
+ * averaging.
+ *
+ * NOTE: Besides doing motion search for the entire block, this function will
+ *       also do motion search for each 1/4 sub-block to get more precise
+ *       predictions. Then, this function will determines whether to use 4
+ *       sub-blocks to replace the entire block. If we do need to split the
+ *       entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
+ *       the searched motion vector and search error (MSE) w.r.t. each sub-block
+ *       respectively. Otherwise, the 4 elements will be the same, all of which
+ *       are assigned as the searched motion vector and search error (MSE) for
+ *       the entire block.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi             Top level encoder instance structure
+ * \param[in]   mb              Pointer to macroblock
+ * \param[in]   frame_to_filter Pointer to the frame to be filtered
+ * \param[in]   ref_frame       Pointer to the reference frame
+ * \param[in]   block_size      Block size used for motion search
+ * \param[in]   mb_row          Row index of the block in the frame
+ * \param[in]   mb_col          Column index of the block in the frame
+ * \param[in]   ref_mv          Reference motion vector, which is commonly
+ *                              inherited from the motion search result of
+ *                              previous frame.
+ * \param[out]  subblock_mvs    Pointer to the motion vectors for 4 sub-blocks
+ * \param[out]  subblock_mses   Pointer to the search errors (MSE) for 4
+ *                              sub-blocks
+ *
+ * \return Nothing will be returned. Results are saved in subblock_mvs and
+ *         subblock_mses
+ */
+static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
+                             const YV12_BUFFER_CONFIG *frame_to_filter,
+                             const YV12_BUFFER_CONFIG *ref_frame,
+                             const BLOCK_SIZE block_size, const int mb_row,
+                             const int mb_col, MV *ref_mv, MV *subblock_mvs,
+                             int *subblock_mses) {
   // Frame information
   const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
 
@@ -75,19 +99,16 @@
   const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
 
   // Save input state.
-  MACROBLOCK *const mb = &cpi->td.mb;
   MACROBLOCKD *const mbd = &mb->e_mbd;
   const struct buf_2d ori_src_buf = mb->plane[0].src;
   const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
-  const MV_COST_TYPE ori_mv_cost_type = mb->mv_cost_type;
 
   // Parameters used for motion search.
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   SUBPEL_MOTION_SEARCH_PARAMS ms_params;
-
-  const search_site_config ss_cfg =
-      cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
-  const SEARCH_METHODS full_search_method = NSTEP;
+  const SEARCH_METHODS search_method = NSTEP;
+  const search_site_config *search_site_cfg =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
   const int step_param = av1_init_search_range(
       AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
   const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
@@ -113,23 +134,21 @@
   int cost_list[5];
 
   // Do motion search.
-  // NOTE: In `av1_full_pixel_search()` and `find_fractional_mv_step()`, the
-  // searched result will be stored in `mb->best_mv`.
-  int_mv best_mv;
+  int_mv best_mv;  // Searched motion vector.
   int block_mse = INT_MAX;
-  mb->mv_cost_type = mv_cost_type;
+  MV block_mv = kZeroMv;
 
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
-                                     &baseline_mv, &ss_cfg);
+                                     &baseline_mv, search_site_cfg,
+                                     /*fine_search_interval=*/0);
+  av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
   full_ms_params.run_mesh_search = 1;
-  full_ms_params.search_method = full_search_method;
+  full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
   av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                         cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
                         NULL);
 
-  // Since we are merely refining the result from full pixel search, we don't
-  // need regularization for subpel search
-  mb->mv_cost_type = MV_COST_NONE;
   if (force_integer_mv == 1) {  // Only do full search on the entire block.
     const int mv_row = best_mv.as_mv.row;
     const int mv_col = best_mv.as_mv.col;
@@ -140,18 +159,22 @@
         ref_frame->y_buffer + y_offset + mv_offset, y_stride,
         frame_to_filter->y_buffer + y_offset, y_stride, &sse);
     block_mse = DIVIDE_AND_ROUND(error, mb_pels);
-    mb->e_mbd.mi[0]->mv[0] = best_mv;
+    block_mv = best_mv.as_mv;
   } else {  // Do fractional search on the entire block and all sub-blocks.
     av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
                                       &baseline_mv, cost_list);
     ms_params.forced_stop = EIGHTH_PEL;
     ms_params.var_params.subpel_search_type = subpel_search_type;
+    // Since we are merely refining the result from full pixel search, we don't
+    // need regularization for subpel search
+    ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+
     MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
     error = cpi->mv_search_params.find_fractional_mv_step(
         &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
         &distortion, &sse, NULL);
     block_mse = DIVIDE_AND_ROUND(error, mb_pels);
-    mb->e_mbd.mi[0]->mv[0] = best_mv;
+    block_mv = best_mv.as_mv;
     *ref_mv = best_mv.as_mv;
     // On 4 sub-blocks.
     const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1];
@@ -166,23 +189,27 @@
         const int offset = i * y_stride + j;
         mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
         mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
-        mb->mv_cost_type = mv_cost_type;
-
-        av1_make_default_fullpel_ms_params(
-            &full_ms_params, cpi, mb, subblock_size, &baseline_mv, &ss_cfg);
+        av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb,
+                                           subblock_size, &baseline_mv,
+                                           search_site_cfg,
+                                           /*fine_search_interval=*/0);
+        av1_set_mv_search_method(&full_ms_params, search_site_cfg,
+                                 search_method);
         full_ms_params.run_mesh_search = 1;
-        full_ms_params.search_method = full_search_method;
+        full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
         av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                               cond_cost_list(cpi, cost_list),
                               &best_mv.as_fullmv, NULL);
 
-        // Since we are merely refining the result from full pixel search, we
-        // don't need regularization for subpel search
-        mb->mv_cost_type = MV_COST_NONE;
         av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
                                           &baseline_mv, cost_list);
         ms_params.forced_stop = EIGHTH_PEL;
         ms_params.var_params.subpel_search_type = subpel_search_type;
+        // Since we are merely refining the result from full pixel search, we
+        // don't need regularization for subpel search
+        ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+
         subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
         error = cpi->mv_search_params.find_fractional_mv_step(
             &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
@@ -197,51 +224,35 @@
   // Restore input state.
   mb->plane[0].src = ori_src_buf;
   mbd->plane[0].pre[0] = ori_pre_buf;
-  mb->mv_cost_type = ori_mv_cost_type;
 
-  return block_mse;
-}
+  // Make partition decision.
+  tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
+                               subblock_mses);
 
-// Helper function to get weight according to thresholds.
-static INLINE int get_weight_by_thresh(const int value, const int low,
-                                       const int high) {
-  return value < low ? 2 : value < high ? 1 : 0;
-}
-
-// Gets filter weight for blocks in temporal filtering. The weights will be
-// assigned based on the motion search errors.
-// NOTE: Besides assigning filter weight for the block, this function will also
-// determine whether to split the entire block into 4 sub-blocks for further
-// filtering.
-// TODO(any): Many magic numbers are used in this function. They may be tuned
-// to improve the performance.
-// Inputs:
-//   block_mse: Motion search error (MSE) for the entire block.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-//   is_second_arf: Whether the to-filter frame is the second ARF. This field
-//                  will affect the filter weight for the to-filter frame.
-//   subblock_filter_weights: Pointer to the assigned filter weight for each
-//                            sub-block. If not using sub-blocks, the first
-//                            element will be used for the entire block.
-// Returns: Whether to use 4 sub-blocks to replace the original block.
-static int tf_get_filter_weight(const int block_mse, const int *subblock_mses,
-                                const int is_second_arf,
-                                int *subblock_filter_weights) {
-  // `block_mse` is initialized as INT_MAX and will be overwritten after the
-  // motion search with reference frame, therefore INT_MAX can ONLY be accessed
-  // by to-filter frame.
-  if (block_mse == INT_MAX) {
-    const int weight = TF_ENABLE_PLANEWISE_STRATEGY
-                           ? TF_PLANEWISE_FILTER_WEIGHT_SCALE
-                           : is_second_arf ? 64 : 32;
-    subblock_filter_weights[0] = subblock_filter_weights[1] =
-        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
-    return 0;
+  // Do not pass down the reference motion vector if error is too large.
+  const int thresh = (min_frame_size >= 720) ? 12 : 3;
+  if (block_mse > (thresh << (mbd->bd - 8))) {
+    *ref_mv = kZeroMv;
   }
+}
+/*!\cond */
 
-  const int thresh_low = is_second_arf ? 20 : 40;
-  const int thresh_high = is_second_arf ? 40 : 80;
-
+// Determines whether to split the entire block to 4 sub-blocks for filtering.
+// In particular, this decision is made based on the comparison between the
+// motion search error of the entire block and the errors of all sub-blocks.
+// Inputs:
+//   block_mv: Motion vector for the entire block (ONLY as reference).
+//   block_mse: Motion search error (MSE) for the entire block (ONLY as
+//              reference).
+//   subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
+//                 modified based on the partition decision).
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
+//                  be modified based on the partition decision).
+// Returns:
+//   Nothing will be returned. Results are saved in `subblock_mvs` and
+//   `subblock_mses`.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         MV *subblock_mvs, int *subblock_mses) {
   int min_subblock_mse = INT_MAX;
   int max_subblock_mse = INT_MIN;
   int sum_subblock_mse = 0;
@@ -249,20 +260,18 @@
     sum_subblock_mse += subblock_mses[i];
     min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
     max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
-    subblock_filter_weights[i] =
-        get_weight_by_thresh(subblock_mses[i], thresh_low, thresh_high);
   }
 
+  // TODO(any): The following magic numbers may be tuned to improve the
+  // performance OR find a way to get rid of these magic numbers.
   if (((block_mse * 15 < sum_subblock_mse * 4) &&
        max_subblock_mse - min_subblock_mse < 48) ||
       ((block_mse * 14 < sum_subblock_mse * 4) &&
        max_subblock_mse - min_subblock_mse < 24)) {  // No split.
-    const int weight = get_weight_by_thresh(block_mse, thresh_low, thresh_high);
-    subblock_filter_weights[0] = subblock_filter_weights[1] =
-        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
-    return 0;
-  } else {  // Do split.
-    return 1;
+    for (int i = 0; i < 4; ++i) {
+      subblock_mvs[i] = block_mv;
+      subblock_mses[i] = block_mse;
+    }
   }
 }
 
@@ -271,58 +280,53 @@
   return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
 }
 
-// Builds predictor for blocks in temporal filtering. This is the second step
-// for temporal filtering, which is to construct predictions from all reference
-// frames INCLUDING the frame to be filtered itself. These predictors are built
-// based on the motion search results (motion vector is set as 0 for the frame
-// to be filtered), and will be futher used for weighted averaging.
-// Inputs:
-//   ref_frame: Pointer to the reference frame (or the frame to be filtered).
-//   mbd: Pointer to the block for filtering. Besides containing the subsampling
-//        information of all planes, this field also gives the searched motion
-//        vector for the entire block, i.e., `mbd->mi[0]->mv[0]`. This vector
-//        should be 0 if the `ref_frame` itself is the frame to be filtered.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   scale: Scaling factor.
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   subblock_mvs: The motion vectors for each sub-block (row-major order).
-//   pred: Pointer to the predictor to build.
-// Returns:
-//   Nothing will be returned. But the content to which `pred` points will be
-//   modified.
+/*!\endcond */
+/*!\brief Builds predictor for blocks in temporal filtering. This is the
+ * second step for temporal filtering, which is to construct predictions from
+ * all reference frames INCLUDING the frame to be filtered itself. These
+ * predictors are built based on the motion search results (motion vector is
+ * set as 0 for the frame to be filtered), and will be futher used for
+ * weighted averaging.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   ref_frame      Pointer to the reference frame (or the frame
+ *                             to be filtered)
+ * \param[in]   mbd            Pointer to the block for filtering. Besides
+ *                             containing the subsampling information of all
+ *                             planes, this field also gives the searched
+ *                             motion vector for the entire block, i.e.,
+ *                             `mbd->mi[0]->mv[0]`. This vector  should be 0
+ *                             if the `ref_frame` itself is the frame to be
+ *                             filtered.
+ * \param[in]   block_size     Size of the block
+ * \param[in]   mb_row         Row index of the block in the frame
+ * \param[in]   mb_col         Column index of the block in the frame
+ * \param[in]   num_planes     Number of planes in the frame
+ * \param[in]   scale          Scaling factor
+ * \param[in]   subblock_mvs   The motion vectors for each sub-block (row-major
+ *                             order)
+ * \param[out]  pred           Pointer to the predictor to be built
+ *
+ * \return Nothing returned, But the contents of `pred` will be modified
+ */
 static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
                                const MACROBLOCKD *mbd,
                                const BLOCK_SIZE block_size, const int mb_row,
                                const int mb_col, const int num_planes,
                                const struct scale_factors *scale,
-                               const int use_subblock, const MV *subblock_mvs,
-                               uint8_t *pred) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
+                               const MV *subblock_mvs, uint8_t *pred) {
   // Information of the entire block.
   const int mb_height = block_size_high[block_size];  // Height.
   const int mb_width = block_size_wide[block_size];   // Width.
-  const int mb_pels = mb_height * mb_width;           // Number of pixels.
   const int mb_y = mb_height * mb_row;                // Y-coord (Top-left).
   const int mb_x = mb_width * mb_col;                 // X-coord (Top-left).
   const int bit_depth = mbd->bd;                      // Bit depth.
   const int is_intrabc = 0;                           // Is intra-copied?
-  const int mb_mv_row = mbd->mi[0]->mv[0].as_mv.row;  // Motion vector (y).
-  const int mb_mv_col = mbd->mi[0]->mv[0].as_mv.col;  // Motion vector (x).
-  const MV mb_mv = { (int16_t)mb_mv_row, (int16_t)mb_mv_col };
   const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
 
-  // Information of each sub-block (actually in use).
-  const int num_blocks = use_subblock ? 2 : 1;  // Num of blocks on each side.
-  const int block_height = mb_height >> (num_blocks - 1);  // Height.
-  const int block_width = mb_width >> (num_blocks - 1);    // Width.
-
   // Default interpolation filters.
   const int_interpfilters interp_filters =
-      av1_broadcast_interp_filter(MULTITAP_SHARP);
+      av1_broadcast_interp_filter(MULTITAP_SHARP2);
 
   // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
   int plane_offset = 0;
@@ -334,8 +338,8 @@
     const int plane_w = mb_width >> subsampling_x;   // Plane width.
     const int plane_y = mb_y >> subsampling_y;       // Y-coord (Top-left).
     const int plane_x = mb_x >> subsampling_x;       // X-coord (Top-left).
-    const int h = block_height >> subsampling_y;     // Sub-block height.
-    const int w = block_width >> subsampling_x;      // Sub-block width.
+    const int h = plane_h >> 1;                      // Sub-block height.
+    const int w = plane_w >> 1;                      // Sub-block width.
     const int is_y_plane = (plane == 0);             // Is Y-plane?
 
     const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
@@ -343,12 +347,12 @@
                                     ref_frame->heights[is_y_plane ? 0 : 1],
                                     ref_frame->strides[is_y_plane ? 0 : 1] };
 
-    // Handle entire block or sub-blocks if needed.
+    // Handle each subblock.
     int subblock_idx = 0;
     for (int i = 0; i < plane_h; i += h) {
       for (int j = 0; j < plane_w; j += w) {
         // Choose proper motion vector.
-        const MV mv = use_subblock ? subblock_mvs[subblock_idx] : mb_mv;
+        const MV mv = subblock_mvs[subblock_idx++];
         assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
                mv.col >= INT16_MIN && mv.col <= INT16_MAX);
 
@@ -363,13 +367,12 @@
         inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
         av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
                                           plane_w, &mv, &inter_pred_params);
-
-        ++subblock_idx;
       }
     }
-    plane_offset += mb_pels;
+    plane_offset += plane_h * plane_w;
   }
 }
+/*!\cond */
 
 // Computes temporal filter weights and accumulators for the frame to be
 // filtered. More concretely, the filter weights for all pixels are the same.
@@ -378,27 +381,22 @@
 //        subsampling information of all planes as well as the bit-depth.
 //   block_size: Size of the block.
 //   num_planes: Number of planes in the frame.
-//   filter_weight: Weight used for filtering.
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
 // Returns:
 //   Nothing will be returned. But the content to which `accum` and `pred`
 //   point will be modified.
-void av1_apply_temporal_filter_self(const MACROBLOCKD *mbd,
-                                    const BLOCK_SIZE block_size,
-                                    const int num_planes,
-                                    const int filter_weight,
-                                    const uint8_t *pred, uint32_t *accum,
-                                    uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
+void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame,
+                                   const MACROBLOCKD *mbd,
+                                   const BLOCK_SIZE block_size,
+                                   const int mb_row, const int mb_col,
+                                   const int num_planes, uint32_t *accum,
+                                   uint16_t *count) {
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
   const int is_high_bitdepth = is_cur_buf_hbd(mbd);
-  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
 
   int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -407,17 +405,27 @@
     const int h = mb_height >> subsampling_y;  // Plane height.
     const int w = mb_width >> subsampling_x;   // Plane width.
 
+    const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1];
+    const uint8_t *buf8 = ref_frame->buffers[plane];
+    const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8);
+    const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+
     int pred_idx = 0;
+    int pixel_idx = 0;
     for (int i = 0; i < h; ++i) {
       for (int j = 0; j < w; ++j) {
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
-        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        accum[idx] += filter_weight * pred_value;
-        count[idx] += filter_weight;
+        const int pred_value = is_high_bitdepth
+                                   ? buf16[frame_offset + pixel_idx]
+                                   : buf8[frame_offset + pixel_idx];
+        accum[idx] += TF_WEIGHT_SCALE * pred_value;
+        count[idx] += TF_WEIGHT_SCALE;
         ++pred_idx;
+        ++pixel_idx;
       }
+      pixel_idx += (frame_stride - w);
     }
-    plane_offset += mb_pels;
+    plane_offset += h * w;
   }
 }
 
@@ -468,239 +476,155 @@
   }
 }
 
-// Function to adjust the filter weight when use YUV strategy.
+// Function to accumulate pixel-wise squared difference between two luma buffers
+// to be consumed while filtering the chroma planes.
 // Inputs:
-//   filter_weight: Original filter weight.
-//   sum_square_diff: Sum of squared difference between input frame and
-//                    prediction. This field is computed pixel by pixel, and
-//                    is used as a reference for the filter weight adjustment.
-//   num_ref_pixels: Number of pixels used to compute the `sum_square_diff`.
-//                   This field should align with the above lookup tables
-//                   `filter_weight_adjustment_lookup_table_yuv` and
-//                   `highbd_filter_weight_adjustment_lookup_table_yuv`.
-//   strength: Strength for filter weight adjustment.
+//   square_diff: Pointer to squared differences from luma plane.
+//   luma_sse_sum: Pointer to save the sum of luma squared differences.
+//   block_height: Height of block for computation.
+//   block_width: Width of block for computation.
+//   ss_x_shift: Chroma subsampling shift in 'X' direction
+//   ss_y_shift: Chroma subsampling shift in 'Y' direction
 // Returns:
-//   Adjusted filter weight which will finally be used for filtering.
-static INLINE int adjust_filter_weight_yuv(const int filter_weight,
-                                           const uint64_t sum_square_diff,
-                                           const int num_ref_pixels,
-                                           const int strength) {
-  int modifier =
-      (int)(AOMMIN(sum_square_diff * TF_YUV_FILTER_WEIGHT_SCALE, INT32_MAX)) /
-      num_ref_pixels;
-  const int rounding = (1 << strength) >> 1;
-  modifier = (modifier + rounding) >> strength;
-  return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
-}
-
-// Applies temporal filter with YUV strategy.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all YUV planes.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   strength: Strength for filter weight adjustment.
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   subblock_filter_weights: The filter weights for each sub-block (row-major
-//                            order). If `use_subblock` is set as 0, the first
-//                            weight will be applied to the entire block.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_yuv_c(
-    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  // Block information.
-  const int mb_height = block_size_high[block_size];
-  const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
-  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
-
-  // Allocate memory for pixel-wise squared differences for all planes. They,
-  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
-  uint32_t *square_diff =
-      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
-  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
-
-  int plane_offset = 0;
-  for (int plane = 0; plane < num_planes; ++plane) {
-    // Locate pixel on reference frame.
-    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
-    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
-    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
-    const uint8_t *ref = frame_to_filter->buffers[plane];
-    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
-                        plane_w, plane_h, plane_w, is_high_bitdepth,
-                        square_diff + plane_offset);
-    plane_offset += mb_pels;
-  }
-
-  // Get window size for pixel-wise filtering.
-  assert(TF_YUV_FILTER_WINDOW_LENGTH % 2 == 1);
-  const int half_window = TF_YUV_FILTER_WINDOW_LENGTH >> 1;
-
-  // Handle planes in sequence.
-  plane_offset = 0;
-  for (int plane = 0; plane < num_planes; ++plane) {
-    const int subsampling_y = mbd->plane[plane].subsampling_y;
-    const int subsampling_x = mbd->plane[plane].subsampling_x;
-    const int h = mb_height >> subsampling_y;  // Plane height.
-    const int w = mb_width >> subsampling_x;   // Plane width.
-
-    // Perform filtering.
-    int pred_idx = 0;
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
-        // non-local mean approach
-        uint64_t sum_square_diff = 0;
-        int num_ref_pixels = 0;
-
-        for (int wi = -half_window; wi <= half_window; ++wi) {
-          for (int wj = -half_window; wj <= half_window; ++wj) {
-            const int y = i + wi;  // Y-coord on the current plane.
-            const int x = j + wj;  // X-coord on the current plane.
-            if (y >= 0 && y < h && x >= 0 && x < w) {
-              sum_square_diff += square_diff[plane_offset + y * w + x];
-              ++num_ref_pixels;
-            }
-          }
+//   Nothing will be returned. But the content to which `luma_sse_sum` points
+//   will be modified.
+void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
+                               int block_height, int block_width,
+                               int ss_x_shift, int ss_y_shift) {
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+        for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+          const int yy = (i << ss_y_shift) + ii;     // Y-coord on Y-plane.
+          const int xx = (j << ss_x_shift) + jj;     // X-coord on Y-plane.
+          const int ww = block_width << ss_x_shift;  // Width of Y-plane.
+          luma_sse_sum[i * block_width + j] += square_diff[yy * ww + xx];
         }
-
-        if (plane == 0) {  // Filter Y-plane using both U-plane and V-plane.
-          for (int p = 1; p < num_planes; ++p) {
-            const int ss_y_shift = mbd->plane[p].subsampling_y - subsampling_y;
-            const int ss_x_shift = mbd->plane[p].subsampling_x - subsampling_x;
-            const int yy = i >> ss_y_shift;  // Y-coord on UV-plane.
-            const int xx = j >> ss_x_shift;  // X-coord on UV-plane.
-            const int ww = w >> ss_x_shift;  // Width of UV-plane.
-            sum_square_diff += square_diff[p * mb_pels + yy * ww + xx];
-            ++num_ref_pixels;
-          }
-        } else {  // Filter U-plane and V-plane using Y-plane.
-          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
-          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              const int ww = w << ss_x_shift;         // Width of Y-plane.
-              sum_square_diff += square_diff[yy * ww + xx];
-              ++num_ref_pixels;
-            }
-          }
-        }
-
-        // Base filter weight estimated by motion search error.
-        const int subblock_idx =
-            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
-        const int filter_weight = subblock_filter_weights[subblock_idx];
-
-        const int idx = plane_offset + pred_idx;  // Index with plane shift.
-        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        const int adjusted_weight = adjust_filter_weight_yuv(
-            filter_weight, sum_square_diff, num_ref_pixels, strength);
-        accum[idx] += adjusted_weight * pred_value;
-        count[idx] += adjusted_weight;
-
-        ++pred_idx;
       }
     }
-    plane_offset += mb_pels;
   }
-
-  aom_free(square_diff);
 }
 
-// Applies temporal filter with plane-wise strategy.
-// The strategy of filter weight adjustment is different from the function
-// `av1_apply_temporal_filter_yuv_c()`.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all planes.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
-//                 with each plane (in Y, U, V order).
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   block_mse: Motion search error (MSE) for the entire block.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-//   q_factor: Quantization factor. This is actually the `q` defined in libaom,
-//             which is converted from `qindex`.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_planewise_c(
+/*!\endcond */
+/*!\brief Applies temporal filtering. NOTE that there are various optimised
+ * versions of this function called where the appropriate instruction set is
+ * supported.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   frame_to_filter Pointer to the frame to be filtered, which is
+ *                              used as reference to compute squared
+ *                              difference from the predictor.
+ * \param[in]   mbd             Pointer to the block for filtering, ONLY used
+ *                              to get subsampling information for the  planes
+ * \param[in]   block_size      Size of the block
+ * \param[in]   mb_row          Row index of the block in the frame
+ * \param[in]   mb_col          Column index of the block in the frame
+ * \param[in]   num_planes      Number of planes in the frame
+ * \param[in]   noise_levels    Estimated noise levels for each plane
+ *                              in the frame (Y,U,V)
+ * \param[in]   subblock_mvs    Pointer to the motion vectors for 4 sub-blocks
+ * \param[in]   subblock_mses   Pointer to the search errors (MSE) for 4
+ *                              sub-blocks
+ * \param[in]   q_factor        Quantization factor. This is actually the `q`
+ *                              defined in libaom, converted from `qindex`
+ * \param[in]   filter_strength Filtering strength. This value lies in range
+ *                              [0, 6] where 6 is the maximum strength.
+ * \param[out]  pred            Pointer to the well-built predictors
+ * \param[out]  accum           Pointer to the pixel-wise accumulator for
+ *                              filtering
+ * \param[out]  count           Pointer to the pixel-wise counter for
+ *                              filtering
+ *
+ * \return Nothing returned, But the contents of `accum`, `pred` and 'count'
+ *         will be modified
+ */
+void av1_apply_temporal_filter_c(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
   const int mb_pels = mb_height * mb_width;
   const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
   const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+  // Frame information.
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Decay factors for non-local mean approach.
+  double decay_factor[MAX_MB_PLANE] = { 0 };
+  // Smaller q -> smaller filtering weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  for (int plane = 0; plane < num_planes; plane++) {
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
+  }
+  double d_factor[4] = { 0 };
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
 
-  // Allocate memory for pixel-wise squared differences for all planes. They,
+  // Allocate memory for pixel-wise squared differences. They,
   // regardless of the subsampling, are assigned with memory of size `mb_pels`.
-  uint32_t *square_diff =
-      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
-  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
+  uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
+  memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
 
+  // Allocate memory for accumulated luma squared error. This value will be
+  // consumed while filtering the chroma planes.
+  uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t));
+  memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0]));
+
+  // Get window size for pixel-wise filtering.
+  assert(TF_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_WINDOW_LENGTH >> 1;
+
+  // Handle planes in sequence.
   int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
     // Locate pixel on reference frame.
-    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
-    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
-    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
-    const uint8_t *ref = frame_to_filter->buffers[plane];
-    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
-                        plane_w, plane_h, plane_w, is_high_bitdepth,
-                        square_diff + plane_offset);
-    plane_offset += mb_pels;
-  }
-
-  // Get window size for pixel-wise filtering.
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH % 2 == 1);
-  const int half_window = TF_PLANEWISE_FILTER_WINDOW_LENGTH >> 1;
-
-  // Hyper-parameter for filter weight adjustment.
-  const int frame_height = frame_to_filter->heights[0]
-                           << mbd->plane[0].subsampling_y;
-  const int decay_control = frame_height >= 720 ? 4 : 3;
-
-  // Handle planes in sequence.
-  plane_offset = 0;
-  for (int plane = 0; plane < num_planes; ++plane) {
     const int subsampling_y = mbd->plane[plane].subsampling_y;
     const int subsampling_x = mbd->plane[plane].subsampling_x;
     const int h = mb_height >> subsampling_y;  // Plane height.
     const int w = mb_width >> subsampling_x;   // Plane width.
+    const int frame_stride =
+        frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+    const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+    const uint8_t *ref = frame_to_filter->buffers[plane];
+    const int ss_y_shift =
+        subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int ss_x_shift =
+        subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane will
+    // be more accurate. The luma sse sum is reused in both chroma planes.
+    if (plane == AOM_PLANE_U)
+      compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift,
+                                ss_y_shift);
+    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w,
+                        h, w, is_high_bitdepth, square_diff);
 
     // Perform filtering.
     int pred_idx = 0;
@@ -708,166 +632,86 @@
       for (int j = 0; j < w; ++j) {
         // non-local mean approach
         uint64_t sum_square_diff = 0;
-        int num_ref_pixels = 0;
 
         for (int wi = -half_window; wi <= half_window; ++wi) {
           for (int wj = -half_window; wj <= half_window; ++wj) {
             const int y = CLIP(i + wi, 0, h - 1);  // Y-coord on current plane.
             const int x = CLIP(j + wj, 0, w - 1);  // X-coord on current plane.
-            sum_square_diff += square_diff[plane_offset + y * w + x];
-            ++num_ref_pixels;
+            sum_square_diff += square_diff[y * w + x];
           }
         }
 
-        // Filter U-plane and V-plane using Y-plane. This is because motion
-        // search is only done on Y-plane, so the information from Y-plane will
-        // be more accurate.
-        if (plane != 0) {
-          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
-          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              const int ww = w << ss_x_shift;         // Width of Y-plane.
-              sum_square_diff += square_diff[yy * ww + xx];
-              ++num_ref_pixels;
-            }
-          }
-        }
+        sum_square_diff += luma_sse_sum[i * w + j];
 
         // Scale down the difference for high bit depth input.
-        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
-        const double window_error = (double)(sum_square_diff) / num_ref_pixels;
-        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
-        const double block_error =
-            (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+        if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2);
 
-        // Control factor for non-local mean approach.
-        const double r =
-            (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
-        const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
+        // Combine window error and block error, and normalize it.
+        const double window_error = sum_square_diff * inv_num_ref_pixels;
+        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
 
         // Compute filter weight.
-        const double scaled_diff =
-            AOMMAX(-(window_error + block_error / 10) / (2 * r * r * q), -15.0);
-        const int adjusted_weight =
-            (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor[plane];
+        scaled_error = AOMMIN(scaled_error, 7);
+        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
         const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        accum[idx] += adjusted_weight * pred_value;
-        count[idx] += adjusted_weight;
+        accum[idx] += weight * pred_value;
+        count[idx] += weight;
 
         ++pred_idx;
       }
     }
-    plane_offset += mb_pels;
+    plane_offset += h * w;
   }
 
   aom_free(square_diff);
+  aom_free(luma_sse_sum);
 }
-
-// Computes temporal filter weights and accumulators from all reference frames
-// excluding the current frame to be filtered.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all planes and the bit-depth.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   strength: Strength for filter weight adjustment. (Used in YUV strategy)
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//                 (Used in YUV strategy)
-//   subblock_filter_weights: The filter weights for each sub-block (row-major
-//                            order). If `use_subblock` is set as 0, the first
-//                            weight will be applied to the entire block. (Used
-//                            in YUV strategy)
-//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
-//                 with each plane (in Y, U, V order). (Used in plane-wise
-//                 strategy)
-//   block_mse: Motion search error (MSE) for the entire block.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-//   q_factor: Quantization factor.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_others(
+#if CONFIG_AV1_HIGHBITDEPTH
+// Calls High bit-depth temporal filter
+void av1_highbd_apply_temporal_filter_c(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const double *noise_levels,
-    const int block_mse, const int *subblock_mses, const int q_factor,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  if (TF_ENABLE_PLANEWISE_STRATEGY) {
-    // TODO(any): avx2 and sse2 version should be changed to align with C
-    // function before using.
-    if (is_frame_high_bitdepth(frame_to_filter) || block_size != BLOCK_32X32) {
-      av1_apply_temporal_filter_planewise_c(
-          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
-          accum, count);
-    } else {
-      av1_apply_temporal_filter_planewise(
-          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
-          accum, count);
-    }
-  } else {  // Commonly used for low-resolution video.
-    if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
-        subblock_filter_weights[2] == 0 && subblock_filter_weights[3] == 0) {
-      return;
-    }
-    const int adj_strength = strength + 2 * (mbd->bd - 8);
-    if (num_planes == 3 && TF_YUV_FILTER_WEIGHT_SCALE == 3 &&
-        block_size != BLOCK_32X32) {
-      av1_apply_temporal_filter_yuv(frame_to_filter, mbd, block_size, mb_row,
-                                    mb_col, num_planes, adj_strength,
-                                    use_subblock, subblock_filter_weights, pred,
-                                    accum, count);
-    } else {
-      // TODO(any): sse4 version should be changed to align with C function
-      // before using.
-      av1_apply_temporal_filter_yuv_c(frame_to_filter, mbd, block_size, mb_row,
-                                      mb_col, num_planes, adj_strength,
-                                      use_subblock, subblock_filter_weights,
-                                      pred, accum, count);
-    }
-  }
+  av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
+                              num_planes, noise_levels, subblock_mvs,
+                              subblock_mses, q_factor, filter_strength, pred,
+                              accum, count);
 }
-
-// Normalizes the accumulated filtering result to produce the filtered frame.
-// Inputs:
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all planes.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   accum: Pointer to the pre-computed accumulator.
-//   count: Pointer to the pre-computed count.
-//   result_buffer: Pointer to result buffer.
-// Returns:
-//   Nothing will be returned. But the content to which `result_buffer` point
-//   will be modified.
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+/*!\brief Normalizes the accumulated filtering result to produce the filtered
+ *        frame
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   mbd            Pointer to the block for filtering, which is
+ *                             ONLY used to get subsampling information for
+ *                             all the planes
+ * \param[in]   block_size     Size of the block
+ * \param[in]   mb_row         Row index of the block in the frame
+ * \param[in]   mb_col         Column index of the block in the frame
+ * \param[in]   num_planes     Number of planes in the frame
+ * \param[in]   accum          Pointer to the pre-computed accumulator
+ * \param[in]   count          Pointer to the pre-computed count
+ * \param[out]  result_buffer  Pointer to result buffer
+ *
+ * \return Nothing returned, but the content to which `result_buffer` pointer
+ *         will be modified
+ */
 static void tf_normalize_filtered_frame(
     const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
     const int mb_col, const int num_planes, const uint32_t *accum,
     const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
   const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
 
   int plane_offset = 0;
@@ -896,186 +740,327 @@
       }
       frame_idx += (frame_stride - plane_w);
     }
-    plane_offset += mb_pels;
+    plane_offset += plane_h * plane_w;
   }
 }
 
-// Helper function to compute number of blocks on either side of the frame.
-static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
-  return (frame_length + mb_length - 1) / mb_length;
+int av1_get_q(const AV1_COMP *cpi) {
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  const FRAME_TYPE frame_type = gf_group->frame_type[gf_group->index];
+  const int q = (int)av1_convert_qindex_to_q(
+      cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
+  return q;
 }
 
-typedef struct {
-  int64_t sum;
-  int64_t sse;
-} FRAME_DIFF;
-
-// Does temporal filter for a particular frame.
-// Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   frames: Frame buffers used for temporal filtering.
-//   num_frames: Number of frames in the frame buffer.
-//   filter_frame_idx: Index of the frame to be filtered.
-//   is_key_frame: Whether the to-filter is a key frame.
-//   is_second_arf: Whether the to-filter frame is the second ARF. This field
-//                  is ONLY used for assigning filter weight.
-//   block_size: Block size used for temporal filtering.
-//   scale: Scaling factor.
-//   strength: Pre-estimated strength for filter weight adjustment.
-//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
-//                 with each plane (in Y, U, V order).
-// Returns:
-//   Difference between filtered frame and the original frame.
-static FRAME_DIFF tf_do_filtering(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, const int num_frames,
-    const int filter_frame_idx, const int is_key_frame, const int is_second_arf,
-    const BLOCK_SIZE block_size, const struct scale_factors *scale,
-    const int strength, const double *noise_levels) {
-  // Basic information.
+void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  const int num_frames = tf_ctx->num_frames;
+  const int filter_frame_idx = tf_ctx->filter_frame_idx;
+  const int check_show_existing = tf_ctx->check_show_existing;
+  const struct scale_factors *scale = &tf_ctx->sf;
+  const double *noise_levels = tf_ctx->noise_levels;
+  const int num_pels = tf_ctx->num_pels;
+  const int q_factor = tf_ctx->q_factor;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
   const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
-  const int frame_height = frame_to_filter->y_crop_height;
-  const int frame_width = frame_to_filter->y_crop_width;
+  MACROBLOCK *const mb = &td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  TemporalFilterData *const tf_data = &td->tf_data;
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  const int mb_rows = get_num_blocks(frame_height, mb_height);
-  const int mb_cols = get_num_blocks(frame_width, mb_width);
-  const int num_planes = av1_num_planes(&cpi->common);
   const int mi_h = mi_size_high_log2[block_size];
   const int mi_w = mi_size_wide_log2[block_size];
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const int num_planes = av1_num_planes(&cpi->common);
+  uint32_t *accum = tf_data->accum;
+  uint16_t *count = tf_data->count;
+  uint8_t *pred = tf_data->pred;
 
-  // Save input state.
-  MACROBLOCK *const mb = &cpi->td.mb;
-  MACROBLOCKD *const mbd = &mb->e_mbd;
-  uint8_t *input_buffer[MAX_MB_PLANE];
-  for (int i = 0; i < num_planes; i++) {
-    input_buffer[i] = mbd->plane[i].pre[0].buf;
-  }
-  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
-
-  // Setup.
-  mbd->block_ref_scale_factors[0] = scale;
-  mbd->block_ref_scale_factors[1] = scale;
-  // A temporary block info used to store state in temporal filtering process.
-  MB_MODE_INFO *tmp_mb_mode_info = (MB_MODE_INFO *)malloc(sizeof(MB_MODE_INFO));
-  memset(tmp_mb_mode_info, 0, sizeof(MB_MODE_INFO));
-  mbd->mi = &tmp_mb_mode_info;
-  mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
-  // Allocate memory for predictor, accumulator and count.
-  uint8_t *pred8 = aom_memalign(32, num_planes * mb_pels * sizeof(uint8_t));
-  uint16_t *pred16 = aom_memalign(32, num_planes * mb_pels * sizeof(uint16_t));
-  uint32_t *accum = aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
-  uint16_t *count = aom_memalign(16, num_planes * mb_pels * sizeof(uint16_t));
-  memset(pred8, 0, num_planes * mb_pels * sizeof(pred8[0]));
-  memset(pred16, 0, num_planes * mb_pels * sizeof(pred16[0]));
-  uint8_t *const pred = is_high_bitdepth ? CONVERT_TO_BYTEPTR(pred16) : pred8;
+  // Factor to control the filering strength.
+  const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength;
 
   // Do filtering.
-  FRAME_DIFF diff = { 0, 0 };
-  // Perform temporal filtering block by block.
-  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
-    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
-                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+  FRAME_DIFF *diff = &td->tf_data.diff;
+  av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                        (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                        cpi->oxcf.border_in_pixels);
+  for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) {
+    av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
                           cpi->oxcf.border_in_pixels);
-    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
-      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
-                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
-                            cpi->oxcf.border_in_pixels);
-      memset(accum, 0, num_planes * mb_pels * sizeof(accum[0]));
-      memset(count, 0, num_planes * mb_pels * sizeof(count[0]));
-      MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
-      // Perform temporal filtering frame by frame.
-      for (int frame = 0; frame < num_frames; frame++) {
-        if (frames[frame] == NULL) continue;
+    memset(accum, 0, num_pels * sizeof(accum[0]));
+    memset(count, 0, num_pels * sizeof(count[0]));
+    MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
+                          // Perform temporal filtering frame by frame.
+    for (int frame = 0; frame < num_frames; frame++) {
+      if (frames[frame] == NULL) continue;
 
-        // Motion search.
-        MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
-        int subblock_filter_weights[4] = { 0, 0, 0, 0 };
-        int block_mse = INT_MAX;
-        int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
-
-        if (frame == filter_frame_idx) {  // Frame to be filtered.
-          // Set motion vector as 0 for the frame to be filtered.
-          mbd->mi[0]->mv[0].as_mv = kZeroMv;
-          // Change ref_mv sign for following frames.
-          ref_mv.row *= -1;
-          ref_mv.col *= -1;
-        } else {  // Other reference frames.
-          block_mse = tf_motion_search(cpi, frame_to_filter, frames[frame],
-                                       block_size, mb_row, mb_col, &ref_mv,
-                                       subblock_mvs, subblock_mses);
-          // Do not pass down the reference motion vector if error is too large.
-          const int thresh = AOMMIN(frame_height, frame_width) >= 720 ? 12 : 3;
-          if (block_mse > (thresh << (mbd->bd - 8))) {
-            ref_mv = kZeroMv;
-          }
-        }
-
-        // Build predictor.
-        int use_subblock = tf_get_filter_weight(
-            block_mse, subblock_mses, is_second_arf, subblock_filter_weights);
-        tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
-                           num_planes, scale, use_subblock, subblock_mvs, pred);
-
-        // Perform weighted averaging.
-        if (frame == filter_frame_idx) {  // Frame to be filtered.
-          av1_apply_temporal_filter_self(mbd, block_size, num_planes,
-                                         subblock_filter_weights[0], pred,
-                                         accum, count);
-        } else {  // Other reference frames.
-          const FRAME_TYPE frame_type =
-              (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME
-                                                           : KEY_FRAME;
-          const int q_factor =
-              (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[frame_type],
-                                           cpi->common.seq_params.bit_depth);
-          av1_apply_temporal_filter_others(
-              frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-              strength, use_subblock, subblock_filter_weights, noise_levels,
-              block_mse, subblock_mses, q_factor, pred, accum, count);
-        }
+      // Motion search.
+      MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+      int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+      if (frame ==
+          filter_frame_idx) {  // Frame to be filtered.
+                               // Change ref_mv sign for following frames.
+        ref_mv.row *= -1;
+        ref_mv.col *= -1;
+      } else {  // Other reference frames.
+        tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
+                         mb_row, mb_col, &ref_mv, subblock_mvs, subblock_mses);
       }
 
-      tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
-                                  accum, count, &cpi->alt_ref_buffer);
+      // Perform weighted averaging.
+      if (frame == filter_frame_idx) {  // Frame to be filtered.
+        tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row,
+                                      mb_col, num_planes, accum, count);
+      } else {  // Other reference frames.
+        tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
+                           num_planes, scale, subblock_mvs, pred);
 
-      if (!is_key_frame && cpi->sf.hl_sf.adaptive_overlay_encoding) {
-        const int y_height = mb_height >> mbd->plane[0].subsampling_y;
-        const int y_width = mb_width >> mbd->plane[0].subsampling_x;
-        const int source_y_stride = frame_to_filter->y_stride;
-        const int filter_y_stride = cpi->alt_ref_buffer.y_stride;
-        const int source_offset =
-            mb_row * y_height * source_y_stride + mb_col * y_width;
-        const int filter_offset =
-            mb_row * y_height * filter_y_stride + mb_col * y_width;
-        unsigned int sse = 0;
-        cpi->fn_ptr[block_size].vf(frame_to_filter->y_buffer + source_offset,
-                                   source_y_stride,
-                                   cpi->alt_ref_buffer.y_buffer + filter_offset,
-                                   filter_y_stride, &sse);
-        diff.sum += sse;
-        diff.sse += sse * sse;
+        // All variants of av1_apply_temporal_filter() contain floating point
+        // operations. Hence, clear the system state.
+        aom_clear_system_state();
+
+        // TODO(any): avx2/sse2 version should be changed to align with C
+        // function before using. In particular, current avx2/sse2 function
+        // only supports 32x32 block size and 5x5 filtering window.
+        if (is_frame_high_bitdepth(frame_to_filter)) {  // for high bit-depth
+#if CONFIG_AV1_HIGHBITDEPTH
+          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+            av1_highbd_apply_temporal_filter(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mvs, subblock_mses, q_factor,
+                filter_strength, pred, accum, count);
+          } else {
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+            av1_apply_temporal_filter_c(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mvs, subblock_mses, q_factor,
+                filter_strength, pred, accum, count);
+#if CONFIG_AV1_HIGHBITDEPTH
+          }
+#endif            // CONFIG_AV1_HIGHBITDEPTH
+        } else {  // for 8-bit
+          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+            av1_apply_temporal_filter(frame_to_filter, mbd, block_size, mb_row,
+                                      mb_col, num_planes, noise_levels,
+                                      subblock_mvs, subblock_mses, q_factor,
+                                      filter_strength, pred, accum, count);
+          } else {
+            av1_apply_temporal_filter_c(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mvs, subblock_mses, q_factor,
+                filter_strength, pred, accum, count);
+          }
+        }
+      }
+    }
+    tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
+                                accum, count, &cpi->alt_ref_buffer);
+
+    if (check_show_existing) {
+      const int y_height = mb_height >> mbd->plane[0].subsampling_y;
+      const int y_width = mb_width >> mbd->plane[0].subsampling_x;
+      const int source_y_stride = frame_to_filter->y_stride;
+      const int filter_y_stride = cpi->alt_ref_buffer.y_stride;
+      const int source_offset =
+          mb_row * y_height * source_y_stride + mb_col * y_width;
+      const int filter_offset =
+          mb_row * y_height * filter_y_stride + mb_col * y_width;
+      unsigned int sse = 0;
+      cpi->fn_ptr[block_size].vf(
+          frame_to_filter->y_buffer + source_offset, source_y_stride,
+          cpi->alt_ref_buffer.y_buffer + filter_offset, filter_y_stride, &sse);
+      diff->sum += sse;
+      diff->sse += sse * sse;
+    }
+  }
+}
+
+/*!\brief Does temporal filter for a given frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi                   Top level encoder instance structure
+ *
+ * \return Nothing will be returned, but the contents of td->diff will be
+ modified.
+ */
+static void tf_do_filtering(AV1_COMP *cpi) {
+  // Basic information.
+  ThreadData *td = &cpi->td;
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  const struct scale_factors *scale = &tf_ctx->sf;
+  const int num_planes = av1_num_planes(&cpi->common);
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  MACROBLOCKD *mbd = &td->mb.e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  MB_MODE_INFO **input_mb_mode_info;
+  tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+  tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+  // Perform temporal filtering for each row.
+  for (int mb_row = 0; mb_row < tf_ctx->mb_rows; mb_row++)
+    av1_tf_do_filtering_row(cpi, td, mb_row);
+
+  tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+}
+
+/*!\brief Setups the frame buffer for temporal filtering. This fuction
+ * determines how many frames will be used for temporal filtering and then
+ * groups them into a buffer. This function will also estimate the noise level
+ * of the to-filter frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi             Top level encoder instance structure
+ * \param[in]   filter_frame_lookahead_idx  The index of the to-filter frame
+ *                              in the lookahead buffer cpi->lookahead
+ * \param[in]   is_second_arf   Whether the to-filter frame is the second ARF.
+ *                              This field will affect the number of frames
+ *                              used for filtering.
+ *
+ * \return Nothing will be returned. But the fields `frames`, `num_frames`,
+ *         `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
+ */
+static void tf_setup_filtering_buffer(AV1_COMP *cpi,
+                                      const int filter_frame_lookahead_idx,
+                                      const int is_second_arf) {
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable
+  // temporal filtering.
+  int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1);
+  int num_before = 0;  // Number of filtering frames before the to-filter frame.
+  int num_after = 0;   // Number of filtering frames after the to-filer frame.
+  const int lookahead_depth =
+      av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
+
+  // Temporal filtering should not go beyond key frames
+  const int key_to_curframe =
+      AOMMAX(cpi->rc.frames_since_key +
+                 cpi->gf_group.arf_src_offset[cpi->gf_group.index],
+             0);
+  const int curframe_to_key =
+      AOMMAX(cpi->rc.frames_to_key -
+                 cpi->gf_group.arf_src_offset[cpi->gf_group.index] - 1,
+             0);
+
+  // Number of buffered frames before the to-filter frame.
+  const int max_before =
+      AOMMIN(filter_frame_lookahead_idx < -1 ? -filter_frame_lookahead_idx + 1
+                                             : filter_frame_lookahead_idx + 1,
+             key_to_curframe);
+  // Number of buffered frames after the to-filter frame.
+  const int max_after = AOMMIN(lookahead_depth - max_before, curframe_to_key);
+
+  const int filter_frame_offset = filter_frame_lookahead_idx < -1
+                                      ? -filter_frame_lookahead_idx
+                                      : filter_frame_lookahead_idx;
+
+  // Estimate noises for each plane.
+  const struct lookahead_entry *to_filter_buf = av1_lookahead_peek(
+      cpi->lookahead, filter_frame_offset, cpi->compressor_stage);
+  assert(to_filter_buf != NULL);
+  const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
+  const int num_planes = av1_num_planes(&cpi->common);
+  double *noise_levels = tf_ctx->noise_levels;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    noise_levels[plane] = av1_estimate_noise_from_single_plane(
+        to_filter_frame, plane, cpi->common.seq_params.bit_depth);
+  }
+  // Get quantization factor.
+  const int q = av1_get_q(cpi);
+  // Get correlation estimates from first-pass
+  RATE_CONTROL *rc = &cpi->rc;
+  const double *coeff = rc->cor_coeff;
+  const int offset = rc->regions_offset;
+  int cur_frame_idx = filter_frame_offset + rc->frames_since_key - offset;
+  if (rc->frames_since_key == 0) cur_frame_idx++;
+
+  double accu_coeff0 = 1.0, accu_coeff1 = 1.0;
+  for (int i = 1; i <= max_after; i++) {
+    accu_coeff1 *= coeff[cur_frame_idx + i];
+  }
+  if (max_after >= 1) {
+    accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after);
+  }
+  for (int i = 1; i <= max_before; i++) {
+    accu_coeff0 *= coeff[cur_frame_idx - i + 1];
+  }
+  if (max_before >= 1) {
+    accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before);
+  }
+
+  // Adjust number of filtering frames based on quantization factor. When the
+  // quantization factor is small enough (lossless compression), we will not
+  // change the number of frames for key frame filtering, which is to avoid
+  // visual quality drop.
+  int adjust_num = 6;
+  if (num_frames == 1) {  // `arnr_max_frames = 1` is used to disable filtering.
+    adjust_num = 0;
+  } else if (filter_frame_lookahead_idx < 0 && q <= 10) {
+    adjust_num = 0;
+  }
+  num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth + 1);
+
+  if (filter_frame_lookahead_idx == -1 ||
+      filter_frame_lookahead_idx == 0) {  // Key frame.
+    num_before = 0;
+    num_after = AOMMIN(num_frames - 1, max_after);
+  } else if (filter_frame_lookahead_idx < -1) {  // Key frame in one-pass mode.
+    num_before = AOMMIN(num_frames - 1, max_before);
+    num_after = 0;
+  } else {
+    num_frames = AOMMIN(num_frames, cpi->rc.gfu_boost / 150);
+    num_frames += !(num_frames & 1);  // Make the number odd.
+    // Only use 2 neighbours for the second ARF.
+    if (is_second_arf) num_frames = AOMMIN(num_frames, 3);
+    if (AOMMIN(max_after, max_before) >= num_frames / 2) {
+      // just use half half
+      num_before = num_frames / 2;
+      num_after = num_frames / 2;
+    } else {
+      if (max_after < num_frames / 2) {
+        num_after = max_after;
+        num_before = AOMMIN(num_frames - 1 - num_after, max_before);
+      } else {
+        num_before = max_before;
+        num_after = AOMMIN(num_frames - 1 - num_before, max_after);
+      }
+      // Adjust insymmetry based on frame-level correlation
+      if (max_after > 0 && max_before > 0) {
+        if (num_after < num_before) {
+          const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01));
+          num_before = AOMMIN(num_before, num_after + insym);
+        } else {
+          const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01));
+          num_after = AOMMIN(num_after, num_before + insym);
+        }
       }
     }
   }
+  num_frames = num_before + 1 + num_after;
 
-  // Restore input state
-  for (int i = 0; i < num_planes; i++) {
-    mbd->plane[i].pre[0].buf = input_buffer[i];
+  // Setup the frame buffer.
+  for (int frame = 0; frame < num_frames; ++frame) {
+    const int lookahead_idx = frame - num_before + filter_frame_offset;
+    struct lookahead_entry *buf = av1_lookahead_peek(
+        cpi->lookahead, lookahead_idx, cpi->compressor_stage);
+    assert(buf != NULL);
+    frames[frame] = &buf->img;
   }
-  mbd->mi = input_mb_mode_info;
+  tf_ctx->num_frames = num_frames;
+  tf_ctx->filter_frame_idx = num_before;
+  assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);
 
-  free(tmp_mb_mode_info);
-  aom_free(pred8);
-  aom_free(pred16);
-  aom_free(accum);
-  aom_free(count);
-
-  return diff;
+  av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
+                       cpi->common.seq_params.sb_size);
+  av1_setup_block_planes(&cpi->td.mb.e_mbd,
+                         cpi->common.seq_params.subsampling_x,
+                         cpi->common.seq_params.subsampling_y, num_planes);
 }
 
+/*!\cond */
+
 // A constant number, sqrt(pi / 2),  used for noise estimation.
 static const double SQRT_PI_BY_2 = 1.25331413732;
 
@@ -1124,112 +1109,74 @@
   return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
 }
 
-// Estimates the strength for filter weight adjustment, which is used in YUV
-// strategy. This estimation is based on the pre-estimated noise level of the
-// to-filter frame.
+// Initializes the members of TemporalFilterCtx
 // Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
-//   group_boost: Boost level for the current group of frames.
+//   cpi: Top level encoder instance structure
+//   filter_frame_lookahead_idx: The index of the frame to be filtered in the
+//                               lookahead buffer cpi->lookahead.
+//   is_second_arf: Flag indiacting whether second ARF filtering is required.
 // Returns:
-//   Estimated strength which will be used for filter weight adjustment.
-static int tf_estimate_strength(const AV1_COMP *cpi, const double noise_level,
-                                const int group_boost) {
-  int strength = cpi->oxcf.arnr_strength;
+//   Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
+static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
+                        int is_second_arf) {
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  // Setup frame buffer for filtering.
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  tf_ctx->num_frames = 0;
+  tf_ctx->filter_frame_idx = -1;
+  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf);
+  assert(tf_ctx->num_frames > 0);
+  assert(tf_ctx->filter_frame_idx < tf_ctx->num_frames);
 
-  // Adjust the strength based on the estimated noise level.
-  if (noise_level > 0) {       // Adjust when the noise level is reliable.
-    if (noise_level < 0.75) {  // Noise level lies in range (0, 0.75).
-      strength = strength - 2;
-    } else if (noise_level < 1.75) {  // Noise level lies in range [0.75, 1.75).
-      strength = strength - 1;
-    } else if (noise_level < 4.0) {  // Noise level lies in range [1.75, 4.0).
-      strength = strength + 0;
-    } else {  // Noise level lies in range [4.0, +inf).
-      strength = strength + 1;
-    }
+  // Check show existing condition for non-keyframes. For KFs, only check when
+  // KF overlay is enabled.
+  tf_ctx->check_show_existing = !(filter_frame_lookahead_idx <= 0) ||
+                                cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1;
+
+  // Setup scaling factors. Scaling on each of the arnr frames is not
+  // supported.
+  // ARF is produced at the native frame size and resized when coded.
+  struct scale_factors *sf = &tf_ctx->sf;
+  av1_setup_scale_factors_for_frame(
+      sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+      frames[0]->y_crop_width, frames[0]->y_crop_height);
+
+  // Initialize temporal filter parameters.
+  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+  const int filter_frame_idx = tf_ctx->filter_frame_idx;
+  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int mb_width = block_size_wide[block_size];
+  const int mb_height = block_size_high[block_size];
+  const int mb_rows = get_num_blocks(frame_height, mb_height);
+  const int mb_cols = get_num_blocks(frame_width, mb_width);
+  const int mb_pels = mb_width * mb_height;
+  const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const int num_planes = av1_num_planes(&cpi->common);
+  int num_pels = 0;
+  for (int i = 0; i < num_planes; i++) {
+    const int subsampling_x = mbd->plane[i].subsampling_x;
+    const int subsampling_y = mbd->plane[i].subsampling_y;
+    num_pels += mb_pels >> (subsampling_x + subsampling_y);
   }
-
-  // Adjust the strength based on active max q.
-  const FRAME_TYPE frame_type =
-      (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME : KEY_FRAME;
-  const int q = (int)av1_convert_qindex_to_q(
-      cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
-  strength = strength - AOMMAX(0, (16 - q) / 2);
-
-  return CLIP(strength, 0, group_boost / 300);
-}
-
-// Setups the frame buffer for temporal filtering. Basically, this fuction
-// determines how many frames will be used for temporal filtering and then
-// groups them into a buffer.
-// Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   filter_frame_lookahead_idx: The index of the to-filter frame in the
-//                               lookahead buffer `cpi->lookahead`.
-//   is_second_arf: Whether the to-filter frame is the second ARF. This field
-//                  will affect the number of frames used for filtering.
-//   frames: Pointer to the frame buffer to setup.
-//   num_frames_for_filtering: Number of frames used for filtering.
-//   filter_frame_idx: Index of the to-filter frame in the setup frame buffer.
-// Returns:
-//   Nothing will be returned. But the frame buffer `frames`, number of frames
-//   in the buffer `num_frames_for_filtering`, and the index of the to-filter
-//   frame in the buffer `filter_frame_idx` will be updated in this function.
-static void tf_setup_filtering_buffer(const AV1_COMP *cpi,
-                                      const int filter_frame_lookahead_idx,
-                                      const int is_second_arf,
-                                      YV12_BUFFER_CONFIG **frames,
-                                      int *num_frames_for_filtering,
-                                      int *filter_frame_idx) {
-  int num_frames = 0;          // Number of frames used for filtering.
-  int num_frames_before = -1;  // Number of frames before the to-filter frame.
-  int filter_frame_offset;
-
-  if (filter_frame_lookahead_idx == -1) {  // Key frame.
-    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
-    num_frames_before = 0;
-    filter_frame_offset = filter_frame_lookahead_idx;
-  } else if (filter_frame_lookahead_idx < -1) {  // Key frame in one-pass mode.
-    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
-    num_frames_before = num_frames - 1;
-    filter_frame_offset = -filter_frame_lookahead_idx;
-  } else {
-    num_frames = cpi->oxcf.arnr_max_frames;
-    if (is_second_arf) {  // Only use 2 neighbours for the second ARF.
-      num_frames = AOMMIN(num_frames, 3);
-    }
-    if (num_frames > cpi->rc.gfu_boost / 150) {
-      num_frames = cpi->rc.gfu_boost / 150;
-      num_frames += !(num_frames & 1);
-    }
-    num_frames_before = AOMMIN(num_frames >> 1, filter_frame_lookahead_idx + 1);
-    const int lookahead_depth =
-        av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
-    const int num_frames_after =
-        AOMMIN((num_frames - 1) >> 1,
-               lookahead_depth - filter_frame_lookahead_idx - 1);
-    num_frames = num_frames_before + 1 + num_frames_after;
-    filter_frame_offset = filter_frame_lookahead_idx;
-  }
-  *num_frames_for_filtering = num_frames;
-  *filter_frame_idx = num_frames_before;
-
-  // Setup the frame buffer.
-  for (int frame = 0; frame < num_frames; ++frame) {
-    const int lookahead_idx = frame - num_frames_before + filter_frame_offset;
-    struct lookahead_entry *buf = av1_lookahead_peek(
-        cpi->lookahead, lookahead_idx, cpi->compressor_stage);
-    frames[frame] = (buf == NULL) ? NULL : &buf->img;
-  }
+  tf_ctx->num_pels = num_pels;
+  tf_ctx->mb_rows = mb_rows;
+  tf_ctx->mb_cols = mb_cols;
+  tf_ctx->is_highbitdepth = is_highbitdepth;
+  tf_ctx->q_factor = av1_get_q(cpi);
 }
 
 int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
                         int *show_existing_arf) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
   // Basic informaton of the current frame.
   const GF_GROUP *const gf_group = &cpi->gf_group;
   const uint8_t group_idx = gf_group->index;
   const FRAME_UPDATE_TYPE update_type = gf_group->update_type[group_idx];
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  TemporalFilterData *tf_data = &cpi->td.tf_data;
   // Filter one more ARF if the lookahead index is leq 7 (w.r.t. 9-th frame).
   // This frame is ALWAYS a show existing frame.
   const int is_second_arf = (update_type == INTNL_ARF_UPDATE) &&
@@ -1244,60 +1191,43 @@
 
   // TODO(yunqing): For INTNL_ARF_UPDATE type, the following me initialization
   // is used somewhere unexpectedly. Should be resolved later.
-  // Initialize errorperbit, sadperbit16 and sadperbit4.
+  // Initialize errorperbit and sadperbit
   const int rdmult = av1_compute_rd_mult_based_on_qindex(cpi, TF_QINDEX);
-  set_error_per_bit(&cpi->td.mb, rdmult);
-  av1_initialize_me_consts(cpi, &cpi->td.mb, TF_QINDEX);
+  MvCosts *mv_costs = &cpi->td.mb.mv_costs;
+  av1_set_error_per_bit(mv_costs, rdmult);
+  av1_set_sad_per_bit(cpi, mv_costs, TF_QINDEX);
   av1_fill_mv_costs(cpi->common.fc,
                     cpi->common.features.cur_frame_force_integer_mv,
-                    cpi->common.features.allow_high_precision_mv, &cpi->td.mb);
+                    cpi->common.features.allow_high_precision_mv, mv_costs);
 
-  // Setup frame buffer for filtering.
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
-  int num_frames_for_filtering = 0;
-  int filter_frame_idx = -1;
-  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
-                            frames, &num_frames_for_filtering,
-                            &filter_frame_idx);
+  // Initialize temporal filter context structure.
+  init_tf_ctx(cpi, filter_frame_lookahead_idx, is_second_arf);
 
-  // Estimate noise and strength.
-  const int bit_depth = cpi->common.seq_params.bit_depth;
-  const int num_planes = av1_num_planes(&cpi->common);
-  double noise_levels[MAX_MB_PLANE] = { 0 };
-  for (int plane = 0; plane < num_planes; ++plane) {
-    noise_levels[plane] = av1_estimate_noise_from_single_plane(
-        frames[filter_frame_idx], plane, bit_depth);
-  }
-  const int strength =
-      tf_estimate_strength(cpi, noise_levels[0], cpi->rc.gfu_boost);
+  // Set showable frame.
   if (filter_frame_lookahead_idx >= 0) {
-    cpi->common.showable_frame =
-        (strength == 0 && num_frames_for_filtering == 1) || is_second_arf ||
-        (cpi->oxcf.enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames);
+    cpi->common.showable_frame = tf_ctx->num_frames == 1 || is_second_arf ||
+                                 (cpi->oxcf.algo_cfg.enable_overlay == 0);
   }
 
-  // Do filtering.
-  const int is_key_frame = (filter_frame_lookahead_idx < 0);
-  FRAME_DIFF diff = { 0, 0 };
-  if (num_frames_for_filtering > 0 && frames[0] != NULL) {
-    // Setup scaling factors. Scaling on each of the arnr frames is not
-    // supported.
-    // ARF is produced at the native frame size and resized when coded.
-    struct scale_factors sf;
-    av1_setup_scale_factors_for_frame(
-        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
-        frames[0]->y_crop_width, frames[0]->y_crop_height);
-    diff = tf_do_filtering(cpi, frames, num_frames_for_filtering,
-                           filter_frame_idx, is_key_frame, is_second_arf,
-                           TF_BLOCK_SIZE, &sf, strength, noise_levels);
-  }
+  // Allocate and reset temporal filter buffers.
+  const int is_highbitdepth = tf_ctx->is_highbitdepth;
+  tf_alloc_and_reset_data(tf_data, tf_ctx->num_pels, is_highbitdepth);
 
-  if (is_key_frame) {  // Key frame should always be filtered.
-    return 1;
-  }
+  // Perform temporal filtering process.
+  if (mt_info->num_workers > 1)
+    av1_tf_do_filtering_mt(cpi);
+  else
+    tf_do_filtering(cpi);
 
-  if ((show_existing_arf != NULL && cpi->sf.hl_sf.adaptive_overlay_encoding) ||
-      is_second_arf) {
+  // Deallocate temporal filter buffers.
+  tf_dealloc_data(tf_data, is_highbitdepth);
+
+  if (!tf_ctx->check_show_existing) return 1;
+
+  if (show_existing_arf != NULL || is_second_arf) {
+    YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+    const FRAME_DIFF *diff = &tf_data->diff;
+    const int filter_frame_idx = tf_ctx->filter_frame_idx;
     const int frame_height = frames[filter_frame_idx]->y_crop_height;
     const int frame_width = frames[filter_frame_idx]->y_crop_width;
     const int block_height = block_size_high[TF_BLOCK_SIZE];
@@ -1305,8 +1235,8 @@
     const int mb_rows = get_num_blocks(frame_height, block_height);
     const int mb_cols = get_num_blocks(frame_width, block_width);
     const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
-    const float mean = (float)diff.sum / num_mbs;
-    const float std = (float)sqrt((float)diff.sse / num_mbs - mean * mean);
+    const float mean = (float)diff->sum / num_mbs;
+    const float std = (float)sqrt((float)diff->sse / num_mbs - mean * mean);
 
     aom_clear_system_state();
     // TODO(yunqing): This can be combined with TPL q calculation later.
@@ -1314,10 +1244,10 @@
     av1_set_target_rate(cpi, cpi->common.width, cpi->common.height);
     int top_index = 0;
     int bottom_index = 0;
-    const int q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cpi->oxcf.width,
-                                           cpi->oxcf.height, group_idx,
-                                           &bottom_index, &top_index);
-    const int ac_q = av1_ac_quant_QTX(q, 0, bit_depth);
+    const int q = av1_rc_pick_q_and_bounds(
+        cpi, &cpi->rc, cpi->oxcf.frm_dim_cfg.width,
+        cpi->oxcf.frm_dim_cfg.height, group_idx, &bottom_index, &top_index);
+    const int ac_q = av1_ac_quant_QTX(q, 0, cpi->common.seq_params.bit_depth);
     const float threshold = 0.7f * ac_q * ac_q;
 
     if (!is_second_arf) {
@@ -1336,3 +1266,4 @@
 
   return 1;
 }
+/*!\endcond */
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 5a6bde2..461e445 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -15,7 +15,9 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-
+/*!\cond */
+struct AV1_COMP;
+struct ThreadData;
 // TODO(any): These two variables are only used in avx2, sse2, sse4
 // implementations, where the block size is still hard coded. This should be
 // fixed to align with the c implementation.
@@ -25,21 +27,132 @@
 // Block size used in temporal filtering.
 #define TF_BLOCK_SIZE BLOCK_32X32
 
-// Window size for YUV temporal filtering.
-// This is particually used for function `av1_apply_temporal_filter_yuv()`.
-#define TF_YUV_FILTER_WINDOW_LENGTH 3
-// A scale factor used in YUV temporal filtering for weight adjustment.
-#define TF_YUV_FILTER_WEIGHT_SCALE 3
+// Window size for temporal filtering.
+#define TF_WINDOW_LENGTH 5
 
-#define TF_ENABLE_PLANEWISE_STRATEGY 1
-// Window size for plane-wise temporal filtering.
-// This is particually used for function `av1_apply_temporal_filter_planewise()`
-#define TF_PLANEWISE_FILTER_WINDOW_LENGTH 5
-// A scale factor used in plane-wise temporal filtering to raise the filter
-// weight from `double` with range [0, 1] to `int` with range [0, 1000].
-#define TF_PLANEWISE_FILTER_WEIGHT_SCALE 1000
+// Hyper-parameters used to compute filtering weight. These hyper-parameters can
+// be tuned for a better performance.
+// 0. A scale factor used in temporal filtering to raise the filter weight from
+//    `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_WEIGHT_SCALE 1000
+// 1. Weight factor used to balance the weighted-average between window error
+//    and block error. The weight is for window error while the weight for block
+//    error is always set as 1.
+#define TF_WINDOW_BLOCK_BALANCE_WEIGHT 5
+// 2. Threshold for using q to adjust the filtering weight. Concretely, when
+//    using a small q (high bitrate), we would like to reduce the filtering
+//    strength such that more detailed information can be preserved. Hence, when
+//    q is smaller than this threshold, we will adjust the filtering weight
+//    based on the q-value.
+#define TF_Q_DECAY_THRESHOLD 20
+// 3. Normalization factor used to normalize the motion search error. Since the
+//    motion search error can be large and uncontrollable, we will simply
+//    normalize it before using it to compute the filtering weight.
+#define TF_SEARCH_ERROR_NORM_WEIGHT 20
+// 4. Threshold for using `arnr_strength` to adjust the filtering strength.
+//    Concretely, users can use `arnr_strength` arguments to control the
+//    strength of temporal filtering. When `arnr_strength` is small enough (
+//    i.e., smaller than this threshold), we will adjust the filtering weight
+//    based on the strength value.
+#define TF_STRENGTH_THRESHOLD 4
+// 5. Threshold for using motion search distance to adjust the filtering weight.
+//    Concretely, larger motion search vector leads to a higher probability of
+//    unreliable search. Hence, we would like to reduce the filtering strength
+//    when the distance is large enough. Considering that the distance actually
+//    relies on the frame size, this threshold is also a resolution-based
+//    threshold. Taking 720p videos as an instance, if this field equals to 0.1,
+//    then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold
+//    for 360p videos will be 360 * 0.1 = 36.
+#define TF_SEARCH_DISTANCE_THRESHOLD 0.1
 
 #define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to temporal filtering.
+ */
+typedef struct {
+  /*!
+   * Frame buffers used for temporal filtering.
+   */
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+  /*!
+   * Number of frames in the frame buffer.
+   */
+  int num_frames;
+  /*!
+   * Index of the frame to be filtered.
+   */
+  int filter_frame_idx;
+  /*!
+   * Whether to accumulate diff for show existing condition check.
+   */
+  int check_show_existing;
+  /*!
+   * Frame scaling factor.
+   */
+  struct scale_factors sf;
+  /*!
+   * Estimated noise levels for each plane in the frame.
+   */
+  double noise_levels[MAX_MB_PLANE];
+  /*!
+   * Number of pixels in the temporal filtering block across all planes.
+   */
+  int num_pels;
+  /*!
+   * Number of temporal filtering block rows.
+   */
+  int mb_rows;
+  /*!
+   * Number of temporal filtering block columns.
+   */
+  int mb_cols;
+  /*!
+   * Whether the frame is high-bitdepth or not.
+   */
+  int is_highbitdepth;
+  /*!
+   * Quantization factor used in temporal filtering.
+   */
+  int q_factor;
+} TemporalFilterCtx;
+
+/*!\cond */
+
+// Sum and SSE source vs filtered frame difference returned by
+// temporal filter.
+typedef struct {
+  int64_t sum;
+  int64_t sse;
+} FRAME_DIFF;
+
+// Data related to temporal filtering.
+typedef struct {
+  // Source vs filtered frame error.
+  FRAME_DIFF diff;
+  // Pointer to temporary block info used to store state in temporal filtering
+  // process.
+  MB_MODE_INFO *tmp_mbmi;
+  // Pointer to accumulator buffer used in temporal filtering process.
+  uint32_t *accum;
+  // Pointer to count buffer used in temporal filtering process.
+  uint16_t *count;
+  // Pointer to predictor used in temporal filtering process.
+  uint8_t *pred;
+} TemporalFilterData;
+
+// Data related to temporal filter multi-thread synchronization.
+typedef struct {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used for dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif  // CONFIG_MULTITHREAD
+  // Next temporal filter block row to be filtered.
+  int next_tf_row;
+} AV1TemporalFilterSync;
+
 // Estimates noise level from a given frame using a single plane (Y, U, or V).
 // This is an adaptation of the mehtod in the following paper:
 // Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
@@ -58,28 +171,149 @@
                                             const int bit_depth);
 
 #define TF_QINDEX 128  // Q-index used in temporal filtering.
-#define TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME 7
-// Performs temporal filtering if needed.
-// NOTE: In this function, the lookahead index is different from the 0-based
-// real index. For example, if we want to filter the first frame in the
-// pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
-// of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
-// second frame in the pre-fetched buffer. Another example: if we want to filter
-// the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16.
-// Futhermore, negative number is used for key frame in one-pass mode, where key
-// frame is filtered with the frames before it instead of after it. For example,
-// -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
-// Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   filter_frame_lookahead_idx: The index of the to-filter frame in the
-//                               lookahead buffer `cpi->lookahead`.
-//   show_existing_arf: Whether to show existing ARF. This field will be updated
-//                      in this function.
-// Returns:
-//   Whether temporal filtering is successfully done.
-int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+
+/*!\endcond */
+
+/*!\brief Does temporal filter for a given macroblock row.
+*
+* \ingroup src_frame_proc
+* \param[in]   cpi                   Top level encoder instance structure
+* \param[in]   td                    Pointer to thread data
+* \param[in]   mb_row                Macroblock row to be filtered
+filtering
+*
+* \return Nothing will be returned, but the contents of td->diff will be
+modified.
+*/
+void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td,
+                             int mb_row);
+
+/*!\brief Performs temporal filtering if needed on a source frame.
+ * For example to create a filtered alternate reference frame (ARF)
+ *
+ * In this function, the lookahead index is different from the 0-based
+ * real index. For example, if we want to filter the first frame in the
+ * pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+ * of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+ * second frame in the pre-fetched buffer. Another example: if we want to filter
+ * the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16.
+ * Futhermore, negative number is used for key frame in one-pass mode, where key
+ * frame is filtered with the frames before it instead of after it. For example,
+ * -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi                        Top level encoder instance structure
+ * \param[in]   filter_frame_lookahead_idx The index of the to-filter frame in
+ *                                         the lookahead buffer cpi->lookahead.
+ * \param[in,out]   show_existing_arf      Whether to show existing ARF. This
+ *                                         field is updated in this function.
+ *
+ * \return Whether temporal filtering is successfully done.
+ */
+int av1_temporal_filter(struct AV1_COMP *cpi,
+                        const int filter_frame_lookahead_idx,
                         int *show_existing_arf);
 
+/*!\cond */
+// Helper function to get `q` used for encoding.
+int av1_get_q(const struct AV1_COMP *cpi);
+
+// Allocates memory for members of TemporalFilterData.
+// Inputs:
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   num_pels: Number of pixels in the block across all planes.
+//   is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+//   Nothing will be returned. But the contents of tf_data will be modified.
+static AOM_INLINE void tf_alloc_and_reset_data(TemporalFilterData *tf_data,
+                                               int num_pels,
+                                               int is_high_bitdepth) {
+  tf_data->tmp_mbmi = (MB_MODE_INFO *)malloc(sizeof(*tf_data->tmp_mbmi));
+  memset(tf_data->tmp_mbmi, 0, sizeof(*tf_data->tmp_mbmi));
+  tf_data->accum =
+      (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum));
+  tf_data->count =
+      (uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count));
+  memset(&tf_data->diff, 0, sizeof(tf_data->diff));
+  if (is_high_bitdepth)
+    tf_data->pred = CONVERT_TO_BYTEPTR(
+        aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred)));
+  else
+    tf_data->pred =
+        (uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred));
+}
+
+// Setup macroblockd params for temporal filtering process.
+// Inputs:
+//   mbd: Pointer to the block for filtering.
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   scale: Scaling factor.
+// Returns:
+//   Nothing will be returned. Contents of mbd will be modified.
+static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd,
+                                            TemporalFilterData *tf_data,
+                                            const struct scale_factors *scale) {
+  mbd->block_ref_scale_factors[0] = scale;
+  mbd->block_ref_scale_factors[1] = scale;
+  mbd->mi = &tf_data->tmp_mbmi;
+  mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+}
+
+// Deallocates the memory allocated for members of TemporalFilterData.
+// Inputs:
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+//   Nothing will be returned.
+static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data,
+                                       int is_high_bitdepth) {
+  if (is_high_bitdepth)
+    tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred);
+  free(tf_data->tmp_mbmi);
+  aom_free(tf_data->accum);
+  aom_free(tf_data->count);
+  aom_free(tf_data->pred);
+}
+
+// Helper function to compute number of blocks on either side of the frame.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + mb_length - 1) / mb_length;
+}
+
+// Saves the state prior to temporal filter process.
+// Inputs:
+//   mbd: Pointer to the block for filtering.
+//   input_mbmi: Backup block info to save input state.
+//   input_buffer: Backup buffer pointer to save input state.
+//   num_planes: Number of planes.
+// Returns:
+//   Nothing will be returned. Contents of input_mbmi and input_buffer will be
+//   modified.
+static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi,
+                                 uint8_t **input_buffer, int num_planes) {
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  *input_mbmi = mbd->mi;
+}
+
+// Restores the initial state after temporal filter process.
+// Inputs:
+//   mbd: Pointer to the block for filtering.
+//   input_mbmi: Backup block info from where input state is restored.
+//   input_buffer: Backup buffer pointer from where input state is restored.
+//   num_planes: Number of planes.
+// Returns:
+//   Nothing will be returned. Contents of mbd will be modified.
+static INLINE void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi,
+                                    uint8_t **input_buffer, int num_planes) {
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mbmi;
+}
+
+/*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index e674153..43ac270 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -27,9 +27,9 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
-static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t,
                                  int plane, int calc_rate, int allow_update_cdf,
-                                 FRAME_COUNTS *counts, MapCdf map_pb_cdf) {
+                                 FRAME_COUNTS *counts) {
   const uint8_t *const color_map = param->color_map;
   MapCdf map_cdf = param->map_cdf;
   ColorCost color_cost = param->color_cost;
@@ -39,7 +39,6 @@
   const int n = param->n_colors;
   const int palette_size_idx = n - PALETTE_MIN_SIZE;
   int this_rate = 0;
-  uint8_t color_order[PALETTE_MAX_SIZE];
 
   (void)plane;
   (void)counts;
@@ -48,14 +47,14 @@
     for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
       int i = k - j;
       int color_new_idx;
-      const int color_ctx = av1_get_palette_color_index_context(
-          color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
+      const int color_ctx = av1_fast_palette_color_index_context(
+          color_map, plane_block_width, i, j, &color_new_idx);
       assert(color_new_idx >= 0 && color_new_idx < n);
       if (calc_rate) {
         this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx];
       } else {
         (*t)->token = color_new_idx;
-        (*t)->color_map_cdf = map_pb_cdf[palette_size_idx][color_ctx];
+        (*t)->color_ctx = color_ctx;
         ++(*t);
         if (allow_update_cdf)
           update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
@@ -83,13 +82,14 @@
   params->color_map = xd->plane[plane].color_index_map;
   params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
                           : xd->tile_ctx->palette_y_color_index_cdf;
-  params->color_cost =
-      plane ? &x->palette_uv_color_cost : &x->palette_y_color_cost;
+  params->color_cost = plane ? &x->mode_costs.palette_uv_color_cost
+                             : &x->mode_costs.palette_y_color_cost;
   params->n_colors = pmi->palette_size[plane];
   av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
                            &params->rows, &params->cols);
 }
 
+// TODO(any): Remove this function
 static void get_color_map_params(const MACROBLOCK *const x, int plane,
                                  BLOCK_SIZE bsize, TX_SIZE tx_size,
                                  COLOR_MAP_TYPE type,
@@ -107,14 +107,11 @@
   assert(plane == 0 || plane == 1);
   Av1ColorMapParam color_map_params;
   get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
-  MapCdf map_pb_cdf = plane ? x->tile_pb_ctx->palette_uv_color_index_cdf
-                            : x->tile_pb_ctx->palette_y_color_index_cdf;
-  return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL,
-                               map_pb_cdf);
+  return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL);
 }
 
 void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
-                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
                             COLOR_MAP_TYPE type, int allow_update_cdf,
                             FRAME_COUNTS *counts) {
   assert(plane == 0 || plane == 1);
@@ -122,12 +119,10 @@
   get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
   // The first color index does not use context or entropy.
   (*t)->token = color_map_params.color_map[0];
-  (*t)->color_map_cdf = NULL;
+  (*t)->color_ctx = -1;
   ++(*t);
-  MapCdf map_pb_cdf = plane ? x->tile_pb_ctx->palette_uv_color_index_cdf
-                            : x->tile_pb_ctx->palette_y_color_index_cdf;
   cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf,
-                        counts, map_pb_cdf);
+                        counts);
 }
 
 static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
@@ -143,14 +138,14 @@
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
 
   if (tx_size == plane_tx_size || plane) {
-    plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
-                                       pd->subsampling_y);
+    plane_bsize =
+        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
                                       plane_bsize, tx_size, arg);
 
@@ -194,7 +189,7 @@
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
 
-  if (mbmi->skip) {
+  if (mbmi->skip_txfm) {
     av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index 52caacb..d5e1dfd 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -21,9 +21,24 @@
 #endif
 
 typedef struct {
-  aom_cdf_prob *color_map_cdf;
+  int8_t color_ctx;
   uint8_t token;
-} TOKENEXTRA;
+} TokenExtra;
+
+typedef struct {
+  TokenExtra *start;
+  unsigned int count;
+} TokenList;
+
+typedef struct {
+  // tile_tok[i][j] is a pointer to the buffer storing palette tokens of the ith
+  // tile row, jth tile column.
+  TokenExtra *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+  // tplist[i][j][k] holds the start pointer of tile_tok[i][j] and the count of
+  // palette tokens for the kth superblock row of the ith tile row, jth tile
+  // column.
+  TokenList *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+} TokenInfo;
 
 struct AV1_COMP;
 struct ThreadData;
@@ -54,7 +69,7 @@
                        TX_SIZE tx_size, COLOR_MAP_TYPE type);
 
 void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
-                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
                             COLOR_MAP_TYPE type, int allow_update_cdf,
                             struct FRAME_COUNTS *counts);
 
@@ -64,6 +79,51 @@
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
+// Token buffer is only used for palette tokens.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+                                           int sb_size_log2,
+                                           const int num_planes) {
+  // Calculate the maximum number of max superblocks in the image.
+  const int shift = sb_size_log2 - 4;
+  const int sb_size = 1 << sb_size_log2;
+  const int sb_size_square = sb_size * sb_size;
+  const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift;
+  const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift;
+
+  // One palette token for each pixel. There can be palettes on two planes.
+  const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
+
+  return sb_rows * sb_cols * sb_palette_toks;
+}
+
+// Allocate memory for token related info.
+static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info) {
+  int mi_rows_aligned_to_sb =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
+  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+  const int num_planes = av1_num_planes(cm);
+  unsigned int tokens =
+      get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
+                      MAX_SB_SIZE_LOG2, num_planes);
+  CHECK_MEM_ERROR(
+      cm, token_info->tile_tok[0][0],
+      (TokenExtra *)aom_calloc(tokens, sizeof(*token_info->tile_tok[0][0])));
+
+  CHECK_MEM_ERROR(
+      cm, token_info->tplist[0][0],
+      (TokenList *)aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+                              sizeof(*token_info->tplist[0][0])));
+}
+
+// Free memory from token related variables.
+static AOM_INLINE void free_token_info(TokenInfo *token_info) {
+  aom_free(token_info->tile_tok[0][0]);
+  token_info->tile_tok[0][0] = NULL;
+
+  aom_free(token_info->tplist[0][0]);
+  token_info->tplist[0][0] = NULL;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 79b94f3..1aa308a 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -25,8 +25,11 @@
 #include "av1/common/reconintra.h"
 
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tpl_model.h"
@@ -38,15 +41,32 @@
                                           uint16_t *eob, int64_t *recon_error,
                                           int64_t *sse) {
   const struct macroblock_plane *const p = &x->plane[plane];
+  const MACROBLOCKD *xd = &x->e_mbd;
   const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
   int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
   const int shift = tx_size == TX_32X32 ? 0 : 2;
 
-  av1_quantize_fp(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
-                  p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
-                  scan_order->scan, scan_order->iscan);
+  QUANT_PARAM quant_param;
+  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
 
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob,
+                                  scan_order, &quant_param);
+    *recon_error =
+        av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift;
+  } else {
+    av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+                           &quant_param);
+    *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+  }
+#else
+  (void)xd;
+  av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+                         &quant_param);
   *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
   *recon_error = AOMMAX(*recon_error, 1);
 
   *sse = (*sse) >> shift;
@@ -86,7 +106,7 @@
   const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
 
   assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
-
+  aom_clear_system_state();
   int rate_cost = 1;
 
   for (int idx = 0; idx < eob; ++idx) {
@@ -142,15 +162,18 @@
   step_param = tpl_sf->reduce_first_step_size;
   step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
 
-  search_site_config *ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
-  if (ss_cfg->stride != stride_ref)
-    ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
-
-  assert(ss_cfg->stride == stride_ref);
+  const search_site_config *search_site_cfg =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
+  if (search_site_cfg->stride != stride_ref)
+    search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+  assert(search_site_cfg->stride == stride_ref);
 
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
-                                     ss_cfg);
+                                     search_site_cfg,
+                                     /*fine_search_interval=*/0);
+  av1_set_mv_search_method(&full_ms_params, search_site_cfg,
+                           tpl_sf->search_method);
 
   av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                         cond_cost_list(cpi, cost_list), &best_mv->as_fullmv,
@@ -170,7 +193,21 @@
   return bestsme;
 }
 
-static int is_alike_mv(int_mv candidate_mv, int_mv *center_mvs,
+typedef struct {
+  int_mv mv;
+  int sad;
+} center_mv_t;
+
+static int compare_sad(const void *a, const void *b) {
+  const int diff = ((center_mv_t *)a)->sad - ((center_mv_t *)b)->sad;
+  if (diff < 0)
+    return -1;
+  else if (diff > 0)
+    return 1;
+  return 0;
+}
+
+static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs,
                        int center_mvs_count, int skip_alike_starting_mv) {
   // MV difference threshold is in 1/8 precision.
   const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) };
@@ -178,26 +215,26 @@
   int i;
 
   for (i = 0; i < center_mvs_count; i++) {
-    if (abs(center_mvs[i].as_mv.col - candidate_mv.as_mv.col) < thr &&
-        abs(center_mvs[i].as_mv.row - candidate_mv.as_mv.row) < thr)
+    if (abs(center_mvs[i].mv.as_mv.col - candidate_mv.as_mv.col) < thr &&
+        abs(center_mvs[i].mv.as_mv.row - candidate_mv.as_mv.row) < thr)
       return 1;
   }
 
   return 0;
 }
 
-static AOM_INLINE void mode_estimation(
-    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf,
-    int frame_idx, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
-    const YV12_BUFFER_CONFIG *ref_frame[],
-    const YV12_BUFFER_CONFIG *src_ref_frame[], TplDepStats *tpl_stats) {
+static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                                       int mi_col, BLOCK_SIZE bsize,
+                                       TX_SIZE tx_size,
+                                       TplDepStats *tpl_stats) {
   AV1_COMMON *cm = &cpi->common;
   const GF_GROUP *gf_group = &cpi->gf_group;
 
   (void)gf_group;
 
+  MACROBLOCKD *xd = &x->e_mbd;
   TplParams *tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
 
   const int bw = 4 << mi_size_wide_log2[bsize];
@@ -218,13 +255,16 @@
   uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
   const int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
 
-  // Temporaray buffers
-  DECLARE_ALIGNED(32, uint8_t, predictor8[MC_FLOW_NUM_PELS * 2]);
-  DECLARE_ALIGNED(32, int16_t, src_diff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, coeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, qcoeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, best_coeff[MC_FLOW_NUM_PELS]);
+  // Number of pixels in a tpl block
+  const int tpl_block_pels = tpl_data->tpl_bsize_1d * tpl_data->tpl_bsize_1d;
+  // Allocate temporary buffers used in motion estimation.
+  uint8_t *predictor8 = aom_memalign(32, tpl_block_pels * 2 * sizeof(uint8_t));
+  int16_t *src_diff = aom_memalign(32, tpl_block_pels * sizeof(int16_t));
+  tran_low_t *coeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  tran_low_t *qcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  tran_low_t *dqcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  tran_low_t *best_coeff =
+      aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
   uint8_t *predictor =
       is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
   int64_t recon_error = 1, sse = 1;
@@ -239,7 +279,7 @@
                  cm->mi_params.mi_rows, cm->mi_params.mi_cols);
   set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
                av1_num_planes(cm));
-  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->bsize = bsize;
   xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
 
   // Intra prediction search
@@ -298,12 +338,13 @@
   best_mv.as_int = INVALID_MV;
 
   for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) {
-    if (ref_frame[rf_idx] == NULL || src_ref_frame[rf_idx] == NULL) {
+    if (tpl_data->ref_frame[rf_idx] == NULL ||
+        tpl_data->src_ref_frame[rf_idx] == NULL) {
       tpl_stats->mv[rf_idx].as_int = INVALID_MV;
       continue;
     }
 
-    const YV12_BUFFER_CONFIG *ref_frame_ptr = src_ref_frame[rf_idx];
+    const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx];
     int ref_mb_offset =
         mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
     uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
@@ -312,15 +353,19 @@
     int_mv best_rfidx_mv = { 0 };
     uint32_t bestsme = UINT32_MAX;
 
-    int_mv center_mvs[4] = { { 0 } };
+    center_mv_t center_mvs[4] = { { { 0 }, INT_MAX },
+                                  { { 0 }, INT_MAX },
+                                  { { 0 }, INT_MAX },
+                                  { { 0 }, INT_MAX } };
     int refmv_count = 1;
+    int idx;
 
     if (xd->up_available) {
       TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
           mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)];
       if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
                        cpi->sf.tpl_sf.skip_alike_starting_mv)) {
-        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
         ++refmv_count;
       }
     }
@@ -330,7 +375,7 @@
           mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)];
       if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
                        cpi->sf.tpl_sf.skip_alike_starting_mv)) {
-        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
         ++refmv_count;
       }
     }
@@ -341,16 +386,41 @@
           block_mis_log2)];
       if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
                        cpi->sf.tpl_sf.skip_alike_starting_mv)) {
-        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
         ++refmv_count;
       }
     }
 
-    for (int idx = 0; idx < refmv_count; ++idx) {
+    // Prune starting mvs
+    if (cpi->sf.tpl_sf.prune_starting_mv) {
+      // Get each center mv's sad.
+      for (idx = 0; idx < refmv_count; ++idx) {
+        FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv);
+        clamp_fullmv(&mv, &x->mv_limits);
+        center_mvs[idx].sad = (int)cpi->fn_ptr[bsize].sdf(
+            src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col],
+            ref_stride);
+      }
+
+      // Rank center_mv using sad.
+      if (refmv_count > 1) {
+        qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad);
+      }
+      refmv_count = AOMMIN(4 - cpi->sf.tpl_sf.prune_starting_mv, refmv_count);
+      // Further reduce number of refmv based on sad difference.
+      if (refmv_count > 1) {
+        int last_sad = center_mvs[refmv_count - 1].sad;
+        int second_to_last_sad = center_mvs[refmv_count - 2].sad;
+        if ((last_sad - second_to_last_sad) * 5 > second_to_last_sad)
+          refmv_count--;
+      }
+    }
+
+    for (idx = 0; idx < refmv_count; ++idx) {
       int_mv this_mv;
-      uint32_t thissme =
-          motion_estimation(cpi, x, src_mb_buffer, ref_mb, src_stride,
-                            ref_stride, bsize, center_mvs[idx].as_mv, &this_mv);
+      uint32_t thissme = motion_estimation(cpi, x, src_mb_buffer, ref_mb,
+                                           src_stride, ref_stride, bsize,
+                                           center_mvs[idx].mv.as_mv, &this_mv);
 
       if (thissme < bestsme) {
         bestsme = thissme;
@@ -366,7 +436,7 @@
     InterPredParams inter_pred_params;
     av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
                           mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
-                          sf, &ref_buf, kernel);
+                          &tpl_data->sf, &ref_buf, kernel);
     inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
 
     av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv,
@@ -378,7 +448,7 @@
     tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
 
     if (inter_cost < best_inter_cost) {
-      memcpy(best_coeff, coeff, sizeof(best_coeff));
+      memcpy(best_coeff, coeff, tpl_block_pels * sizeof(best_coeff[0]));
       best_rf_idx = rf_idx;
 
       best_inter_cost = inter_cost;
@@ -401,11 +471,7 @@
   }
 
   best_intra_cost = AOMMAX(best_intra_cost, 1);
-  if (frame_idx == 0) {
-    best_inter_cost = 0;
-  } else {
-    best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
-  }
+  best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
   tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
   tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
 
@@ -413,7 +479,7 @@
 
   // Final encode
   if (is_inter_mode(best_mode)) {
-    const YV12_BUFFER_CONFIG *ref_frame_ptr = ref_frame[best_rf_idx];
+    const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->ref_frame[best_rf_idx];
 
     InterPredParams inter_pred_params;
     struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
@@ -421,7 +487,7 @@
                               ref_frame_ptr->y_stride };
     av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
                           mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
-                          sf, &ref_buf, kernel);
+                          &tpl_data->sf, &ref_buf, kernel);
     inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
 
     av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride,
@@ -460,6 +526,14 @@
       }
     }
   }
+
+  // Free temporary buffers.
+  aom_free(predictor8);
+  aom_free(src_diff);
+  aom_free(coeff);
+  aom_free(qcoeff);
+  aom_free(dqcoeff);
+  aom_free(best_coeff);
 }
 
 static int round_floor(int ref_pos, int bsize_pix) {
@@ -613,13 +687,13 @@
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
   const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
-  const BLOCK_SIZE tpl_block_size =
+  const BLOCK_SIZE tpl_stats_block_size =
       convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2);
 
   for (int idy = 0; idy < mi_height; idy += step) {
     for (int idx = 0; idx < mi_width; idx += step) {
-      tpl_model_update_b(tpl_data, mi_row + idy, mi_col + idx, tpl_block_size,
-                         frame_idx);
+      tpl_model_update_b(tpl_data, mi_row + idy, mi_col + idx,
+                         tpl_stats_block_size, frame_idx);
     }
   }
 }
@@ -631,13 +705,14 @@
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
   const int step = 1 << block_mis_log2;
+  const int div = (mi_height >> block_mis_log2) * (mi_width >> block_mis_log2);
 
-  int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
-  int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
-  int64_t srcrf_dist = src_stats->srcrf_dist / (mi_height * mi_width);
-  int64_t recrf_dist = src_stats->recrf_dist / (mi_height * mi_width);
-  int64_t srcrf_rate = src_stats->srcrf_rate / (mi_height * mi_width);
-  int64_t recrf_rate = src_stats->recrf_rate / (mi_height * mi_width);
+  int64_t intra_cost = src_stats->intra_cost / div;
+  int64_t inter_cost = src_stats->inter_cost / div;
+  int64_t srcrf_dist = src_stats->srcrf_dist / div;
+  int64_t recrf_dist = src_stats->recrf_dist / div;
+  int64_t srcrf_rate = src_stats->srcrf_rate / div;
+  int64_t recrf_rate = src_stats->recrf_rate / div;
 
   intra_cost = AOMMAX(1, intra_cost);
   inter_cost = AOMMAX(1, inter_cost);
@@ -665,64 +740,89 @@
   }
 }
 
-static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
-                                         int pframe_qindex) {
-  const GF_GROUP *gf_group = &cpi->gf_group;
-  if (frame_idx == gf_group->size) return;
+// Reset the ref and source frame pointers of tpl_data.
+static AOM_INLINE void tpl_reset_src_ref_frames(TplParams *tpl_data) {
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    tpl_data->ref_frame[i] = NULL;
+    tpl_data->src_ref_frame[i] = NULL;
+  }
+}
+
+static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) {
+  int gop_length = AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1);
+  return gop_length;
+}
+
+// Initialize the mc_flow parameters used in computing tpl data.
+static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
+                                              int pframe_qindex) {
   TplParams *const tpl_data = &cpi->tpl_data;
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
   const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture;
-  const YV12_BUFFER_CONFIG *ref_frame[7] = { NULL, NULL, NULL, NULL,
-                                             NULL, NULL, NULL };
   const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
+  uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME];
+  GF_GROUP *gf_group = &cpi->gf_group;
+  int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+      gf_group, cpi->sf.inter_sf.selective_ref_frame,
+      cpi->sf.tpl_sf.prune_ref_frames_in_tpl, frame_idx);
+  int gop_length = get_gop_length(gf_group);
   int ref_frame_flags;
-  const YV12_BUFFER_CONFIG *src_frame[7] = { NULL, NULL, NULL, NULL,
-                                             NULL, NULL, NULL };
-
   AV1_COMMON *cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  struct scale_factors sf;
   int rdmult, idx;
   ThreadData *td = &cpi->td;
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
-  int mi_row, mi_col;
-  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  tpl_data->frame_idx = frame_idx;
+  tpl_reset_src_ref_frames(tpl_data);
   av1_tile_init(&xd->tile, cm, 0, 0);
 
-  const TX_SIZE tx_size = max_txsize_lookup[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int mi_width = mi_size_wide[bsize];
-
   // Setup scaling factor
   av1_setup_scale_factors_for_frame(
-      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+      &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height,
       this_frame->y_crop_width, this_frame->y_crop_height);
 
   xd->cur_buf = this_frame;
 
   for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
-    ref_frame[idx] =
-        tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].rec_picture;
-    src_frame[idx] =
-        tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].gf_picture;
+    TplDepFrame *tpl_ref_frame =
+        &tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]];
+    tpl_data->ref_frame[idx] = tpl_ref_frame->rec_picture;
+    tpl_data->src_ref_frame[idx] = tpl_ref_frame->gf_picture;
+    ref_frame_display_indices[idx] = tpl_ref_frame->frame_display_index;
   }
 
   // Store the reference frames based on priority order
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    ref_frames_ordered[i] = ref_frame[ref_frame_priority_order[i] - 1];
+    ref_frames_ordered[i] =
+        tpl_data->ref_frame[ref_frame_priority_order[i] - 1];
   }
 
   // Work out which reference frame slots may be used.
   ref_frame_flags = get_ref_frame_flags(&cpi->sf, ref_frames_ordered,
                                         cpi->ext_flags.ref_frame_flags);
 
-  enforce_max_ref_frames(cpi, &ref_frame_flags);
+  enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices,
+                         tpl_frame->frame_display_index);
 
   // Prune reference frames
   for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
     if ((ref_frame_flags & (1 << idx)) == 0) {
-      ref_frame[idx] = NULL;
+      tpl_data->ref_frame[idx] = NULL;
+    }
+  }
+
+  // Skip motion estimation w.r.t. reference frames which are not
+  // considered in RD search, using "selective_ref_frame" speed feature.
+  // The reference frame pruning is not enabled for frames beyond the gop
+  // length, as there are fewer reference frames and the reference frames
+  // differ from the frames considered during RD search.
+  if (ref_pruning_enabled && (frame_idx < gop_length)) {
+    for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+      const MV_REFERENCE_FRAME refs[2] = { idx + 1, NONE_FRAME };
+      if (prune_ref_by_selective_ref_frame(cpi, NULL, refs,
+                                           ref_frame_display_indices)) {
+        tpl_data->ref_frame[idx] = NULL;
+      }
     }
   }
 
@@ -732,14 +832,15 @@
   MB_MODE_INFO *mbmi_ptr = &mbmi;
   xd->mi = &mbmi_ptr;
 
-  xd->block_ref_scale_factors[0] = &sf;
+  xd->block_ref_scale_factors[0] = &tpl_data->sf;
 
   const int base_qindex = pframe_qindex;
   // Get rd multiplier set up.
   rdmult = (int)av1_compute_rd_mult(cpi, base_qindex);
   if (rdmult < 1) rdmult = 1;
-  set_error_per_bit(x, rdmult);
-  av1_initialize_me_consts(cpi, x, base_qindex);
+  MvCosts *mv_costs = &x->mv_costs;
+  av1_set_error_per_bit(mv_costs, rdmult);
+  av1_set_sad_per_bit(cpi, mv_costs, base_qindex);
 
   tpl_frame->is_valid = 1;
 
@@ -748,43 +849,73 @@
 
   tpl_frame->base_rdmult =
       av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6;
+}
 
-  for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
+// This function stores the motion estimation dependencies of all the blocks in
+// a row
+void av1_mc_flow_dispenser_row(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                               BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_width = mi_size_wide[bsize];
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  const int tplb_cols_in_tile =
+      ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+  const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]);
+
+  for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols;
+       mi_col += mi_width, tplb_col_in_tile++) {
+    (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                 tplb_col_in_tile);
+    TplDepStats tpl_stats;
+
+    // Motion estimation column boundary
+    av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
+                          tpl_data->border_in_pixels);
+    xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+    xd->mb_to_right_edge =
+        GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
+    mode_estimation(cpi, x, mi_row, mi_col, bsize, tx_size, &tpl_stats);
+
+    // Motion flow dependency dispenser.
+    tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+                    tpl_frame->stride, &tpl_stats,
+                    tpl_data->tpl_stats_block_mis_log2);
+    (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                  tplb_col_in_tile, tplb_cols_in_tile);
+  }
+}
+
+static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int mi_height = mi_size_high[bsize];
+  for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
     // Motion estimation row boundary
     av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
-                          cpi->oxcf.border_in_pixels);
+                          cpi->tpl_data.border_in_pixels);
     xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
     xd->mb_to_bottom_edge =
         GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
-    for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += mi_width) {
-      TplDepStats tpl_stats;
-
-      // Motion estimation column boundary
-      av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
-                            cpi->oxcf.border_in_pixels);
-      xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
-      xd->mb_to_right_edge =
-          GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
-      mode_estimation(cpi, x, xd, &sf, frame_idx, mi_row, mi_col, bsize,
-                      tx_size, ref_frame, src_frame, &tpl_stats);
-
-      // Motion flow dependency dispenser.
-      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
-                      tpl_frame->stride, &tpl_stats,
-                      tpl_data->tpl_stats_block_mis_log2);
-    }
+    av1_mc_flow_dispenser_row(cpi, x, mi_row, bsize, tx_size);
   }
 }
 
 static void mc_flow_synthesizer(AV1_COMP *cpi, int frame_idx) {
   AV1_COMMON *cm = &cpi->common;
-
-  const GF_GROUP *gf_group = &cpi->gf_group;
-  if (frame_idx == gf_group->size) return;
-
   TplParams *const tpl_data = &cpi->tpl_data;
 
-  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
 
@@ -812,7 +943,7 @@
   int ref_picture_map[REF_FRAMES];
 
   for (int i = 0; i < REF_FRAMES; ++i) {
-    if (frame_params.frame_type == KEY_FRAME || gop_eval) {
+    if (frame_params.frame_type == KEY_FRAME) {
       tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
       tpl_data->tpl_frame[-1 - 1].rec_picture = NULL;
       tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
@@ -829,47 +960,55 @@
   *tpl_group_frames = cur_frame_idx;
 
   int gf_index;
-  int use_arf = gf_group->update_type[1] == ARF_UPDATE;
-  int anc_frame_offset = gf_group->cur_frame_idx[cur_frame_idx] + 1;
+  int anc_frame_offset = gop_eval ? 0 : gf_group->cur_frame_idx[cur_frame_idx];
   int process_frame_count = 0;
-  const int gop_length =
-      AOMMIN(gf_group->size - 1 + use_arf, MAX_LENGTH_TPL_FRAME_STATS - 1);
-  for (gf_index = cur_frame_idx; gf_index <= gop_length; ++gf_index) {
+  const int gop_length = get_gop_length(gf_group);
+
+  for (gf_index = cur_frame_idx; gf_index < gop_length; ++gf_index) {
     TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
     FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
+    int frame_display_index = gf_index == gf_group->size
+                                  ? cpi->rc.baseline_gf_interval
+                                  : gf_group->cur_frame_idx[gf_index] +
+                                        gf_group->arf_src_offset[gf_index];
+
+    int lookahead_index = frame_display_index - anc_frame_offset;
 
     frame_params.show_frame = frame_update_type != ARF_UPDATE &&
                               frame_update_type != INTNL_ARF_UPDATE;
     frame_params.show_existing_frame =
         frame_update_type == INTNL_OVERLAY_UPDATE ||
         frame_update_type == OVERLAY_UPDATE;
-    frame_params.frame_type =
-        frame_update_type == KF_UPDATE ? KEY_FRAME : INTER_FRAME;
+    frame_params.frame_type = gf_group->frame_type[gf_index];
 
     if (frame_update_type == LF_UPDATE)
       *pframe_qindex = gf_group->q_val[gf_index];
 
     if (gf_index == cur_frame_idx) {
-      tpl_frame->gf_picture = frame_input->source;
-      // frame display index = frame offset within the gf group + start frame of
-      // the gf group
-      tpl_frame->frame_display_index =
-          gf_group->frame_disp_idx[gf_index] +
-          cpi->common.current_frame.display_order_hint;
-    } else {
-      int frame_display_index = gf_index == gf_group->size
-                                    ? cpi->rc.baseline_gf_interval
-                                    : gf_group->frame_disp_idx[gf_index];
       struct lookahead_entry *buf = av1_lookahead_peek(
-          cpi->lookahead, frame_display_index - anc_frame_offset,
-          cpi->compressor_stage);
+          cpi->lookahead, lookahead_index, cpi->compressor_stage);
+      tpl_frame->gf_picture = gop_eval ? &buf->img : frame_input->source;
+    } else {
+      struct lookahead_entry *buf = av1_lookahead_peek(
+          cpi->lookahead, lookahead_index, cpi->compressor_stage);
+
       if (buf == NULL) break;
       tpl_frame->gf_picture = &buf->img;
-      // frame display index = frame offset within the gf group + start frame of
-      // the gf group
-      tpl_frame->frame_display_index =
-          frame_display_index + cpi->common.current_frame.display_order_hint;
     }
+    if (gop_eval && cpi->rc.frames_since_key > 0 &&
+        gf_group->arf_index == gf_index)
+      tpl_frame->gf_picture = &cpi->alt_ref_buffer;
+
+    // 'cm->current_frame.frame_number' is the display number
+    // of the current frame.
+    // 'anc_frame_offset' is the number of frames displayed so
+    // far within the gf group. 'cm->current_frame.frame_number -
+    // anc_frame_offset' is the offset of the first frame in the gf group.
+    // 'frame display index' is frame offset within the gf group.
+    // 'frame_display_index + cm->current_frame.frame_number - anc_frame_offset'
+    // is the display index of the frame.
+    tpl_frame->frame_display_index =
+        frame_display_index + cm->current_frame.frame_number - anc_frame_offset;
 
     if (frame_update_type != OVERLAY_UPDATE &&
         frame_update_type != INTNL_OVERLAY_UPDATE) {
@@ -883,7 +1022,7 @@
         cpi, &frame_params, frame_update_type, &ref_buffer_stack);
 
     int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
-    av1_update_ref_frame_map(cpi, frame_update_type,
+    av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
                              frame_params.show_existing_frame,
                              refresh_frame_map_index, &ref_buffer_stack);
 
@@ -896,16 +1035,16 @@
     ++*tpl_group_frames;
   }
 
-  if (cur_frame_idx == 0) return;
+  if (cpi->rc.frames_since_key == 0) return;
 
   int extend_frame_count = 0;
-  int extend_frame_length =
-      AOMMIN(cpi->rc.baseline_gf_interval,
-             cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval);
-  int frame_display_index = cpi->rc.baseline_gf_interval + 1;
+  int extend_frame_length = AOMMIN(
+      MAX_TPL_EXTEND, cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval);
+  int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] +
+                            gf_group->arf_src_offset[gop_length - 1] + 1;
 
-  for (; gf_index < MAX_LENGTH_TPL_FRAME_STATS &&
-         extend_frame_count < extend_frame_length;
+  for (;
+       gf_index < MAX_TPL_FRAME_IDX && extend_frame_count < extend_frame_length;
        ++gf_index) {
     TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
     FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE;
@@ -915,21 +1054,27 @@
         frame_update_type == INTNL_OVERLAY_UPDATE;
     frame_params.frame_type = INTER_FRAME;
 
+    int lookahead_index = frame_display_index - anc_frame_offset;
     struct lookahead_entry *buf = av1_lookahead_peek(
-        cpi->lookahead, frame_display_index - anc_frame_offset,
-        cpi->compressor_stage);
+        cpi->lookahead, lookahead_index, cpi->compressor_stage);
 
     if (buf == NULL) break;
 
     tpl_frame->gf_picture = &buf->img;
     tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
     tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
-    ++process_frame_count;
-
-    // frame display index = frame offset within the gf group + start frame of
-    // the gf group
+    // 'cm->current_frame.frame_number' is the display number
+    // of the current frame.
+    // 'anc_frame_offset' is the number of frames displayed so
+    // far within the gf group. 'cm->current_frame.frame_number -
+    // anc_frame_offset' is the offset of the first frame in the gf group.
+    // 'frame display index' is frame offset within the gf group.
+    // 'frame_display_index + cm->current_frame.frame_number - anc_frame_offset'
+    // is the display index of the frame.
     tpl_frame->frame_display_index =
-        frame_display_index + cpi->common.current_frame.display_order_hint;
+        frame_display_index + cm->current_frame.frame_number - anc_frame_offset;
+
+    ++process_frame_count;
 
     gf_group->update_type[gf_index] = LF_UPDATE;
     gf_group->q_val[gf_index] = *pframe_qindex;
@@ -938,7 +1083,7 @@
     int refresh_mask = av1_get_refresh_frame_flags(
         cpi, &frame_params, frame_update_type, &ref_buffer_stack);
     int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
-    av1_update_ref_frame_map(cpi, frame_update_type,
+    av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
                              frame_params.show_existing_frame,
                              refresh_frame_map_index, &ref_buffer_stack);
 
@@ -961,9 +1106,10 @@
   av1_get_ref_frames(cpi, &cpi->ref_buffer_stack);
 }
 
-static AOM_INLINE void init_tpl_stats(TplParams *const tpl_data) {
+void av1_init_tpl_stats(TplParams *const tpl_data) {
   for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
     TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
+    if (tpl_data->tpl_stats_pool[frame_idx] == NULL) continue;
     memset(tpl_data->tpl_stats_pool[frame_idx], 0,
            tpl_frame->height * tpl_frame->width *
                sizeof(*tpl_frame->tpl_stats_ptr));
@@ -974,22 +1120,32 @@
 int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
                         const EncodeFrameParams *const frame_params,
                         const EncodeFrameInput *const frame_input) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_tpl_setup_stats_time);
+#endif
   AV1_COMMON *cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
   GF_GROUP *gf_group = &cpi->gf_group;
   int bottom_index, top_index;
   EncodeFrameParams this_frame_params = *frame_params;
   TplParams *const tpl_data = &cpi->tpl_data;
 
-  if (cpi->superres_mode != SUPERRES_NONE) return 0;
+  if (cpi->superres_mode != AOM_SUPERRES_NONE) {
+    assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
+    av1_init_tpl_stats(tpl_data);
+    return 0;
+  }
 
   cm->current_frame.frame_type = frame_params->frame_type;
   for (int gf_index = gf_group->index; gf_index < gf_group->size; ++gf_index) {
-    av1_configure_buffer_updates(cpi, &this_frame_params,
-                                 gf_group->update_type[gf_index], 0);
+    cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+    av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+                                 gf_group->update_type[gf_index],
+                                 cm->current_frame.frame_type, 0);
 
-    cpi->refresh_golden_frame = this_frame_params.refresh_golden_frame;
-    cpi->refresh_bwd_ref_frame = this_frame_params.refresh_bwd_ref_frame;
-    cpi->refresh_alt_ref_frame = this_frame_params.refresh_alt_ref_frame;
+    memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame,
+           sizeof(cpi->refresh_frame));
 
     cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE &&
                      gf_group->update_type[gf_index] != INTNL_ARF_UPDATE;
@@ -997,8 +1153,6 @@
     gf_group->q_val[gf_index] =
         av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, gf_index,
                                  &bottom_index, &top_index);
-
-    cm->current_frame.frame_type = INTER_FRAME;
   }
 
   int pframe_qindex;
@@ -1008,7 +1162,10 @@
 
   cpi->rc.base_layer_qp = pframe_qindex;
 
-  init_tpl_stats(tpl_data);
+  av1_init_tpl_stats(tpl_data);
+
+  tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy;
+  tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy;
 
   // Backward propagation from tpl_group_frames to 1.
   for (int frame_idx = gf_group->index; frame_idx < tpl_gf_group_frames;
@@ -1017,7 +1174,14 @@
         gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
       continue;
 
-    mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
+    init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
+    if (mt_info->num_workers > 1) {
+      tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read;
+      tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write;
+      av1_mc_flow_dispenser_mt(cpi);
+    } else {
+      mc_flow_dispenser(cpi);
+    }
 
     aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture,
                              av1_num_planes(cm));
@@ -1032,16 +1196,19 @@
     mc_flow_synthesizer(cpi, frame_idx);
   }
 
-  av1_configure_buffer_updates(cpi, &this_frame_params,
-                               gf_group->update_type[gf_group->index], 0);
+  av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+                               gf_group->update_type[gf_group->index],
+                               frame_params->frame_type, 0);
   cm->current_frame.frame_type = frame_params->frame_type;
   cm->show_frame = frame_params->show_frame;
 
   if (cpi->common.tiles.large_scale) return 0;
   if (gf_group->max_layer_depth_allowed == 0) return 1;
+  assert(gf_group->arf_index >= 0);
 
   double beta[2] = { 0.0 };
-  for (int frame_idx = 1; frame_idx <= AOMMIN(tpl_gf_group_frames - 1, 2);
+  for (int frame_idx = gf_group->arf_index;
+       frame_idx <= AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1);
        ++frame_idx) {
     TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
     TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
@@ -1049,10 +1216,13 @@
     int64_t intra_cost_base = 0;
     int64_t mc_dep_cost_base = 0;
     const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+    const int row_step = step;
+    const int col_step_sr =
+        coded_to_superres_mi(step, cm->superres_scale_denominator);
     const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
 
-    for (int row = 0; row < cm->mi_params.mi_rows; row += step) {
-      for (int col = 0; col < mi_cols_sr; col += step) {
+    for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) {
+      for (int col = 0; col < mi_cols_sr; col += col_step_sr) {
         TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
             row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
         int64_t mc_dep_delta =
@@ -1063,13 +1233,18 @@
             (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
       }
     }
-    beta[frame_idx - 1] = (double)mc_dep_cost_base / intra_cost_base;
+    beta[frame_idx - gf_group->arf_index] =
+        (double)mc_dep_cost_base / intra_cost_base;
   }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+
   // Allow larger GOP size if the base layer ARF has higher dependency factor
   // than the intermediate ARF and both ARFs have reasonably high dependency
   // factors.
-  return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0;
+  return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0;
 }
 
 void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
@@ -1083,7 +1258,6 @@
   const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx];
 
   if (!tpl_frame->is_valid) return;
-  if (cpi->superres_mode != SUPERRES_NONE) return;
 
   const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr;
   const int tpl_stride = tpl_frame->stride;
@@ -1130,23 +1304,29 @@
 void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
                              BLOCK_SIZE sb_size, int mi_row, int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->gf_group;
   assert(IMPLIES(cpi->gf_group.size > 0,
                  cpi->gf_group.index < cpi->gf_group.size));
   const int tpl_idx = cpi->gf_group.index;
   TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
 
   if (tpl_frame->is_valid == 0) return;
-  if (!is_frame_tpl_eligible(cpi)) return;
-  if (tpl_idx >= MAX_LAG_BUFFERS) return;
-  if (cpi->superres_mode != SUPERRES_NONE) return;
-  if (cpi->oxcf.aq_mode != NO_AQ) return;
+  if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return;
+  if (tpl_idx >= MAX_TPL_FRAME_IDX) return;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int sb_mi_width_sr = coded_to_superres_mi(
+      mi_size_wide[sb_size], cm->superres_scale_denominator);
 
   const int bsize_base = BLOCK_16X16;
   const int num_mi_w = mi_size_wide[bsize_base];
   const int num_mi_h = mi_size_high[bsize_base];
-  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
   const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
-  const int num_bcols = (mi_size_wide[sb_size] + num_mi_w - 1) / num_mi_w;
+  const int num_bcols = (sb_mi_width_sr + num_mi_w - 1) / num_mi_w;
   const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h;
   int row, col;
 
@@ -1156,20 +1336,19 @@
   aom_clear_system_state();
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
-    for (col = mi_col / num_mi_h;
-         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+    for (col = mi_col_sr / num_mi_h;
+         col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) {
       const int index = row * num_cols + col;
       log_sum += log(cpi->tpl_rdmult_scaling_factors[index]);
       base_block_count += 1.0;
     }
   }
 
-  MACROBLOCKD *const xd = &x->e_mbd;
   const CommonQuantParams *quant_params = &cm->quant_params;
   const int orig_rdmult = av1_compute_rd_mult(
       cpi, quant_params->base_qindex + quant_params->y_dc_delta_q);
   const int new_rdmult =
-      av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex +
+      av1_compute_rd_mult(cpi, quant_params->base_qindex + x->delta_qindex +
                                    quant_params->y_dc_delta_q);
   const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
 
@@ -1178,8 +1357,8 @@
 
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
-    for (col = mi_col / num_mi_h;
-         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+    for (col = mi_col_sr / num_mi_h;
+         col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) {
       const int index = row * num_cols + col;
       cpi->tpl_sb_rdmult_scaling_factors[index] =
           scale_adj * cpi->tpl_rdmult_scaling_factors[index];
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index 11a61b6..66a80a3 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -16,6 +16,14 @@
 extern "C" {
 #endif
 
+/*!\cond */
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+struct EncodeFrameInput;
+
+#include "av1/encoder/encoder.h"
+
 static INLINE BLOCK_SIZE convert_length_to_bsize(int length) {
   switch (length) {
     case 64: return BLOCK_64X64;
@@ -29,17 +37,191 @@
   }
 }
 
-int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
-                        const EncodeFrameParams *const frame_params,
-                        const EncodeFrameInput *const frame_input);
+typedef struct AV1TplRowMultiThreadSync {
+#if CONFIG_MULTITHREAD
+  // Synchronization objects for top-right dependency.
+  pthread_mutex_t *mutex_;
+  pthread_cond_t *cond_;
+#endif
+  // Buffer to store the macroblock whose encoding is complete.
+  // num_finished_cols[i] stores the number of macroblocks which finished
+  // encoding in the ith macroblock row.
+  int *num_finished_cols;
+  // Number of extra macroblocks of the top row to be complete for encoding
+  // of the current macroblock to start. A value of 1 indicates top-right
+  // dependency.
+  int sync_range;
+  // Number of macroblock rows.
+  int rows;
+  // Number of threads processing the current tile.
+  int num_threads_working;
+} AV1TplRowMultiThreadSync;
+
+typedef struct AV1TplRowMultiThreadInfo {
+  // Row synchronization related function pointers.
+  void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c);
+  void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c,
+                         int cols);
+} AV1TplRowMultiThreadInfo;
+
+// TODO(jingning): This needs to be cleaned up next.
+
+// TPL stats buffers are prepared for every frame in the GOP,
+// including (internal) overlays and (internal) arfs.
+// In addition, frames in the lookahead that are outside of the GOP
+// are also used.
+// Thus it should use
+// (gop_length) + (# overlays) + (MAX_LAG_BUFFERS - gop_len) =
+// MAX_LAG_BUFFERS + (# overlays)
+// 2 * MAX_LAG_BUFFERS is therefore a safe estimate.
+// TODO(bohanli): test setting it to 1.5 * MAX_LAG_BUFFER
+#define MAX_TPL_FRAME_IDX (2 * MAX_LAG_BUFFERS)
+// The first REF_FRAMES + 1 buffers are reserved.
+// tpl_data->tpl_frame starts after REF_FRAMES + 1
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1)
+#define MAX_TPL_EXTEND (MAX_LAG_BUFFERS - MAX_GF_INTERVAL)
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+typedef struct TplDepStats {
+  int64_t intra_cost;
+  int64_t inter_cost;
+  int64_t srcrf_dist;
+  int64_t recrf_dist;
+  int64_t srcrf_rate;
+  int64_t recrf_rate;
+  int64_t mc_dep_rate;
+  int64_t mc_dep_dist;
+  int_mv mv[INTER_REFS_PER_FRAME];
+  int ref_frame_index;
+  int64_t pred_error[INTER_REFS_PER_FRAME];
+} TplDepStats;
+
+typedef struct TplDepFrame {
+  uint8_t is_valid;
+  TplDepStats *tpl_stats_ptr;
+  const YV12_BUFFER_CONFIG *gf_picture;
+  YV12_BUFFER_CONFIG *rec_picture;
+  int ref_map_index[REF_FRAMES];
+  int stride;
+  int width;
+  int height;
+  int mi_rows;
+  int mi_cols;
+  int base_rdmult;
+  uint32_t frame_display_index;
+} TplDepFrame;
+
+/*!\endcond */
+/*!
+ * \brief Params related to temporal dependency model.
+ */
+typedef struct TplParams {
+  /*!
+   * Block granularity of tpl score storage.
+   */
+  uint8_t tpl_stats_block_mis_log2;
+
+  /*!
+   * Tpl motion estimation block 1d size. tpl_bsize_1d >= 16.
+   */
+  uint8_t tpl_bsize_1d;
+
+  /*!
+   * Buffer to store the frame level tpl information for each frame in a gf
+   * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
+   * group
+   */
+  TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
+
+  /*!
+   * Buffer to store tpl stats at block granularity.
+   * tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
+   * group.
+   */
+  TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+
+  /*!
+   * Buffer to store tpl reconstructed frame.
+   * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
+   */
+  YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+
+  /*!
+   * Pointer to tpl_stats_buffer.
+   */
+  TplDepFrame *tpl_frame;
+
+  /*!
+   * Scale factors for the current frame.
+   */
+  struct scale_factors sf;
+
+  /*!
+   * GF group index of the current frame.
+   */
+  int frame_idx;
+
+  /*!
+   * Array of pointers to the frame buffers holding the source frame.
+   * src_ref_frame[i] stores the pointer to the source frame of the ith
+   * reference frame type.
+   */
+  const YV12_BUFFER_CONFIG *src_ref_frame[INTER_REFS_PER_FRAME];
+
+  /*!
+   * Array of pointers to the frame buffers holding the tpl reconstructed frame.
+   * ref_frame[i] stores the pointer to the tpl reconstructed frame of the ith
+   * reference frame type.
+   */
+  const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME];
+
+  /*!
+   * Parameters related to synchronization for top-right dependency in row based
+   * multi-threading of tpl
+   */
+  AV1TplRowMultiThreadSync tpl_mt_sync;
+
+  /*!
+   * Frame border for tpl frame.
+   */
+  int border_in_pixels;
+
+  /*!
+   * Skip tpl setup when tpl data from gop length decision can be reused.
+   */
+  int skip_tpl_setup_stats;
+} TplParams;
+
+/*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
+ * group) and selects between 16 and 32 frame GOP structure.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    cpi           Top - level encoder instance structure
+ * \param[in]    gop_eval      Flag if it is in the GOP length decision stage
+ * \param[in]    frame_params  Per frame encoding parameters
+ * \param[in]    frame_input   Input frame buffers
+ *
+ * \return Indicates whether or not we should use a longer GOP length.
+ */
+int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval,
+                        const struct EncodeFrameParams *const frame_params,
+                        const struct EncodeFrameInput *const frame_input);
+
+/*!\cond */
 
 int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift);
 
-void av1_tpl_rdmult_setup(AV1_COMP *cpi);
+void av1_init_tpl_stats(TplParams *const tpl_data);
 
-void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
+void av1_tpl_rdmult_setup(struct AV1_COMP *cpi);
+
+void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x,
                              BLOCK_SIZE sb_size, int mi_row, int mi_col);
 
+void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                               BLOCK_SIZE bsize, TX_SIZE tx_size);
+/*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 997f78e..03f6d8d 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -12,13 +12,187 @@
 #include "av1/encoder/tune_vmaf.h"
 
 #include "aom_dsp/psnr.h"
-#include "aom_dsp/vmaf.h"
 #include "aom_ports/system_state.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/rdopt.h"
+#if CONFIG_USE_VMAF_RC
+#include "config/aom_scale_rtcd.h"
+#endif
 
 static const double kBaselineVmaf = 97.42773;
 
+static double get_layer_value(const double *array, int layer) {
+  while (array[layer] < 0.0 && layer > 0) layer--;
+  return AOMMAX(array[layer], 0.0);
+}
+
+static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src,
+                          const YV12_BUFFER_CONFIG *ref,
+                          const BLOCK_SIZE block_size, const int mb_row,
+                          const int mb_col, FULLPEL_MV *ref_mv) {
+  // Block information (ONLY Y-plane is used for motion search).
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int y_stride = src->y_stride;
+  assert(y_stride == ref->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  const struct buf_2d ori_src_buf = mb->plane[0].src;
+  const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+
+  // Parameters used for motion search.
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  const SEARCH_METHODS search_method = NSTEP;
+  const search_site_config *search_site_cfg =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+  const int step_param =
+      av1_init_search_range(AOMMAX(src->y_crop_width, src->y_crop_height));
+
+  // Baseline position for motion search (used for rate distortion comparison).
+  const MV baseline_mv = kZeroMv;
+
+  // Setup.
+  mb->plane[0].src.buf = src->y_buffer + y_offset;
+  mb->plane[0].src.stride = y_stride;
+  mbd->plane[0].pre[0].buf = ref->y_buffer + y_offset;
+  mbd->plane[0].pre[0].stride = y_stride;
+
+  // Unused intermediate results for motion search.
+  int cost_list[5];
+
+  // Do motion search.
+  // Only do full search on the entire block.
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+                                     &baseline_mv, search_site_cfg,
+                                     /*fine_search_interval=*/0);
+  av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
+  av1_full_pixel_search(*ref_mv, &full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), ref_mv, NULL);
+
+  // Restore input state.
+  mb->plane[0].src = ori_src_buf;
+  mbd->plane[0].pre[0] = ori_pre_buf;
+}
+
+static unsigned int residual_variance(const AV1_COMP *cpi,
+                                      const YV12_BUFFER_CONFIG *src,
+                                      const YV12_BUFFER_CONFIG *ref,
+                                      const BLOCK_SIZE block_size,
+                                      const int mb_row, const int mb_col,
+                                      FULLPEL_MV ref_mv, unsigned int *sse) {
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int y_stride = src->y_stride;
+  assert(y_stride == ref->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+  const int mv_offset = ref_mv.row * y_stride + ref_mv.col;
+  const unsigned int var =
+      cpi->fn_ptr[block_size].vf(ref->y_buffer + y_offset + mv_offset, y_stride,
+                                 src->y_buffer + y_offset, y_stride, sse);
+  return var;
+}
+
+static double frame_average_variance(const AV1_COMP *const cpi,
+                                     const YV12_BUFFER_CONFIG *const frame) {
+  const uint8_t *const y_buffer = frame->y_buffer;
+  const int y_stride = frame->y_stride;
+  const BLOCK_SIZE block_size = BLOCK_64X64;
+
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  int row, col;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  double var = 0.0, var_count = 0.0;
+
+  // Loop through each block.
+  for (row = 0; row < frame->y_height / block_h; ++row) {
+    for (col = 0; col < frame->y_width / block_w; ++col) {
+      struct buf_2d buf;
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
+
+      buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
+      buf.stride = y_stride;
+
+      if (cpi->common.seq_params.use_highbitdepth) {
+        assert(frame->flags & YV12_FLAG_HIGHBITDEPTH);
+        var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size,
+                                                  bit_depth);
+      } else {
+        var += av1_get_sby_perpixel_variance(cpi, &buf, block_size);
+      }
+      var_count += 1.0;
+    }
+  }
+  var /= var_count;
+  return var;
+}
+
+static double residual_frame_average_variance(AV1_COMP *cpi,
+                                              const YV12_BUFFER_CONFIG *src,
+                                              const YV12_BUFFER_CONFIG *ref,
+                                              FULLPEL_MV *mvs) {
+  if (ref == NULL) return frame_average_variance(cpi, src);
+  const BLOCK_SIZE block_size = BLOCK_16X16;
+  const int frame_height = src->y_height;
+  const int frame_width = src->y_width;
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_rows = (frame_height + mb_height - 1) / mb_height;
+  const int mb_cols = (frame_width + mb_width - 1) / mb_width;
+  const int num_planes = av1_num_planes(&cpi->common);
+  const int mi_h = mi_size_high_log2[block_size];
+  const int mi_w = mi_size_wide_log2[block_size];
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+  bool do_motion_search = false;
+  if (mvs == NULL) {
+    do_motion_search = true;
+    mvs = (FULLPEL_MV *)aom_malloc(sizeof(*mvs) * mb_rows * mb_cols);
+    memset(mvs, 0, sizeof(*mvs) * mb_rows * mb_cols);
+  }
+
+  unsigned int variance = 0;
+  // Perform temporal filtering block by block.
+  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
+    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+                            cpi->oxcf.border_in_pixels);
+      FULLPEL_MV *ref_mv = &mvs[mb_col + mb_row * mb_cols];
+      if (do_motion_search) {
+        motion_search(cpi, src, ref, block_size, mb_row, mb_col, ref_mv);
+      }
+      unsigned int mv_sse;
+      const unsigned int blk_var = residual_variance(
+          cpi, src, ref, block_size, mb_row, mb_col, *ref_mv, &mv_sse);
+      variance += blk_var;
+    }
+  }
+
+  // Restore input state
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+  return (double)variance / (double)(mb_rows * mb_cols);
+}
+
 // TODO(sdeng): Add the SIMD implementation.
 static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
                                            int source_stride,
@@ -60,7 +234,10 @@
                                const YV12_BUFFER_CONFIG *blurred,
                                const YV12_BUFFER_CONFIG *dst, double amount) {
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  if (bit_depth > 8) {
+  if (cpi->common.seq_params.use_highbitdepth) {
+    assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(dst->flags & YV12_FLAG_HIGHBITDEPTH);
     highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
                         CONVERT_TO_SHORTPTR(blurred->y_buffer),
                         blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
@@ -90,7 +267,6 @@
   ConvolveParams conv_params = get_conv_params(0, 0, bit_depth);
   InterpFilterParams filter = { .filter_ptr = gauss_filter,
                                 .taps = 8,
-                                .subpel_shifts = 0,
                                 .interp_filter = EIGHTTAP_REGULAR };
 
   for (row = 0; row < num_rows; ++row) {
@@ -103,7 +279,7 @@
       uint8_t *dst_buf =
           dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y;
 
-      if (bit_depth > 8) {
+      if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
         av1_highbd_convolve_2d_sr(
             CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
             CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
@@ -117,58 +293,39 @@
   }
 }
 
-static double frame_average_variance(const AV1_COMP *const cpi,
-                                     const YV12_BUFFER_CONFIG *const frame) {
-  const uint8_t *const y_buffer = frame->y_buffer;
-  const int y_stride = frame->y_stride;
-  const BLOCK_SIZE block_size = BLOCK_64X64;
-
-  const int block_w = mi_size_wide[block_size] * 4;
-  const int block_h = mi_size_high[block_size] * 4;
-  int row, col;
-  const int bit_depth = cpi->td.mb.e_mbd.bd;
-  double var = 0.0, var_count = 0.0;
-
-  // Loop through each block.
-  for (row = 0; row < frame->y_height / block_h; ++row) {
-    for (col = 0; col < frame->y_width / block_w; ++col) {
-      struct buf_2d buf;
-      const int row_offset_y = row * block_h;
-      const int col_offset_y = col * block_w;
-
-      buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
-      buf.stride = y_stride;
-
-      if (bit_depth > 8) {
-        var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size,
-                                                  bit_depth);
-      } else {
-        var += av1_get_sby_perpixel_variance(cpi, &buf, block_size);
-      }
-      var_count += 1.0;
-    }
-  }
-  var /= var_count;
-  return var;
-}
-
-static double cal_approx_vmaf(const AV1_COMP *const cpi, double source_variance,
-                              YV12_BUFFER_CONFIG *const source,
-                              YV12_BUFFER_CONFIG *const sharpened) {
+static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi,
+#if CONFIG_USE_VMAF_RC
+                                         VmafContext *vmaf_context,
+                                         int *vmaf_cal_index,
+#endif
+                                         double source_variance,
+                                         YV12_BUFFER_CONFIG *const source,
+                                         YV12_BUFFER_CONFIG *const sharpened) {
   const int bit_depth = cpi->td.mb.e_mbd.bd;
   double new_vmaf;
-  aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, sharpened, bit_depth,
-                &new_vmaf);
+
+#if CONFIG_USE_VMAF_RC
+  aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, source,
+                            sharpened, bit_depth, *vmaf_cal_index, &new_vmaf);
+  (*vmaf_cal_index)++;
+#else
+  aom_calc_vmaf(cpi->oxcf.tune_cfg.vmaf_model_path, source, sharpened,
+                bit_depth, &new_vmaf);
+#endif
+
   const double sharpened_var = frame_average_variance(cpi, sharpened);
   return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf);
 }
 
 static double find_best_frame_unsharp_amount_loop(
-    const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source,
-    YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened,
-    double best_vmaf, const double baseline_variance,
-    const double unsharp_amount_start, const double step_size,
-    const int max_loop_count, const double max_amount) {
+    const AV1_COMP *const cpi,
+#if CONFIG_USE_VMAF_RC
+    VmafContext *vmaf_context, int *vmaf_cal_index,
+#endif
+    YV12_BUFFER_CONFIG *const source, YV12_BUFFER_CONFIG *const blurred,
+    YV12_BUFFER_CONFIG *const sharpened, double best_vmaf,
+    const double baseline_variance, const double unsharp_amount_start,
+    const double step_size, const int max_loop_count, const double max_amount) {
   const double min_amount = 0.0;
   int loop_count = 0;
   double approx_vmaf = best_vmaf;
@@ -178,7 +335,11 @@
     unsharp_amount += step_size;
     if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
     unsharp(cpi, source, blurred, sharpened, unsharp_amount);
-    approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened);
+    approx_vmaf = cal_approx_vmaf(cpi,
+#if CONFIG_USE_VMAF_RC
+                                  vmaf_context, vmaf_cal_index,
+#endif
+                                  baseline_variance, source, sharpened);
 
     loop_count++;
   } while (approx_vmaf > best_vmaf && loop_count < max_loop_count);
@@ -197,7 +358,13 @@
   const AV1_COMMON *const cm = &cpi->common;
   const int width = source->y_width;
   const int height = source->y_height;
-
+#if CONFIG_USE_VMAF_RC
+  VmafContext *vmaf_context;
+  aom_init_vmaf_context_rc(
+      &vmaf_context, cpi->vmaf_info.vmaf_model,
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN);
+  int vmaf_cal_index = 0;
+#endif
   YV12_BUFFER_CONFIG sharpened;
   memset(&sharpened, 0, sizeof(sharpened));
   aom_alloc_frame_buffer(
@@ -208,32 +375,85 @@
   double unsharp_amount;
   if (unsharp_amount_start <= step_size) {
     unsharp_amount = find_best_frame_unsharp_amount_loop(
-        cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0,
-        step_size, max_loop_count, max_filter_amount);
+        cpi,
+#if CONFIG_USE_VMAF_RC
+        vmaf_context, &vmaf_cal_index,
+#endif
+        source, blurred, &sharpened, 0.0, baseline_variance, 0.0, step_size,
+        max_loop_count, max_filter_amount);
   } else {
     double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start;
     double v0, v1;
     unsharp(cpi, source, blurred, &sharpened, a0);
-    v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+    v0 = cal_approx_vmaf(cpi,
+#if CONFIG_USE_VMAF_RC
+                         vmaf_context, &vmaf_cal_index,
+#endif
+                         baseline_variance, source, &sharpened);
     unsharp(cpi, source, blurred, &sharpened, a1);
-    v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+    v1 = cal_approx_vmaf(cpi,
+#if CONFIG_USE_VMAF_RC
+                         vmaf_context, &vmaf_cal_index,
+#endif
+                         baseline_variance, source, &sharpened);
     if (fabs(v0 - v1) < 0.01) {
       unsharp_amount = a0;
     } else if (v0 > v1) {
       unsharp_amount = find_best_frame_unsharp_amount_loop(
-          cpi, source, blurred, &sharpened, v0, baseline_variance, a0,
-          -step_size, max_loop_count, max_filter_amount);
+          cpi,
+#if CONFIG_USE_VMAF_RC
+          vmaf_context, &vmaf_cal_index,
+#endif
+          source, blurred, &sharpened, v0, baseline_variance, a0, -step_size,
+          max_loop_count, max_filter_amount);
     } else {
       unsharp_amount = find_best_frame_unsharp_amount_loop(
-          cpi, source, blurred, &sharpened, v1, baseline_variance, a1,
-          step_size, max_loop_count, max_filter_amount);
+          cpi,
+#if CONFIG_USE_VMAF_RC
+          vmaf_context, &vmaf_cal_index,
+#endif
+          source, blurred, &sharpened, v1, baseline_variance, a1, step_size,
+          max_loop_count, max_filter_amount);
     }
   }
 
   aom_free_frame_buffer(&sharpened);
+#if CONFIG_USE_VMAF_RC
+  aom_close_vmaf_context_rc(vmaf_context);
+#endif
   return unsharp_amount;
 }
 
+#if CONFIG_USE_VMAF_RC
+void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi,
+                                YV12_BUFFER_CONFIG *const source) {
+  aom_clear_system_state();
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int width = source->y_width;
+  const int height = source->y_height;
+
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1);
+  const double best_frame_unsharp_amount =
+      get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+  if (best_frame_unsharp_amount <= 0.0) return;
+
+  YV12_BUFFER_CONFIG blurred;
+  memset(&blurred, 0, sizeof(blurred));
+  aom_alloc_frame_buffer(
+      &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+
+  gaussian_blur(bit_depth, source, &blurred);
+  unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+  aom_free_frame_buffer(&blurred);
+  aom_clear_system_state();
+}
+#endif
+
 void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
                                   YV12_BUFFER_CONFIG *const source) {
   aom_clear_system_state();
@@ -256,9 +476,17 @@
   gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1);
+  const double last_frame_unsharp_amount =
+      get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
   const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
-      cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01);
-  cpi->last_frame_unsharp_amount = best_frame_unsharp_amount;
+      cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+  cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+      best_frame_unsharp_amount;
 
   unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
   aom_free_frame_buffer(&blurred);
@@ -287,9 +515,17 @@
   gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1);
+  const double last_frame_unsharp_amount =
+      get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
   const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
-      cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01);
-  cpi->last_frame_unsharp_amount = best_frame_unsharp_amount;
+      cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+  cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+      best_frame_unsharp_amount;
 
   const int block_size = BLOCK_64X64;
   const int block_w = mi_size_wide[block_size] * 4;
@@ -319,7 +555,9 @@
       const int block_height = AOMMIN(height - row_offset_y, block_h);
       const int index = col + row * num_cols;
 
-      if (bit_depth > 8) {
+      if (cm->seq_params.use_highbitdepth) {
+        assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+        assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
         uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
                                   row_offset_y * source->y_stride +
                                   col_offset_y;
@@ -386,7 +624,9 @@
       const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
       const int index = col + row * num_cols;
 
-      if (bit_depth > 8) {
+      if (cm->seq_params.use_highbitdepth) {
+        assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+        assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
         uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
                             row_offset_y * source->y_stride + col_offset_y;
         uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
@@ -414,6 +654,7 @@
   aom_clear_system_state();
 }
 
+#if !CONFIG_USE_VMAF_RC
 typedef struct FrameData {
   const YV12_BUFFER_CONFIG *source, *blurred;
   int block_w, block_h, num_rows, num_cols, row, col, bit_depth;
@@ -444,18 +685,18 @@
     float *ref, *main;
     ref = ref_data + i * stride;
     main = main_data + i * stride;
-    if (bit_depth == 8) {
-      uint8_t *src;
-      src = source->y_buffer + i * source->y_stride;
-      for (int j = 0; j < width; ++j) {
-        ref[j] = main[j] = (float)src[j];
-      }
-    } else {
+    if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
       uint16_t *src;
       src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride;
       for (int j = 0; j < width; ++j) {
         ref[j] = main[j] = scale_factor * (float)src[j];
       }
+    } else {
+      uint8_t *src;
+      src = source->y_buffer + i * source->y_stride;
+      for (int j = 0; j < width; ++j) {
+        ref[j] = main[j] = (float)src[j];
+      }
     }
   }
   if (row < num_rows && col < num_cols) {
@@ -466,17 +707,7 @@
     const int block_height = AOMMIN(height - row_offset, block_h);
 
     float *main_buf = main_data + col_offset + row_offset * stride;
-    if (bit_depth == 8) {
-      uint8_t *blurred_buf =
-          blurred->y_buffer + row_offset * blurred->y_stride + col_offset;
-      for (int i = 0; i < block_height; ++i) {
-        for (int j = 0; j < block_width; ++j) {
-          main_buf[j] = (float)blurred_buf[j];
-        }
-        main_buf += stride;
-        blurred_buf += blurred->y_stride;
-      }
-    } else {
+    if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
       uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) +
                               row_offset * blurred->y_stride + col_offset;
       for (int i = 0; i < block_height; ++i) {
@@ -486,6 +717,16 @@
         main_buf += stride;
         blurred_buf += blurred->y_stride;
       }
+    } else {
+      uint8_t *blurred_buf =
+          blurred->y_buffer + row_offset * blurred->y_stride + col_offset;
+      for (int i = 0; i < block_height; ++i) {
+        for (int j = 0; j < block_width; ++j) {
+          main_buf[j] = (float)blurred_buf[j];
+        }
+        main_buf += stride;
+        blurred_buf += blurred->y_stride;
+      }
     }
 
     frames->col++;
@@ -498,6 +739,7 @@
     return 2;
   }
 }
+#endif
 
 void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
@@ -514,8 +756,8 @@
       &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1,
       cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
       cm->features.byte_alignment);
-  av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth,
-                              av1_num_planes(cm));
+  av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source,
+                                           bit_depth, av1_num_planes(cm));
 
   const int resized_y_width = resized_source.y_width;
   const int resized_y_height = resized_source.y_height;
@@ -534,6 +776,20 @@
                          cm->features.byte_alignment);
   gaussian_blur(bit_depth, &resized_source, &blurred);
 
+#if CONFIG_USE_VMAF_RC
+  YV12_BUFFER_CONFIG recon;
+  memset(&recon, 0, sizeof(recon));
+  aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, 1, 1,
+                         cm->seq_params.use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_yv12_copy_frame(&resized_source, &recon, 1);
+
+  VmafContext *vmaf_context;
+  aom_init_vmaf_context_rc(
+      &vmaf_context, cpi->vmaf_info.vmaf_model,
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN);
+#else
   double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols));
   memset(scores, 0, sizeof(*scores) * (num_rows * num_cols));
   FrameData frame_data;
@@ -546,9 +802,10 @@
   frame_data.row = 0;
   frame_data.col = 0;
   frame_data.bit_depth = bit_depth;
-  aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.vmaf_model_path,
+  aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.tune_cfg.vmaf_model_path,
                             update_frame, resized_y_width, resized_y_height,
                             bit_depth, scores);
+#endif
 
   // Loop through each 'block_size' block.
   for (int row = 0; row < num_rows; ++row) {
@@ -563,12 +820,47 @@
       uint8_t *const blurred_buf =
           blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
 
-      const double vmaf = scores[index];
-      const double dvmaf = kBaselineVmaf - vmaf;
       unsigned int sse;
       cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
                                          blurred_buf, blurred.y_stride, &sse);
 
+#if CONFIG_USE_VMAF_RC
+      uint8_t *const recon_buf =
+          recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y;
+      // Set recon buf
+      if (cpi->common.seq_params.use_highbitdepth) {
+        highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+                            CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+                            CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride,
+                            resized_block_w, resized_block_h, 0.0, bit_depth);
+      } else {
+        unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf,
+                     blurred.y_stride, recon_buf, recon.y_stride,
+                     resized_block_w, resized_block_h, 0.0);
+      }
+
+      double vmaf;
+      aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model,
+                                &resized_source, &recon, bit_depth, index,
+                                &vmaf);
+
+      // Restore recon buf
+      if (cpi->common.seq_params.use_highbitdepth) {
+        highbd_unsharp_rect(
+            CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+            CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+            CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w,
+            resized_block_h, 0.0, bit_depth);
+      } else {
+        unsharp_rect(orig_buf, resized_source.y_stride, orig_buf,
+                     resized_source.y_stride, recon_buf, recon.y_stride,
+                     resized_block_w, resized_block_h, 0.0);
+      }
+#else
+      const double vmaf = scores[index];
+#endif
+      const double dvmaf = kBaselineVmaf - vmaf;
+
       const double mse =
           (double)sse / (double)(resized_y_width * resized_y_height);
       double weight;
@@ -581,13 +873,17 @@
 
       // Normalize it with a data fitted model.
       weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8;
-      cpi->vmaf_rdmult_scaling_factors[index] = weight;
+      cpi->vmaf_info.rdmult_scaling_factors[index] = weight;
     }
   }
 
   aom_free_frame_buffer(&resized_source);
   aom_free_frame_buffer(&blurred);
+#if CONFIG_USE_VMAF_RC
+  aom_close_vmaf_context_rc(vmaf_context);
+#else
   aom_free(scores);
+#endif
   aom_clear_system_state();
 }
 
@@ -613,7 +909,7 @@
     for (col = mi_col / num_mi_h;
          col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
       const int index = row * num_cols + col;
-      geom_mean_of_scale += log(cpi->vmaf_rdmult_scaling_factors[index]);
+      geom_mean_of_scale += log(cpi->vmaf_info.rdmult_scaling_factors[index]);
       num_of_mi += 1.0;
     }
   }
@@ -621,7 +917,7 @@
 
   *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
   *rdmult = AOMMAX(*rdmult, 0);
-  set_error_per_bit(x, *rdmult);
+  av1_set_error_per_bit(&x->mv_costs, *rdmult);
   aom_clear_system_state();
 }
 
@@ -662,10 +958,11 @@
   return accum / (double)(h * w);
 }
 
-static AOM_INLINE double calc_vmaf_motion_score(
-    const AV1_COMP *const cpi, const AV1_COMMON *const cm,
-    const YV12_BUFFER_CONFIG *const cur, const YV12_BUFFER_CONFIG *const last,
-    const YV12_BUFFER_CONFIG *const next) {
+static double calc_vmaf_motion_score(const AV1_COMP *const cpi,
+                                     const AV1_COMMON *const cm,
+                                     const YV12_BUFFER_CONFIG *const cur,
+                                     const YV12_BUFFER_CONFIG *const last,
+                                     const YV12_BUFFER_CONFIG *const next) {
   const int y_width = cur->y_width;
   const int y_height = cur->y_height;
   YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next;
@@ -690,7 +987,9 @@
   if (next) gaussian_blur(bit_depth, next, &blurred_next);
 
   double motion1, motion2 = 65536.0;
-  if (bit_depth > 8) {
+  if (cm->seq_params.use_highbitdepth) {
+    assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH);
     const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
     motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
                                  blurred_cur.y_stride,
@@ -698,6 +997,7 @@
                                  blurred_last.y_stride, y_width, y_height) *
               scale_factor;
     if (next) {
+      assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH);
       motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
                                    blurred_cur.y_stride,
                                    CONVERT_TO_SHORTPTR(blurred_next.y_buffer),
@@ -722,6 +1022,21 @@
   return AOMMIN(motion1, motion2);
 }
 
+static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi,
+                                           YV12_BUFFER_CONFIG **last,
+                                           YV12_BUFFER_CONFIG **next) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  const int src_index =
+      cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[gf_group->index];
+  struct lookahead_entry *last_entry =
+      av1_lookahead_peek(cpi->lookahead, src_index - 1, cpi->compressor_stage);
+  struct lookahead_entry *next_entry =
+      av1_lookahead_peek(cpi->lookahead, src_index + 1, cpi->compressor_stage);
+  *next = &next_entry->img;
+  *last = cm->show_frame ? cpi->last_source : &last_entry->img;
+}
+
 // Calculates the new qindex from the VMAF motion score. This is based on the
 // observation: when the motion score becomes higher, the VMAF score of the
 // same source and distorted frames would become higher.
@@ -730,37 +1045,35 @@
   if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) {
     return current_qindex;
   }
+  aom_clear_system_state();
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1);
+  const double last_frame_ysse =
+      get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth);
+  const double last_frame_vmaf =
+      get_layer_value(cpi->vmaf_info.last_frame_vmaf, layer_depth);
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  const double approx_sse =
-      cpi->last_frame_ysse /
-      (double)((1 << (bit_depth - 8)) * (1 << (bit_depth - 8)));
-  const double approx_dvmaf = kBaselineVmaf - cpi->last_frame_vmaf;
+  const double approx_sse = last_frame_ysse / (double)((1 << (bit_depth - 8)) *
+                                                       (1 << (bit_depth - 8)));
+  const double approx_dvmaf = kBaselineVmaf - last_frame_vmaf;
   const double sse_threshold =
       0.01 * cpi->source->y_width * cpi->source->y_height;
   const double vmaf_threshold = 0.01;
   if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) {
     return current_qindex;
   }
-  aom_clear_system_state();
-  const GF_GROUP *gf_group = &cpi->gf_group;
   YV12_BUFFER_CONFIG *cur_buf = cpi->source;
-  int src_index = 0;
   if (cm->show_frame == 0) {
-    src_index = gf_group->arf_src_offset[gf_group->index];
+    const int src_index = gf_group->arf_src_offset[gf_group->index];
     struct lookahead_entry *cur_entry =
         av1_lookahead_peek(cpi->lookahead, src_index, cpi->compressor_stage);
     cur_buf = &cur_entry->img;
   }
   assert(cur_buf);
 
-  const struct lookahead_entry *last_entry =
-      av1_lookahead_peek(cpi->lookahead, src_index - 1, cpi->compressor_stage);
-  const struct lookahead_entry *next_entry =
-      av1_lookahead_peek(cpi->lookahead, src_index + 1, cpi->compressor_stage);
-  const YV12_BUFFER_CONFIG *next_buf = &next_entry->img;
-  const YV12_BUFFER_CONFIG *last_buf =
-      cm->show_frame ? cpi->last_source : &last_entry->img;
-
+  YV12_BUFFER_CONFIG *next_buf, *last_buf;
+  get_neighbor_frames(cpi, &last_buf, &next_buf);
   assert(last_buf);
 
   const double motion =
@@ -781,14 +1094,172 @@
   return qindex;
 }
 
-void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source,
-                           YV12_BUFFER_CONFIG *recon) {
+#if CONFIG_USE_VMAF_RC
+static AOM_INLINE double cal_approx_score(
+    AV1_COMP *const cpi, VmafContext *vmaf_context, int vmaf_cal_index,
+    double src_variance, double new_variance, double src_score,
+    YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon_sharpened) {
+  double score;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+  aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, src,
+                            recon_sharpened, bit_depth, vmaf_cal_index, &score);
+  return src_variance / new_variance * (score - src_score);
+}
+
+static double find_best_frame_unsharp_amount_loop_neg(
+    AV1_COMP *const cpi, VmafContext *vmaf_context, double src_variance,
+    double base_score, YV12_BUFFER_CONFIG *const src,
+    YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref,
+    YV12_BUFFER_CONFIG *const src_blurred,
+    YV12_BUFFER_CONFIG *const recon_blurred,
+    YV12_BUFFER_CONFIG *const src_sharpened,
+    YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs,
+    double best_score, const double unsharp_amount_start,
+    const double step_size, const int max_loop_count, const double max_amount) {
+  const double min_amount = 0.0;
+  int loop_count = 0;
+  double approx_score = best_score;
+  double unsharp_amount = unsharp_amount_start;
+  int vmaf_cal_index = 3;
+
+  do {
+    best_score = approx_score;
+    unsharp_amount += step_size;
+    if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+    unsharp(cpi, recon, recon_blurred, recon_sharpened, unsharp_amount);
+    unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount);
+    const double new_variance =
+        residual_frame_average_variance(cpi, src_sharpened, ref, mvs);
+    approx_score =
+        cal_approx_score(cpi, vmaf_context, vmaf_cal_index++, src_variance,
+                         new_variance, base_score, src, recon_sharpened);
+
+    loop_count++;
+  } while (approx_score > best_score && loop_count < max_loop_count);
+  unsharp_amount =
+      approx_score > best_score ? unsharp_amount : unsharp_amount - step_size;
+
+  return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+static double find_best_frame_unsharp_amount_neg(
+    AV1_COMP *const cpi, VmafContext *vmaf_context,
+    YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon,
+    YV12_BUFFER_CONFIG *const ref, double base_score,
+    const double unsharp_amount_start, const double step_size,
+    const int max_loop_count, const double max_filter_amount) {
+  FULLPEL_MV *mvs = NULL;
+  const double src_variance =
+      residual_frame_average_variance(cpi, src, ref, mvs);
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int width = recon->y_width;
+  const int height = recon->y_height;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, recon, bit_depth,
-                &cpi->last_frame_vmaf);
-  if (bit_depth > 8) {
-    cpi->last_frame_ysse = (double)aom_highbd_get_y_sse(source, recon);
+  YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened;
+  memset(&recon_sharpened, 0, sizeof(recon_sharpened));
+  memset(&src_sharpened, 0, sizeof(src_sharpened));
+  memset(&recon_blurred, 0, sizeof(recon_blurred));
+  memset(&src_blurred, 0, sizeof(src_blurred));
+  aom_alloc_frame_buffer(
+      &recon_sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &src_sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &recon_blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &src_blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+
+  gaussian_blur(bit_depth, recon, &recon_blurred);
+  gaussian_blur(bit_depth, src, &src_blurred);
+
+  unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_start);
+  unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start);
+  const double variance_start =
+      residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+  const double score_start =
+      cal_approx_score(cpi, vmaf_context, 1, src_variance, variance_start,
+                       base_score, src, &recon_sharpened);
+
+  const double unsharp_amount_next = unsharp_amount_start + step_size;
+  unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next);
+  unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next);
+  const double variance_next =
+      residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+  const double score_next =
+      cal_approx_score(cpi, vmaf_context, 2, src_variance, variance_next,
+                       base_score, src, &recon_sharpened);
+
+  double unsharp_amount;
+  if (score_next > score_start) {
+    unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+        cpi, vmaf_context, src_variance, base_score, src, recon, ref,
+        &src_blurred, &recon_blurred, &src_sharpened, &recon_sharpened, mvs,
+        score_next, unsharp_amount_next, step_size, max_loop_count,
+        max_filter_amount);
   } else {
-    cpi->last_frame_ysse = (double)aom_get_y_sse(source, recon);
+    unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+        cpi, vmaf_context, src_variance, base_score, src, recon, ref,
+        &src_blurred, &recon_blurred, &src_sharpened, &recon_sharpened, mvs,
+        score_start, unsharp_amount_start, -step_size, max_loop_count,
+        max_filter_amount);
   }
+
+  aom_free_frame_buffer(&recon_sharpened);
+  aom_free_frame_buffer(&src_sharpened);
+  aom_free_frame_buffer(&recon_blurred);
+  aom_free_frame_buffer(&src_blurred);
+  aom_free(mvs);
+  return unsharp_amount;
+}
+#endif  // CONFIG_USE_VMAF_RC
+
+void av1_update_vmaf_curve(AV1_COMP *cpi) {
+  YV12_BUFFER_CONFIG *source = cpi->source;
+  YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1);
+#if CONFIG_USE_VMAF_RC
+  double base_score;
+  VmafContext *vmaf_context;
+  aom_init_vmaf_context_rc(
+      &vmaf_context, cpi->vmaf_info.vmaf_model,
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN);
+  aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, source,
+                            recon, bit_depth, 0, &base_score);
+  cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score;
+#else
+  aom_calc_vmaf(cpi->oxcf.tune_cfg.vmaf_model_path, source, recon, bit_depth,
+                &cpi->vmaf_info.last_frame_vmaf[layer_depth]);
+#endif  // CONFIG_USE_VMAF_RC
+  if (cpi->common.seq_params.use_highbitdepth) {
+    assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(recon->flags & YV12_FLAG_HIGHBITDEPTH);
+    cpi->vmaf_info.last_frame_ysse[layer_depth] =
+        (double)aom_highbd_get_y_sse(source, recon);
+  } else {
+    cpi->vmaf_info.last_frame_ysse[layer_depth] =
+        (double)aom_get_y_sse(source, recon);
+  }
+
+#if CONFIG_USE_VMAF_RC
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    YV12_BUFFER_CONFIG *last, *next;
+    get_neighbor_frames(cpi, &last, &next);
+    double best_unsharp_amount_start =
+        get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+    const int max_loop_count = 5;
+    cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+        find_best_frame_unsharp_amount_neg(
+            cpi, vmaf_context, source, recon, last, base_score,
+            best_unsharp_amount_start, 0.025, max_loop_count, 1.01);
+  }
+  aom_close_vmaf_context_rc(vmaf_context);
+#endif  // CONFIG_USE_VMAF_RC
 }
diff --git a/av1/encoder/tune_vmaf.h b/av1/encoder/tune_vmaf.h
index c4cf072..01c3068 100644
--- a/av1/encoder/tune_vmaf.h
+++ b/av1/encoder/tune_vmaf.h
@@ -12,13 +12,46 @@
 #ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_
 #define AOM_AV1_ENCODER_TUNE_VMAF_H_
 
+#include "aom_dsp/vmaf.h"
 #include "aom_scale/yv12config.h"
-#include "av1/encoder/encoder.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+typedef struct {
+  // Stores the scaling factors for rdmult when tuning for VMAF.
+  // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for
+  // 64x64 block at (row, col).
+  double *rdmult_scaling_factors;
+
+  // Stores the luma sse of the last frame.
+  double last_frame_ysse[MAX_ARF_LAYERS];
+
+  // Stores the VMAF of the last frame.
+  double last_frame_vmaf[MAX_ARF_LAYERS];
+
+  // Stores the filter strength of the last frame.
+  double last_frame_unsharp_amount[MAX_ARF_LAYERS];
+
+  // Stores the origial qindex before scaling.
+  int original_qindex;
+
+#if CONFIG_USE_VMAF_RC
+  // VMAF model used in VMAF caculations.
+  VmafModel *vmaf_model;
+#endif
+} TuneVMAFInfo;
+
+typedef struct AV1_COMP AV1_COMP;
 
 void av1_vmaf_blk_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
 
 void av1_vmaf_frame_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
 
+#ifdef CONFIG_USE_VMAF_RC
+void av1_vmaf_neg_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
+#endif
+
 void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi);
 
 void av1_set_vmaf_rdmult(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
@@ -26,7 +59,6 @@
 
 int av1_get_vmaf_base_qindex(const AV1_COMP *cpi, int current_qindex);
 
-void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source,
-                           YV12_BUFFER_CONFIG *recon);
+void av1_update_vmaf_curve(AV1_COMP *cpi);
 
 #endif  // AOM_AV1_ENCODER_TUNE_VMAF_H_
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 65b9a24..f88ce3f 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -11,6 +11,7 @@
 
 #include "av1/common/cfl.h"
 #include "av1/common/reconintra.h"
+#include "av1/encoder/block.h"
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/common/idct.h"
@@ -30,7 +31,6 @@
   int64_t best_rd;
   int exit_early;
   int incomplete_exit;
-  int use_fast_coef_costing;
   FAST_TX_SEARCH_MODE ftxs_mode;
   int skip_trellis;
 };
@@ -46,6 +46,11 @@
   int8_t children[4];
 } RD_RECORD_IDX_NODE;
 
+typedef struct tx_size_rd_info_node {
+  TXB_RD_INFO *rd_info_array;  // Points to array of size TX_TYPES.
+  struct tx_size_rd_info_node *children[4];
+} TXB_RD_INFO_NODE;
+
 // origin_threshold * 128 / 100
 static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
   {
@@ -62,7 +67,7 @@
   },
 };
 
-// lookup table for predict_skip_flag
+// lookup table for predict_skip_txfm
 // int max_tx_size = max_txsize_rect_lookup[bsize];
 // if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
 //   max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
@@ -73,6 +78,12 @@
   TX_8X8,   TX_8X8,   TX_16X16, TX_16X16,
 };
 
+// look-up table for sqrt of number of pixels in a transform block
+// rounded up to the nearest integer.
+static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4,  8,  16, 32, 32, 6,  6,
+                                                     12, 12, 23, 23, 32, 32, 8,
+                                                     8,  16, 16, 23, 23 };
+
 static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
                                 const uint32_t hash) {
   // Linear search through the circular buffer to find matching hash.
@@ -251,10 +262,11 @@
 // the form of a quadtree for easier access in actual TX size search.
 static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize,
                                    TXB_RD_INFO_NODE *dst_rd_info) {
-  TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8,
-                                         x->txb_rd_record_16X16,
-                                         x->txb_rd_record_32X32,
-                                         x->txb_rd_record_64X64 };
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  TXB_RD_RECORD *rd_records_table[4] = { txfm_info->txb_rd_record_8X8,
+                                         txfm_info->txb_rd_record_16X16,
+                                         txfm_info->txb_rd_record_32X32,
+                                         txfm_info->txb_rd_record_64X64 };
   const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
@@ -300,9 +312,9 @@
             cur_hash_row += cur_tx_bw;
             cur_diff_row += diff_stride;
           }
-          const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
-                                                (uint8_t *)hash_data,
-                                                2 * cur_tx_bw * cur_tx_bh);
+          const int hash = av1_get_crc32c_value(
+              &txfm_info->mb_rd_record.crc_calculator, (uint8_t *)hash_data,
+              2 * cur_tx_bw * cur_tx_bh);
           // Find corresponding RD info based on the hash value.
           const int record_idx =
               row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
@@ -324,8 +336,9 @@
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
   const int16_t *diff = x->plane[0].src_diff;
-  const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
-                                             (uint8_t *)diff, 2 * rows * cols);
+  const uint32_t hash =
+      av1_get_crc32c_value(&x->txfm_search_info.mb_rd_record.crc_calculator,
+                           (uint8_t *)diff, 2 * rows * cols);
   return (hash << 5) + bsize;
 }
 
@@ -354,7 +367,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   mbmi->tx_size = tx_rd_info->tx_size;
-  memcpy(x->blk_skip, tx_rd_info->blk_skip,
+  memcpy(x->txfm_search_info.blk_skip, tx_rd_info->blk_skip,
          sizeof(tx_rd_info->blk_skip[0]) * n4);
   av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
   av1_copy_array(xd->tx_type_map, tx_rd_info->tx_type_map, n4);
@@ -388,11 +401,44 @@
   return sse;
 }
 
+// Computes the residual block's SSE and mean on all visible 4x4s in the
+// transform block
+static INLINE int64_t pixel_diff_stats(
+    MACROBLOCK *x, int plane, int blk_row, int blk_col,
+    const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
+    unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) {
+  int visible_rows, visible_cols;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+                     NULL, &visible_cols, &visible_rows);
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *diff = x->plane[plane].src_diff;
+
+  diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+  uint64_t sse = 0;
+  int sum = 0;
+  sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
+  if (visible_cols > 0 && visible_rows > 0) {
+    aom_clear_system_state();
+    double norm_factor = 1.0 / (visible_cols * visible_rows);
+    int sign_sum = sum > 0 ? 1 : -1;
+    // Conversion to transform domain
+    *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7;
+    *per_px_mean = sign_sum * (*per_px_mean);
+    *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse));
+    *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum));
+  } else {
+    *block_mse_q8 = UINT_MAX;
+  }
+  return sse;
+}
+
 // Uses simple features on top of DCT coefficients to quickly predict
 // whether optimal RD decision is to skip encoding the residual.
 // The sse value is stored in dist.
-static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
                              int reduced_tx_set) {
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -408,12 +454,12 @@
   // For faster early skip decision, use dist to compare against threshold so
   // that quality risk is less for the skip=1 decision. Otherwise, use mse
   // since the fwd_txfm coeff checks will take care of quality
-  // TODO(any): Use dist to return 0 when predict_skip_level is 1
-  int64_t pred_err = (x->predict_skip_level >= 2) ? *dist : mse;
+  // TODO(any): Use dist to return 0 when skip_txfm_level is 1
+  int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse;
   // Predict not to skip when error is larger than threshold.
   if (pred_err > mse_thresh) return 0;
   // Return as skip otherwise for aggressive early skip
-  else if (x->predict_skip_level >= 2)
+  else if (txfm_params->skip_txfm_level >= 2)
     return 1;
 
   const int max_tx_size = max_predict_sf_tx_size[bsize];
@@ -452,7 +498,7 @@
 }
 
 // Used to set proper context for early termination with skip = 1.
-static AOM_INLINE void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats,
+static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats,
                                      int bsize, int64_t dist) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -461,8 +507,9 @@
   memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4);
   memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
   mbmi->tx_size = tx_size;
-  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
-  rd_stats->skip = 1;
+  for (int i = 0; i < n4; ++i)
+    set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1);
+  rd_stats->skip_txfm = 1;
   if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
   rd_stats->dist = rd_stats->sse = (dist << 4);
   // Though decision is to make the block as skip based on luma stats,
@@ -484,7 +531,7 @@
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   TXB_CTX txb_ctx;
   get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
-  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+  const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
                                 .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
   rd_stats->rate = zero_blk_rate *
                    (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
@@ -510,7 +557,7 @@
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   tx_rd_info->hash_value = hash;
   tx_rd_info->tx_size = mbmi->tx_size;
-  memcpy(tx_rd_info->blk_skip, x->blk_skip,
+  memcpy(tx_rd_info->blk_skip, x->txfm_search_info.blk_skip,
          sizeof(tx_rd_info->blk_skip[0]) * n4);
   av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
   av1_copy_array(tx_rd_info->tx_type_map, xd->tx_type_map, n4);
@@ -850,8 +897,8 @@
   for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
-                                               pd->subsampling_y);
+    const BLOCK_SIZE bs =
+        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     unsigned int sse;
 
     if (x->skip_chroma_rd && plane) continue;
@@ -1056,19 +1103,21 @@
 #endif  // CONFIG_COLLECT_RD_STATS >= 2
 #endif  // CONFIG_COLLECT_RD_STATS
 
-static AOM_INLINE void inverse_transform_block_facade(MACROBLOCKD *xd,
+static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK *const x,
                                                       int plane, int block,
                                                       int blk_row, int blk_col,
                                                       int eob,
                                                       int reduced_tx_set) {
   if (!eob) return;
-
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  struct macroblock_plane *const p = &x->plane[plane];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
   const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
                                           tx_size, reduced_tx_set);
+
+  struct macroblockd_plane *const pd = &xd->plane[plane];
   const int dst_stride = pd->dst.stride;
   uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
   av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
@@ -1099,18 +1148,18 @@
                           ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
                                                     : AV1_XFORM_QUANT_FP)
                           : AV1_XFORM_QUANT_FP,
-                      cpi->oxcf.quant_b_adapt, &quant_param_intra);
+                      cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra);
       av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type,
                         &quant_param_intra);
       av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
                       &txfm_param_intra, &quant_param_intra);
       if (quant_param_intra.use_optimize_b) {
         av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
-                       cpi->sf.rd_sf.trellis_eob_fast, rate_cost);
+                       rate_cost);
       }
     }
 
-    inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+    inverse_transform_block_facade(x, plane, block, blk_row, blk_col,
                                    x->plane[plane].eobs[block],
                                    cm->features.reduced_tx_set_used);
 
@@ -1181,7 +1230,6 @@
                                            TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
   const uint16_t eob = p->eobs[block];
   const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
   const int bsw = block_size_wide[tx_bsize];
@@ -1193,7 +1241,7 @@
   const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2;
   const uint8_t *src = &x->plane[plane].src.buf[src_idx];
   const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
-  const tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  const tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
 
   assert(cpi != NULL);
   assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
@@ -1204,18 +1252,15 @@
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     recon = CONVERT_TO_BYTEPTR(recon16);
-    av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
-                                   CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
-                                   bsh, NULL, NULL, 0, 0, NULL, xd->bd);
+    aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride,
+                             CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh);
   } else {
     recon = (uint8_t *)recon16;
-    av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
-                            NULL, 0, 0, NULL);
+    aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
   }
 #else
   recon = (uint8_t *)recon16;
-  av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
-                          NULL, 0, 0, NULL);
+  aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
 #endif
 
   const PLANE_TYPE plane_type = get_plane_type(plane);
@@ -1248,7 +1293,7 @@
     }
     hash_data = (uint8_t *)tmp_data;
   }
-  CRC32C *crc = &x->mb_rd_record.crc_calculator;
+  CRC32C *crc = &x->txfm_search_info.mb_rd_record.crc_calculator;
   const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
   return (hash << 5) + tx_size;
 }
@@ -1265,17 +1310,19 @@
                                       const int tx_type_map_idx,
                                       uint16_t *cur_joint_ctx) {
   MACROBLOCKD *xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   assert(cpi->sf.tx_sf.use_intra_txb_hash &&
          frame_is_intra_only(&cpi->common) && !is_inter_block(xd->mi[0]) &&
          plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]);
   const uint32_t intra_hash =
       get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
   const int intra_hash_idx =
-      find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
-  *intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
+      find_tx_size_rd_info(&txfm_info->txb_rd_record_intra, intra_hash);
+  *intra_txb_rd_info =
+      &txfm_info->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
   *cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
   if ((*intra_txb_rd_info)->entropy_context == *cur_joint_ctx &&
-      x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
+      txfm_info->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
     xd->tx_type_map[tx_type_map_idx] = (*intra_txb_rd_info)->tx_type;
     const TX_TYPE ref_tx_type =
         av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
@@ -1314,9 +1361,7 @@
 static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
                                         TX_SIZE tx_size, int64_t *out_dist,
                                         int64_t *out_sse) {
-  MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
   // Transform domain distortion computation is more efficient as it does
   // not involve an inverse transform, but it is less accurate.
   const int buffer_length = av1_get_max_eob(tx_size);
@@ -1326,16 +1371,16 @@
   int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
   const int block_offset = BLOCK_OFFSET(block);
   tran_low_t *const coeff = p->coeff + block_offset;
-  tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+  tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 #if CONFIG_AV1_HIGHBITDEPTH
+  MACROBLOCKD *const xd = &x->e_mbd;
   if (is_cur_buf_hbd(xd))
     *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
                                        xd->bd);
   else
-    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
-#else
-  *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
 #endif
+    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+
   *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
   *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
 }
@@ -1374,7 +1419,7 @@
   QUANT_PARAM quant_param;
   TxfmParam txfm_param;
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
-  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
                   &quant_param);
   int tx_type;
   // to ensure we can try ones even outside of ext_tx_set of current block
@@ -1489,7 +1534,7 @@
   TxfmParam txfm_param;
   QUANT_PARAM quant_param;
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
-  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
                   &quant_param);
 
   for (int idx = 0; idx < TX_TYPES; idx++) {
@@ -1622,17 +1667,19 @@
   }
 }
 
-static INLINE float get_adaptive_thresholds(TX_SIZE tx_size,
-                                            TxSetType tx_set_type,
-                                            TX_TYPE_PRUNE_MODE prune_mode) {
-  const int prune_aggr_table[4][2] = { { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 } };
+static INLINE float get_adaptive_thresholds(
+    TX_SIZE tx_size, TxSetType tx_set_type,
+    TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) {
+  const int prune_aggr_table[5][2] = {
+    { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 }, { 12, 9 }
+  };
   int pruning_aggressiveness = 0;
   if (tx_set_type == EXT_TX_SET_ALL16)
     pruning_aggressiveness =
-        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
+        prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][0];
   else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
     pruning_aggressiveness =
-        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
+        prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][1];
 
   return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness];
 }
@@ -1703,7 +1750,7 @@
 
 static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
                         int blk_row, int blk_col, TxSetType tx_set_type,
-                        TX_TYPE_PRUNE_MODE prune_mode, int *txk_map,
+                        TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map,
                         uint16_t *allowed_tx_mask) {
   int tx_type_table_2D[16] = {
     DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
@@ -1764,7 +1811,7 @@
   av1_nn_softmax(scores_2D_raw, scores_2D, 16);
 
   const float score_thresh =
-      get_adaptive_thresholds(tx_size, tx_set_type, prune_mode);
+      get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode);
 
   // Always keep the TX type with the highest score, prune all others with
   // score below score_thresh.
@@ -1798,7 +1845,7 @@
 
   // Enable more pruning based on tx type probability and number of allowed tx
   // types
-  if (prune_mode == PRUNE_2D_AGGRESSIVE) {
+  if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) {
     float temp_score = 0.0;
     float score_ratio = 0.0;
     int tx_idx, tx_count = 0;
@@ -1914,16 +1961,17 @@
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const int is_inter = is_inter_block(mbmi);
   const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
   // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed <
   // TX_TYPES, only that specific tx type is allowed.
   TX_TYPE txk_allowed = TX_TYPES;
 
-  if ((!is_inter && x->use_default_intra_tx_type) ||
-      (is_inter && x->use_default_inter_tx_type)) {
+  if ((!is_inter && txfm_params->use_default_intra_tx_type) ||
+      (is_inter && txfm_params->use_default_inter_tx_type)) {
     txk_allowed =
-        get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type);
+        get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools);
   } else if (x->rd_model == LOW_TXFM_RD) {
     if (plane == 0) txk_allowed = DCT_DCT;
   }
@@ -1949,12 +1997,13 @@
           : av1_ext_tx_used_flag[tx_set_type];
   if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
       ext_tx_used_flag == 0x0001 ||
-      (is_inter && cpi->oxcf.use_inter_dct_only) ||
-      (!is_inter && cpi->oxcf.use_intra_dct_only)) {
+      (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) ||
+      (!is_inter && cpi->oxcf.txfm_cfg.use_intra_dct_only)) {
     txk_allowed = DCT_DCT;
   }
 
-  if (cpi->oxcf.enable_flip_idtx == 0) ext_tx_used_flag &= DCT_ADST_TX_MASK;
+  if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0)
+    ext_tx_used_flag &= DCT_ADST_TX_MASK;
 
   uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
   if (txk_allowed < TX_TYPES) {
@@ -1997,8 +2046,8 @@
     assert(num_allowed > 0);
 
     if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) {
-      int pf = prune_factors[x->prune_mode];
-      int mf = mul_factors[x->prune_mode];
+      int pf = prune_factors[txfm_params->prune_2d_txfm_mode];
+      int mf = mul_factors[txfm_params->prune_2d_txfm_mode];
       if (num_allowed <= 7) {
         const uint16_t prune =
             prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col,
@@ -2016,12 +2065,13 @@
       }
     } else {
       assert(num_allowed > 0);
-      int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5;
+      int allowed_tx_count =
+          (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) ? 1 : 5;
       // !fast_tx_search && txk_end != txk_start && plane == 0
-      if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter &&
+      if (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_1 && is_inter &&
           num_allowed > allowed_tx_count) {
         prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
-                    x->prune_mode, txk_map, &allowed_tx_mask);
+                    txfm_params->prune_2d_txfm_mode, txk_map, &allowed_tx_mask);
       }
     }
   }
@@ -2064,13 +2114,11 @@
 static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block,
                               TX_SIZE tx_size, const TX_TYPE tx_type,
                               const TXB_CTX *const txb_ctx,
-                              int use_fast_coef_costing,
                               int reduced_tx_set_used) {
 #if TXCOEFF_COST_TIMER
   struct aom_usec_timer timer;
   aom_usec_timer_start(&timer);
 #endif
-  (void)use_fast_coef_costing;
   const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type,
                                        txb_ctx, reduced_tx_set_used);
 #if TXCOEFF_COST_TIMER
@@ -2083,19 +2131,111 @@
   return cost;
 }
 
+static int skip_trellis_opt_based_on_satd(MACROBLOCK *x,
+                                          QUANT_PARAM *quant_param, int plane,
+                                          int block, TX_SIZE tx_size,
+                                          int quant_b_adapt, int qstep,
+                                          unsigned int coeff_opt_satd_threshold,
+                                          int skip_trellis, int dc_only_blk) {
+  if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX))
+    return skip_trellis;
+
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff_ptr = p->coeff + block_offset;
+  const int n_coeffs = av1_get_max_eob(tx_size);
+  const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
+  int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs);
+  satd = RIGHT_SIGNED_SHIFT(satd, shift);
+  satd >>= (x->e_mbd.bd - 8);
+
+  const int skip_block_trellis =
+      ((uint64_t)satd >
+       (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]);
+
+  av1_setup_quant(
+      tx_size, !skip_block_trellis,
+      skip_block_trellis
+          ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP)
+          : AV1_XFORM_QUANT_FP,
+      quant_b_adapt, quant_param);
+
+  return skip_block_trellis;
+}
+
+// Predict DC only blocks if the residual variance is below a qstep based
+// threshold.For such blocks, transform type search is bypassed.
+static INLINE void predict_dc_only_block(
+    MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+    int block, int blk_row, int blk_col, RD_STATS *best_rd_stats,
+    int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean,
+    int *dc_only_blk) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+  uint64_t block_var = UINT64_MAX;
+  const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3;
+  *block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize,
+                                txsize_to_bsize[tx_size], block_mse_q8,
+                                per_px_mean, &block_var);
+  assert((*block_mse_q8) != UINT_MAX);
+  uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep);
+  if (is_cur_buf_hbd(xd))
+    block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
+  // Early prediction of skip block if residual mean and variance are less
+  // than qstep based threshold
+  if (((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) &&
+      (block_var < var_threshold)) {
+    // If the normalized mean of residual block is less than the dc qstep and
+    // the  normalized block variance is less than ac qstep, then the block is
+    // assumed to be a skip block and its rdcost is updated accordingly.
+    best_rd_stats->skip_txfm = 1;
+
+    x->plane[plane].eobs[block] = 0;
+
+    if (is_cur_buf_hbd(xd))
+      *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2);
+
+    best_rd_stats->dist = (*block_sse) << 4;
+    best_rd_stats->sse = best_rd_stats->dist;
+
+    ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+    av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl);
+    ENTROPY_CONTEXT *ta = ctxa;
+    ENTROPY_CONTEXT *tl = ctxl;
+    const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+    TXB_CTX txb_ctx_tmp;
+    const PLANE_TYPE plane_type = get_plane_type(plane);
+    get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp);
+    const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type]
+                                  .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1];
+    best_rd_stats->rate = zero_blk_rate;
+
+    best_rd_stats->rdcost =
+        RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse);
+
+    x->plane[plane].txb_entropy_ctx[block] = 0;
+  } else if (block_var < var_threshold) {
+    // Predict DC only blocks based on residual variance.
+    // For chroma plane, this early prediction is disabled for intra blocks.
+    if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1;
+  }
+}
+
 // Search for the best transform type for a given transform block.
 // This function can be used for both inter and intra, both luma and chroma.
 static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                            int block, int blk_row, int blk_col,
                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                            const TXB_CTX *const txb_ctx,
-                           FAST_TX_SEARCH_MODE ftxs_mode,
-                           int use_fast_coef_costing, int skip_trellis,
+                           FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis,
                            int64_t ref_best_rd, RD_STATS *best_rd_stats) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
   MB_MODE_INFO *mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   int64_t best_rd = INT64_MAX;
   uint16_t best_eob = 0;
   TX_TYPE best_tx_type = DCT_DCT;
@@ -2103,7 +2243,8 @@
   // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
   // of the best tx_type
   DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
-  tran_low_t *orig_dqcoeff = pd->dqcoeff;
+  struct macroblock_plane *const p = &x->plane[plane];
+  tran_low_t *orig_dqcoeff = p->dqcoeff;
   tran_low_t *best_dqcoeff = this_dqcoeff;
   const int tx_type_map_idx =
       plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
@@ -2136,7 +2277,7 @@
       best_rd_stats->rate = intra_txb_rd_info->rate;
       best_rd_stats->dist = intra_txb_rd_info->dist;
       best_rd_stats->sse = intra_txb_rd_info->sse;
-      best_rd_stats->skip = intra_txb_rd_info->eob == 0;
+      best_rd_stats->skip_txfm = intra_txb_rd_info->eob == 0;
       x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
       x->plane[plane].txb_entropy_ctx[block] =
           intra_txb_rd_info->txb_entropy_ctx;
@@ -2146,7 +2287,7 @@
       update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
       recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                   txb_ctx, skip_trellis, best_tx_type, 1, &rate_cost, best_eob);
-      pd->dqcoeff = orig_dqcoeff;
+      p->dqcoeff = orig_dqcoeff;
       return;
     }
   }
@@ -2158,22 +2299,45 @@
   int txk_map[TX_TYPES] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   };
-  // Bit mask to indicate which transform types are allowed in the RD search.
-  const uint16_t allowed_tx_mask =
-      get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  txb_ctx, ftxs_mode, ref_best_rd, &txk_allowed, txk_map);
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
 
+  const uint8_t txw = tx_size_wide[tx_size];
+  const uint8_t txh = tx_size_high[tx_size];
+  int64_t block_sse;
   unsigned int block_mse_q8;
-  int64_t block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
-                                      txsize_to_bsize[tx_size], &block_mse_q8);
-  assert(block_mse_q8 != UINT_MAX);
+  int dc_only_blk = 0;
+  const bool predict_dc_block =
+      txfm_params->predict_dc_level && txw != 64 && txh != 64;
+  int64_t per_px_mean = INT64_MAX;
+  if (predict_dc_block) {
+    predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row,
+                          blk_col, best_rd_stats, &block_sse, &block_mse_q8,
+                          &per_px_mean, &dc_only_blk);
+    if (best_rd_stats->skip_txfm == 1) return;
+  } else {
+    block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+                                txsize_to_bsize[tx_size], &block_mse_q8);
+    assert(block_mse_q8 != UINT_MAX);
+  }
+
+  // Bit mask to indicate which transform types are allowed in the RD search.
+  uint16_t tx_mask;
+
+  // Use DCT_DCT transform for DC only block.
+  if (dc_only_blk)
+    tx_mask = 1 << DCT_DCT;
+  else
+    tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+                          tx_size, txb_ctx, ftxs_mode, ref_best_rd,
+                          &txk_allowed, txk_map);
+  const uint16_t allowed_tx_mask = tx_mask;
+
   if (is_cur_buf_hbd(xd)) {
     block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
     block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
   }
   block_sse *= 16;
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
   // Use mse / qstep^2 based threshold logic to take decision of R-D
   // optimization of coeffs. For smaller residuals, coeff optimization
   // would be helpful. For larger residuals, R-D optimization may not be
@@ -2181,7 +2345,7 @@
   // TODO(any): Experiment with variance and mean based thresholds
   const int perform_block_coeff_opt =
       ((uint64_t)block_mse_q8 <=
-       (uint64_t)x->coeff_opt_dist_threshold * qstep * qstep);
+       (uint64_t)txfm_params->coeff_opt_dist_threshold * qstep * qstep);
   skip_trellis |= !perform_block_coeff_opt;
 
   // Flag to indicate if distortion should be calculated in transform domain or
@@ -2189,17 +2353,19 @@
   // Transform domain distortion is accurate for higher residuals.
   // TODO(any): Experiment with variance and mean based thresholds
   int use_transform_domain_distortion =
-      (x->use_transform_domain_distortion > 0) &&
-      (block_mse_q8 >= x->tx_domain_dist_threshold) &&
+      (txfm_params->use_transform_domain_distortion > 0) &&
+      (block_mse_q8 >= txfm_params->tx_domain_dist_threshold) &&
       // Any 64-pt transforms only preserves half the coefficients.
       // Therefore transform domain distortion is not valid for these
       // transform sizes.
-      txsize_sqr_up_map[tx_size] != TX_64X64;
+      (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+      // Use pixel domain distortion for DC only blocks
+      !dc_only_blk;
   // Flag to indicate if an extra calculation of distortion in the pixel domain
   // should be performed at the end, after the best transform type has been
   // decided.
   int calc_pixel_domain_distortion_final =
-      x->use_transform_domain_distortion == 1 &&
+      txfm_params->use_transform_domain_distortion == 1 &&
       use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD;
   if (calc_pixel_domain_distortion_final &&
       (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
@@ -2209,12 +2375,13 @@
 
   TxfmParam txfm_param;
   QUANT_PARAM quant_param;
+  int skip_trellis_based_on_satd[TX_TYPES] = { 0 };
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
   av1_setup_quant(tx_size, !skip_trellis,
                   skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
                                                          : AV1_XFORM_QUANT_FP)
                                : AV1_XFORM_QUANT_FP,
-                  cpi->oxcf.quant_b_adapt, &quant_param);
+                  cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
 
   // Iterate through all transform type candidates.
   for (int idx = 0; idx < TX_TYPES; ++idx) {
@@ -2229,28 +2396,25 @@
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
 
-    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
-                    &quant_param);
+    if (!dc_only_blk)
+      av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+    else
+      av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean);
+
+    skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd(
+        x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt,
+        qstep, txfm_params->coeff_opt_satd_threshold, skip_trellis,
+        dc_only_blk);
+
+    av1_quant(x, plane, block, &txfm_param, &quant_param);
 
     // Calculate rate cost of quantized coefficients.
     if (quant_param.use_optimize_b) {
-      if (cpi->sf.rd_sf.optimize_b_precheck && best_rd < INT64_MAX &&
-          eobs_ptr[block] >= 4) {
-        // Calculate distortion quickly in transform domain.
-        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
-                             &this_rd_stats.sse);
-
-        const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
-        const int64_t dist_cost_estimate =
-            RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
-        if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
-      }
       av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
-                     cpi->sf.rd_sf.trellis_eob_fast, &rate_cost);
+                     &rate_cost);
     } else {
-      rate_cost =
-          cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
-                      use_fast_coef_costing, cm->features.reduced_tx_set_used);
+      rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
+                              cm->features.reduced_tx_set_used);
     }
 
     // If rd cost based on coeff rate alone is already more than best_rd,
@@ -2261,6 +2425,10 @@
     if (eobs_ptr[block] == 0) {
       // When eob is 0, pixel domain distortion is more efficient and accurate.
       this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (dc_only_blk) {
+      this_rd_stats.sse = block_sse;
+      this_rd_stats.dist = dist_block_px_domain(
+          cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
     } else if (use_transform_domain_distortion) {
       dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
                            &this_rd_stats.sse);
@@ -2268,9 +2436,9 @@
       int64_t sse_diff = INT64_MAX;
       // high_energy threshold assumes that every pixel within a txfm block
       // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
-      // for 8 bit, then the threshold is scaled based on input bit depth.
+      // for 8 bit.
       const int64_t high_energy_thresh =
-          ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2);
+          ((int64_t)128 * 128 * tx_size_2d[tx_size]);
       const int is_high_energy = (block_sse >= high_energy_thresh);
       if (tx_size == TX_64X64 || is_high_energy) {
         // Because 3 out 4 quadrants of transform coefficients are forced to
@@ -2314,8 +2482,8 @@
       best_eob = x->plane[plane].eobs[block];
       // Swap dqcoeff buffers
       tran_low_t *const tmp_dqcoeff = best_dqcoeff;
-      best_dqcoeff = pd->dqcoeff;
-      pd->dqcoeff = tmp_dqcoeff;
+      best_dqcoeff = p->dqcoeff;
+      p->dqcoeff = tmp_dqcoeff;
     }
 
 #if CONFIG_COLLECT_RD_STATS == 1
@@ -2376,15 +2544,16 @@
 
   assert(best_rd != INT64_MAX);
 
-  best_rd_stats->skip = best_eob == 0;
+  best_rd_stats->skip_txfm = best_eob == 0;
   if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
   x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
   x->plane[plane].eobs[block] = best_eob;
+  skip_trellis = skip_trellis_based_on_satd[best_tx_type];
 
   // Point dqcoeff to the quantized coefficients corresponding to the best
   // transform type, then we can skip transform and quantization, e.g. in the
   // final pixel domain distortion calculation and recon_intra().
-  pd->dqcoeff = best_dqcoeff;
+  p->dqcoeff = best_dqcoeff;
 
   if (calc_pixel_domain_distortion_final && best_eob) {
     best_rd_stats->dist = dist_block_px_domain(
@@ -2408,7 +2577,7 @@
   // can use them for prediction.
   recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
               txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
-  pd->dqcoeff = orig_dqcoeff;
+  p->dqcoeff = orig_dqcoeff;
 }
 
 // Pick transform type for a luma transform block of tx_size. Note this function
@@ -2438,7 +2607,7 @@
       rd_stats->rate += rd_info_array->rate;
       rd_stats->dist += rd_info_array->dist;
       rd_stats->sse += rd_info_array->sse;
-      rd_stats->skip &= rd_info_array->eob == 0;
+      rd_stats->skip_txfm &= rd_info_array->eob == 0;
       p->eobs[block] = rd_info_array->eob;
       p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
       return;
@@ -2448,8 +2617,7 @@
   RD_STATS this_rd_stats;
   const int skip_trellis = 0;
   search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
-                 txb_ctx, ftxs_mode, 0, skip_trellis, ref_rdcost,
-                 &this_rd_stats);
+                 txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats);
 
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
 
@@ -2482,7 +2650,7 @@
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
-  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+  const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
                                 .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
   rd_stats->zero_rate = zero_blk_rate;
   const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
@@ -2492,11 +2660,12 @@
              rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
   assert(rd_stats->rate < INT_MAX);
 
-  const int pick_skip = !xd->lossless[mbmi->segment_id] &&
-                        (rd_stats->skip == 1 ||
-                         RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
-                             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse));
-  if (pick_skip) {
+  const int pick_skip_txfm =
+      !xd->lossless[mbmi->segment_id] &&
+      (rd_stats->skip_txfm == 1 ||
+       RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+           RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse));
+  if (pick_skip_txfm) {
 #if CONFIG_RD_DEBUG
     update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
                           zero_blk_rate - rd_stats->rate);
@@ -2506,11 +2675,12 @@
     p->eobs[block] = 0;
     update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
   }
-  rd_stats->skip = pick_skip;
-  set_blk_skip(x, 0, blk_row * bw + blk_col, pick_skip);
+  rd_stats->skip_txfm = pick_skip_txfm;
+  set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col,
+               pick_skip_txfm);
 
   if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-    rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0];
+    rd_stats->rate += x->mode_costs.txfm_partition_cost[txfm_partition_ctx][0];
 
   no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
   no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
@@ -2539,7 +2709,8 @@
   const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width);
   assert(nblks > 0);
   av1_init_rd_stats(split_rd_stats);
-  split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
+  split_rd_stats->rate =
+      x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1];
 
   for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) {
     for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) {
@@ -2571,6 +2742,98 @@
   }
 }
 
+static float get_var(float mean, double x2_sum, int num) {
+  const float e_x2 = (float)(x2_sum / num);
+  const float diff = e_x2 - mean * mean;
+  return diff;
+}
+
+static AOM_INLINE void get_blk_var_dev(const int16_t *data, int stride, int bw,
+                                       int bh, float *dev_of_mean,
+                                       float *var_of_vars) {
+  const int16_t *const data_ptr = &data[0];
+  const int subh = (bh >= bw) ? (bh >> 1) : bh;
+  const int subw = (bw >= bh) ? (bw >> 1) : bw;
+  const int num = bw * bh;
+  const int sub_num = subw * subh;
+  int total_x_sum = 0;
+  int64_t total_x2_sum = 0;
+  int blk_idx = 0;
+  float var_sum = 0.0f;
+  float mean_sum = 0.0f;
+  double var2_sum = 0.0f;
+  double mean2_sum = 0.0f;
+
+  for (int row = 0; row < bh; row += subh) {
+    for (int col = 0; col < bw; col += subw) {
+      int x_sum;
+      int64_t x2_sum;
+      aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+                          &x_sum, &x2_sum);
+      total_x_sum += x_sum;
+      total_x2_sum += x2_sum;
+
+      aom_clear_system_state();
+      const float mean = (float)x_sum / sub_num;
+      const float var = get_var(mean, (double)x2_sum, sub_num);
+      mean_sum += mean;
+      mean2_sum += (double)(mean * mean);
+      var_sum += var;
+      var2_sum += var * var;
+      blk_idx++;
+    }
+  }
+
+  const float lvl0_mean = (float)total_x_sum / num;
+  const float block_var = get_var(lvl0_mean, (double)total_x2_sum, num);
+  mean_sum += lvl0_mean;
+  mean2_sum += (double)(lvl0_mean * lvl0_mean);
+  var_sum += block_var;
+  var2_sum += block_var * block_var;
+  const float av_mean = mean_sum / 5;
+
+  if (blk_idx > 1) {
+    // Deviation of means.
+    *dev_of_mean = get_dev(av_mean, mean2_sum, (blk_idx + 1));
+    // Variance of variances.
+    const float mean_var = var_sum / (blk_idx + 1);
+    *var_of_vars = get_var(mean_var, var2_sum, (blk_idx + 1));
+  }
+}
+
+static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize,
+                                    int blk_row, int blk_col, TX_SIZE tx_size,
+                                    int *try_no_split, int *try_split,
+                                    int pruning_level) {
+  const int diff_stride = block_size_wide[bsize];
+  const int16_t *diff =
+      x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  aom_clear_system_state();
+  float dev_of_means = 0.0f;
+  float var_of_vars = 0.0f;
+
+  // This function calculates the deviation of means, and the variance of pixel
+  // variances of the block as well as it's sub-blocks.
+  get_blk_var_dev(diff, diff_stride, bw, bh, &dev_of_means, &var_of_vars);
+  const int dc_q = x->plane[0].dequant_QTX[0] >> 3;
+  const int ac_q = x->plane[0].dequant_QTX[1] >> 3;
+  const int no_split_thresh_scales[4] = { 0, 24, 8, 8 };
+  const int no_split_thresh_scale = no_split_thresh_scales[pruning_level];
+  const int split_thresh_scales[4] = { 0, 24, 10, 8 };
+  const int split_thresh_scale = split_thresh_scales[pruning_level];
+
+  if ((dev_of_means <= dc_q) &&
+      (split_thresh_scale * var_of_vars <= ac_q * ac_q)) {
+    *try_split = 0;
+  }
+  if ((dev_of_means > no_split_thresh_scale * dc_q) &&
+      (var_of_vars > no_split_thresh_scale * ac_q * ac_q)) {
+    *try_no_split = 0;
+  }
+}
+
 // Search for the best transform partition(recursive)/type for a given
 // inter-predicted luma block. The obtained transform selection will be saved
 // in xd->mi[0], the corresponding RD stats will be saved in rd_stats.
@@ -2593,14 +2856,24 @@
          blk_col < max_block_wide(xd, plane_bsize, 0));
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
-                                         mbmi->sb_type, tx_size);
+                                         mbmi->bsize, tx_size);
   struct macroblock_plane *const p = &x->plane[0];
 
-  const int try_no_split =
-      cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64;
+  int try_no_split = (cpi->oxcf.txfm_cfg.enable_tx64 ||
+                      txsize_sqr_up_map[tx_size] != TX_64X64) &&
+                     (cpi->oxcf.txfm_cfg.enable_rect_tx ||
+                      tx_size_wide[tx_size] == tx_size_high[tx_size]);
   int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
   TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
 
+  // Prune tx_split and no-split based on sub-block properties.
+  if (tx_size != TX_4X4 && try_split == 1 && try_no_split == 1 &&
+      cpi->sf.tx_sf.prune_tx_size_level > 0) {
+    prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size,
+                            &try_no_split, &try_split,
+                            cpi->sf.tx_sf.prune_tx_size_level);
+  }
+
   // Try using current block as a single transform block without split.
   if (try_no_split) {
     try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
@@ -2661,7 +2934,8 @@
     mbmi->tx_size = tx_size;
     update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type);
     const int bw = mi_size_wide[plane_bsize];
-    set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
+    set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col,
+                 rd_stats->skip_txfm);
   } else {
     *rd_stats = split_rd_stats;
     if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0;
@@ -2674,10 +2948,11 @@
                                               BLOCK_SIZE bs) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  mbmi->tx_size = tx_size_from_tx_mode(bs, x->tx_mode_search_type);
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  mbmi->tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
 
   // If tx64 is not enabled, we need to go down to the next available size
-  if (!cpi->oxcf.enable_tx64) {
+  if (!cpi->oxcf.txfm_cfg.enable_tx64 && cpi->oxcf.txfm_cfg.enable_rect_tx) {
     static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = {
       TX_4X4,    // 4x4 transform
       TX_8X8,    // 8x8 transform
@@ -2699,22 +2974,69 @@
       TX_16X32,  // 16x64 transform
       TX_32X16,  // 64x16 transform
     };
-
     mbmi->tx_size = tx_size_max_32[mbmi->tx_size];
+  } else if (cpi->oxcf.txfm_cfg.enable_tx64 &&
+             !cpi->oxcf.txfm_cfg.enable_rect_tx) {
+    static const TX_SIZE tx_size_max_square[TX_SIZES_ALL] = {
+      TX_4X4,    // 4x4 transform
+      TX_8X8,    // 8x8 transform
+      TX_16X16,  // 16x16 transform
+      TX_32X32,  // 32x32 transform
+      TX_64X64,  // 64x64 transform
+      TX_4X4,    // 4x8 transform
+      TX_4X4,    // 8x4 transform
+      TX_8X8,    // 8x16 transform
+      TX_8X8,    // 16x8 transform
+      TX_16X16,  // 16x32 transform
+      TX_16X16,  // 32x16 transform
+      TX_32X32,  // 32x64 transform
+      TX_32X32,  // 64x32 transform
+      TX_4X4,    // 4x16 transform
+      TX_4X4,    // 16x4 transform
+      TX_8X8,    // 8x32 transform
+      TX_8X8,    // 32x8 transform
+      TX_16X16,  // 16x64 transform
+      TX_16X16,  // 64x16 transform
+    };
+    mbmi->tx_size = tx_size_max_square[mbmi->tx_size];
+  } else if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
+             !cpi->oxcf.txfm_cfg.enable_rect_tx) {
+    static const TX_SIZE tx_size_max_32_square[TX_SIZES_ALL] = {
+      TX_4X4,    // 4x4 transform
+      TX_8X8,    // 8x8 transform
+      TX_16X16,  // 16x16 transform
+      TX_32X32,  // 32x32 transform
+      TX_32X32,  // 64x64 transform
+      TX_4X4,    // 4x8 transform
+      TX_4X4,    // 8x4 transform
+      TX_8X8,    // 8x16 transform
+      TX_8X8,    // 16x8 transform
+      TX_16X16,  // 16x32 transform
+      TX_16X16,  // 32x16 transform
+      TX_32X32,  // 32x64 transform
+      TX_32X32,  // 64x32 transform
+      TX_4X4,    // 4x16 transform
+      TX_4X4,    // 16x4 transform
+      TX_8X8,    // 8x32 transform
+      TX_8X8,    // 32x8 transform
+      TX_16X16,  // 16x64 transform
+      TX_16X16,  // 64x16 transform
+    };
+
+    mbmi->tx_size = tx_size_max_32_square[mbmi->tx_size];
   }
 
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
-  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
   // Skip RDcost is used only for Inter blocks
-  const int64_t skip_rd =
-      is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX;
-  const int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_rate, 0);
+  const int64_t skip_txfm_rd =
+      is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
+  const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_rate, 0);
   const int skip_trellis = 0;
   av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
-                       AOMMIN(no_skip_rd, skip_rd), AOM_PLANE_Y, bs,
-                       mbmi->tx_size, cpi->sf.rd_sf.use_fast_coef_costing,
-                       FTXS_NONE, skip_trellis);
+                       AOMMIN(no_skip_txfm_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+                       mbmi->tx_size, FTXS_NONE, skip_trellis);
 }
 
 static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi,
@@ -2729,8 +3051,7 @@
   // TODO(any) : Pass this_rd based on skip/non-skip cost
   const int skip_trellis = 0;
   av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
-                       cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE,
-                       skip_trellis);
+                       FTXS_NONE, skip_trellis);
 }
 
 // Search for the best uniform transform size and type for current coding block.
@@ -2743,8 +3064,9 @@
 
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
-  const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT;
+  const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT;
   int start_tx;
   // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls
   // how many times of splitting is allowed during the RD search.
@@ -2754,10 +3076,10 @@
     start_tx = max_rect_tx_size;
     init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
                                        is_inter_block(mbmi), &cpi->sf,
-                                       x->tx_size_search_method);
+                                       txfm_params->tx_size_search_method);
   } else {
     const TX_SIZE chosen_tx_size =
-        tx_size_from_tx_mode(bs, x->tx_mode_search_type);
+        tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
     start_tx = chosen_tx_size;
     init_depth = MAX_TX_DEPTH;
   }
@@ -2770,9 +3092,13 @@
   const int num_blks = bsize_to_num_blk(bs);
   x->rd_model = FULL_TXFM_RD;
   int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH;
        depth++, tx_size = sub_tx_size_map[tx_size]) {
-    if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
+    if ((!cpi->oxcf.txfm_cfg.enable_tx64 &&
+         txsize_sqr_up_map[tx_size] == TX_64X64) ||
+        (!cpi->oxcf.txfm_cfg.enable_rect_tx &&
+         tx_size_wide[tx_size] != tx_size_high[tx_size])) {
       continue;
     }
 
@@ -2780,7 +3106,7 @@
     rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs,
                                      tx_size, FTXS_NONE, skip_trellis);
     if (rd[depth] < best_rd) {
-      av1_copy_array(best_blk_skip, x->blk_skip, num_blks);
+      av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks);
       av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks);
       best_tx_size = tx_size;
       best_rd = rd[depth];
@@ -2798,7 +3124,7 @@
   if (rd_stats->rate != INT_MAX) {
     mbmi->tx_size = best_tx_size;
     av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks);
-    av1_copy_array(x->blk_skip, best_blk_skip, num_blks);
+    av1_copy_array(txfm_info->blk_skip, best_blk_skip, num_blks);
   }
 }
 
@@ -2831,9 +3157,8 @@
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                 &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
-                 args->skip_trellis, args->best_rd - args->current_rd,
-                 &this_rd_stats);
+                 &txb_ctx, args->ftxs_mode, args->skip_trellis,
+                 args->best_rd - args->current_rd, &this_rd_stats);
 
   if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
     assert(!is_inter || plane_bsize < BLOCK_8X8);
@@ -2848,22 +3173,25 @@
 
   const int blk_idx =
       blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col;
+
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   if (plane == 0)
-    set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0);
+    set_blk_skip(txfm_info->blk_skip, plane, blk_idx,
+                 x->plane[plane].eobs[block] == 0);
   else
-    set_blk_skip(x, plane, blk_idx, 0);
+    set_blk_skip(txfm_info->blk_skip, plane, blk_idx, 0);
 
   int64_t rd;
   if (is_inter) {
-    const int64_t no_skip_rd =
+    const int64_t no_skip_txfm_rd =
         RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-    const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
-    rd = AOMMIN(no_skip_rd, skip_rd);
-    this_rd_stats.skip &= !x->plane[plane].eobs[block];
+    const int64_t skip_txfm_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+    rd = AOMMIN(no_skip_txfm_rd, skip_txfm_rd);
+    this_rd_stats.skip_txfm &= !x->plane[plane].eobs[block];
   } else {
-    // Signal non-skip for Intra blocks
+    // Signal non-skip_txfm for Intra blocks
     rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-    this_rd_stats.skip = 0;
+    this_rd_stats.skip_txfm = 0;
   }
 
   av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
@@ -2872,8 +3200,6 @@
   if (args->current_rd > args->best_rd) args->exit_early = 1;
 }
 
-// Search for the best transform type and return the transform coefficients RD
-// cost of current luma coding block with the given uniform transform size.
 int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                              RD_STATS *rd_stats, int64_t ref_best_rd,
                              BLOCK_SIZE bs, TX_SIZE tx_size,
@@ -2881,29 +3207,30 @@
   assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const ModeCosts *mode_costs = &x->mode_costs;
   const int is_inter = is_inter_block(mbmi);
-  const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT &&
-                        block_signals_txsize(mbmi->sb_type);
+  const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+                        block_signals_txsize(mbmi->bsize);
   int tx_size_rate = 0;
   if (tx_select) {
     const int ctx = txfm_partition_context(
-        xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
-    tx_size_rate = is_inter ? x->txfm_partition_cost[ctx][0]
+        xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+    tx_size_rate = is_inter ? mode_costs->txfm_partition_cost[ctx][0]
                             : tx_size_cost(x, bs, tx_size);
   }
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
-  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
-  const int64_t skip_rd =
-      is_inter ? RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd =
+      is_inter ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
   const int64_t no_this_rd =
-      RDCOST(x->rdmult, no_skip_flag_rate + tx_size_rate, 0);
+      RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
 
   mbmi->tx_size = tx_size;
   av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
-                       AOMMIN(no_this_rd, skip_rd), AOM_PLANE_Y, bs, tx_size,
-                       cpi->sf.rd_sf.use_fast_coef_costing, ftxs_mode,
-                       skip_trellis);
+                       AOMMIN(no_this_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+                       tx_size, ftxs_mode, skip_trellis);
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
 
   int64_t rd;
@@ -2911,22 +3238,23 @@
   // same is accounted in the caller functions after rd evaluation of all
   // planes. However the decisions should be done after considering the
   // skip/non-skip header cost
-  if (rd_stats->skip && is_inter) {
-    rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
+  if (rd_stats->skip_txfm && is_inter) {
+    rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
   } else {
     // Intra blocks are always signalled as non-skip
-    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate + tx_size_rate,
+    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
                 rd_stats->dist);
     rd_stats->rate += tx_size_rate;
   }
   // Check if forcing the block to skip transform leads to smaller RD cost.
-  if (is_inter && !rd_stats->skip && !xd->lossless[mbmi->segment_id]) {
-    int64_t temp_skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
-    if (temp_skip_rd <= rd) {
-      rd = temp_skip_rd;
+  if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+    int64_t temp_skip_txfm_rd =
+        RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+    if (temp_skip_txfm_rd <= rd) {
+      rd = temp_skip_txfm_rd;
       rd_stats->rate = 0;
       rd_stats->dist = rd_stats->sse;
-      rd_stats->skip = 1;
+      rd_stats->skip_txfm = 1;
     }
   }
 
@@ -2954,7 +3282,7 @@
   const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
       plane_bsize, blk_row, blk_col)];
   const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
-                                         mbmi->sb_type, tx_size);
+                                         mbmi->bsize, tx_size);
 
   av1_init_rd_stats(rd_stats);
   if (tx_size == plane_tx_size) {
@@ -2964,28 +3292,30 @@
     TXB_CTX txb_ctx;
     get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
 
-    const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)]
-                                  .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+    const int zero_blk_rate =
+        x->coeff_costs.coeff_costs[txs_ctx][get_plane_type(0)]
+            .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
     rd_stats->zero_rate = zero_blk_rate;
     tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
                rd_stats, ftxs_mode, ref_best_rd, NULL);
     const int mi_width = mi_size_wide[plane_bsize];
+    TxfmSearchInfo *txfm_info = &x->txfm_search_info;
     if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
-        rd_stats->skip == 1) {
+        rd_stats->skip_txfm == 1) {
       rd_stats->rate = zero_blk_rate;
       rd_stats->dist = rd_stats->sse;
-      rd_stats->skip = 1;
-      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
+      rd_stats->skip_txfm = 1;
+      set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 1);
       x->plane[0].eobs[block] = 0;
       x->plane[0].txb_entropy_ctx[block] = 0;
       update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
     } else {
-      rd_stats->skip = 0;
-      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
+      rd_stats->skip_txfm = 0;
+      set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 0);
     }
     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      rd_stats->rate += x->txfm_partition_cost[ctx][0];
+      rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][0];
     av1_set_txb_context(x, 0, block, tx_size, ta, tl);
     txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
                           tx_size);
@@ -3019,7 +3349,7 @@
     }
 
     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      rd_stats->rate += x->txfm_partition_cost[ctx][1];
+      rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][1];
   }
 }
 
@@ -3038,6 +3368,7 @@
   av1_init_rd_stats(rd_stats);
 
   MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const struct macroblockd_plane *const pd = &xd->plane[0];
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
@@ -3045,8 +3376,8 @@
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   const int step = bw * bh;
-  const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf,
-                                               x->tx_size_search_method);
+  const int init_depth = get_search_init_depth(
+      mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
   ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
   ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
   TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
@@ -3075,17 +3406,17 @@
     }
   }
 
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
-  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
-  const int64_t skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
   this_rd =
-      RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate, rd_stats->dist);
-  if (skip_rd < this_rd) {
-    this_rd = skip_rd;
+      RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate, rd_stats->dist);
+  if (skip_txfm_rd < this_rd) {
+    this_rd = skip_txfm_rd;
     rd_stats->rate = 0;
     rd_stats->dist = rd_stats->sse;
-    rd_stats->skip = 1;
+    rd_stats->skip_txfm = 1;
   }
 
   const int is_cost_valid = this_rd > ref_best_rd;
@@ -3105,10 +3436,15 @@
                                        int64_t ref_best_rd,
                                        TXB_RD_INFO_NODE *rd_info_tree) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   assert(is_inter_block(xd->mi[0]));
   assert(bsize < BLOCK_SIZES_ALL);
-  const int fast_tx_search = x->tx_size_search_method > USE_FULL_RD;
+  const int fast_tx_search = txfm_params->tx_size_search_method > USE_FULL_RD;
   int64_t rd_thresh = ref_best_rd;
+  if (rd_thresh == 0) {
+    av1_invalid_rd_stats(rd_stats);
+    return INT64_MAX;
+  }
   if (fast_tx_search && rd_thresh < INT64_MAX) {
     if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
   }
@@ -3126,17 +3462,17 @@
   av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
   memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
   memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
-  const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf,
-                                               x->tx_size_search_method);
+  const int init_depth = get_search_init_depth(
+      mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
   const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   const int step = bw * bh;
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int no_skip_flag_cost = x->skip_cost[skip_ctx][0];
-  const int skip_flag_cost = x->skip_cost[skip_ctx][1];
-  int64_t skip_rd = RDCOST(x->rdmult, skip_flag_cost, 0);
-  int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_cost, 0);
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+  int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
+  int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
   int block = 0;
 
   av1_init_rd_stats(rd_stats);
@@ -3145,7 +3481,7 @@
       const int64_t best_rd_sofar =
           (rd_thresh == INT64_MAX)
               ? INT64_MAX
-              : (rd_thresh - (AOMMIN(skip_rd, no_skip_rd)));
+              : (rd_thresh - (AOMMIN(skip_txfm_rd, no_skip_txfm_rd)));
       int is_cost_valid = 1;
       RD_STATS pn_rd_stats;
       // Search for the best transform block size and type for the sub-block.
@@ -3157,9 +3493,9 @@
         return INT64_MAX;
       }
       av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-      skip_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse);
-      no_skip_rd =
-          RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist);
+      skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+      no_skip_txfm_rd =
+          RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
       block += step;
       if (rd_info_tree != NULL) rd_info_tree += 1;
     }
@@ -3167,7 +3503,7 @@
 
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
 
-  rd_stats->skip = (skip_rd <= no_skip_rd);
+  rd_stats->skip_txfm = (skip_txfm_rd <= no_skip_txfm_rd);
 
   // If fast_tx_search is true, only DCT and 1D DCT were tested in
   // select_inter_block_yrd() above. Do a better search for tx type with
@@ -3178,14 +3514,14 @@
   }
 
   int64_t final_rd;
-  if (rd_stats->skip) {
-    final_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse);
+  if (rd_stats->skip_txfm) {
+    final_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
   } else {
     final_rd =
-        RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist);
+        RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
     if (!xd->lossless[xd->mi[0]->segment_id]) {
       final_rd =
-          AOMMIN(final_rd, RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse));
+          AOMMIN(final_rd, RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse));
     }
   }
 
@@ -3216,14 +3552,11 @@
   return ((model_rd * factor) >> 3) > ref_best_rd;
 }
 
-// Search for best transform size and type for luma inter blocks. The transform
-// block partitioning can be recursive resulting in non-uniform transform sizes.
-// The best transform size and type, if found, will be saved in the MB_MODE_INFO
-// structure, and the corresponding RD stats will be saved in rd_stats.
 void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
                                          RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                          int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   assert(is_inter_block(xd->mi[0]));
 
   av1_invalid_rd_stats(rd_stats);
@@ -3250,7 +3583,7 @@
   const int n4 = bsize_to_num_blk(bsize);
   if (is_mb_rd_hash_enabled) {
     hash = get_block_residue_hash(x, bsize);
-    mb_rd_record = &x->mb_rd_record;
+    mb_rd_record = &x->txfm_search_info.mb_rd_record;
     const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
     if (match_index != -1) {
       MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
@@ -3262,17 +3595,17 @@
   // If we predict that skip is the optimal RD decision - set the respective
   // context and terminate early.
   int64_t dist;
-  if (x->predict_skip_level &&
-      predict_skip_flag(x, bsize, &dist,
+  if (txfm_params->skip_txfm_level &&
+      predict_skip_txfm(x, bsize, &dist,
                         cpi->common.features.reduced_tx_set_used)) {
-    set_skip_flag(x, rd_stats, bsize, dist);
+    set_skip_txfm(x, rd_stats, bsize, dist);
     // Save the RD search results into tx_rd_record.
     if (is_mb_rd_hash_enabled)
       save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
     return;
   }
 #if CONFIG_SPEED_STATS
-  ++x->tx_search_count;
+  ++x->txfm_search_info.tx_search_count;
 #endif  // CONFIG_SPEED_STATS
 
   // Pre-compute residue hashes (transform block level) and find existing or
@@ -3305,17 +3638,13 @@
   }
 }
 
-// Search for the best transform size and type for current coding block, with
-// the assumption that all the transform blocks have a uniform size (VP9 style).
-// The selected transform size and type will be saved in the MB_MODE_INFO
-// structure; the corresponding RD stats will be saved in rd_stats.
-// This function may be used for both intra and inter predicted blocks.
 void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bs,
                                        int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(bs == mbmi->sb_type);
+  const TxfmSearchParams *tx_params = &x->txfm_search_params;
+  assert(bs == mbmi->bsize);
   const int is_inter = is_inter_block(mbmi);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
@@ -3336,7 +3665,7 @@
         (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
     if (within_border) {
       hash = get_block_residue_hash(x, bs);
-      mb_rd_record = &x->mb_rd_record;
+      mb_rd_record = &x->txfm_search_info.mb_rd_record;
       const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
       if (match_index != -1) {
         MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
@@ -3349,11 +3678,12 @@
   // If we predict that skip is the optimal RD decision - set the respective
   // context and terminate early.
   int64_t dist;
-  if (x->predict_skip_level && is_inter && !xd->lossless[mbmi->segment_id] &&
-      predict_skip_flag(x, bs, &dist,
+  if (tx_params->skip_txfm_level && is_inter &&
+      !xd->lossless[mbmi->segment_id] &&
+      predict_skip_txfm(x, bs, &dist,
                         cpi->common.features.reduced_tx_set_used)) {
     // Populate rdstats as per skip decision
-    set_skip_flag(x, rd_stats, bs, dist);
+    set_skip_txfm(x, rd_stats, bs, dist);
     // Save the RD search results into tx_rd_record.
     if (mb_rd_record) {
       save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
@@ -3364,7 +3694,7 @@
   if (xd->lossless[mbmi->segment_id]) {
     // Lossless mode can only pick the smallest (4x4) transform size.
     choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
-  } else if (x->tx_size_search_method == USE_LARGESTALL) {
+  } else if (tx_params->tx_size_search_method == USE_LARGESTALL) {
     choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
   } else {
     choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
@@ -3376,9 +3706,6 @@
   }
 }
 
-// Calculate the transform coefficient RD cost for the given chroma coding block
-// Return value 0: early termination triggered, no valid rd cost available;
-//              1: rd cost values are valid.
 int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
                   BLOCK_SIZE bsize, int64_t ref_best_rd) {
   av1_init_rd_stats(rd_stats);
@@ -3389,7 +3716,7 @@
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
   const int is_inter = is_inter_block(mbmi);
-  int64_t this_rd = 0, skip_rd = 0;
+  int64_t this_rd = 0, skip_txfm_rd = 0;
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
 
@@ -3411,19 +3738,17 @@
     // TODO(any): Extend the early exit mechanism for intra modes as well
     if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter &&
         chroma_ref_best_rd != INT64_MAX)
-      chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd);
+      chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_txfm_rd);
     av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane,
-                         plane_bsize, uv_tx_size,
-                         cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE,
-                         skip_trellis);
+                         plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis);
     if (this_rd_stats.rate == INT_MAX) {
       is_cost_valid = 0;
       break;
     }
     av1_merge_rd_stats(rd_stats, &this_rd_stats);
     this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-    skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
-    if (AOMMIN(this_rd, skip_rd) > ref_best_rd) {
+    skip_txfm_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+    if (AOMMIN(this_rd, skip_txfm_rd) > ref_best_rd) {
       is_cost_valid = 0;
       break;
     }
@@ -3437,17 +3762,15 @@
   return is_cost_valid;
 }
 
-// Search for the best transform type and calculate the transform coefficients
-// RD cost of the current coding block with the specified (uniform) transform
-// size and channel. The RD results will be saved in rd_stats.
 void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                           RD_STATS *rd_stats, int64_t ref_best_rd,
                           int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
-                          TX_SIZE tx_size, int use_fast_coef_costing,
-                          FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
+                          TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+                          int skip_trellis) {
   assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size));
 
-  if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
+  if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
+      txsize_sqr_up_map[tx_size] == TX_64X64) {
     av1_invalid_rd_stats(rd_stats);
     return;
   }
@@ -3465,7 +3788,6 @@
   args.cpi = cpi;
   args.best_rd = ref_best_rd;
   args.current_rd = current_rd;
-  args.use_fast_coef_costing = use_fast_coef_costing;
   args.ftxs_mode = ftxs_mode;
   args.skip_trellis = skip_trellis;
   av1_init_rd_stats(&args.rd_stats);
@@ -3485,23 +3807,16 @@
   }
 }
 
-// This function combines y and uv planes' transform search processes together
-// for inter-predicted blocks (including IntraBC), when the prediction is
-// already generated. It first does subtraction to obtain the prediction error.
-// Then it calls
-// av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
-// av1_txfm_uvrd sequentially and handles the early terminations
-// happening in those functions. At the end, it computes the
-// rd_stats/_y/_uv accordingly.
 int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                     RD_STATS *rd_stats, RD_STATS *rd_stats_y,
                     RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0],
-                                  x->skip_cost[skip_ctx][1] };
+  TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int skip_txfm_cost[2] = { x->mode_costs.skip_txfm_cost[skip_ctx][0],
+                                  x->mode_costs.skip_txfm_cost[skip_ctx][1] };
   const int64_t min_header_rate =
-      mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]);
+      mode_rate + AOMMIN(skip_txfm_cost[0], skip_txfm_cost[1]);
   // Account for minimum skip and non_skip rd.
   // Eventually either one of them will be added to mode_rate
   const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
@@ -3521,7 +3836,7 @@
 
   // cost and distortion
   av1_subtract_plane(x, bsize, 0);
-  if (x->tx_mode_search_type == TX_MODE_SELECT &&
+  if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
       !xd->lossless[mbmi->segment_id]) {
     av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
 #if CONFIG_COLLECT_RD_STATS == 2
@@ -3531,30 +3846,19 @@
     av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
     memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
     for (int i = 0; i < xd->height * xd->width; ++i)
-      set_blk_skip(x, 0, i, rd_stats_y->skip);
+      set_blk_skip(x->txfm_search_info.blk_skip, 0, i, rd_stats_y->skip_txfm);
   }
 
   if (rd_stats_y->rate == INT_MAX) return 0;
 
   av1_merge_rd_stats(rd_stats, rd_stats_y);
 
-  const int64_t non_skip_rdcosty =
-      RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist);
-  const int64_t skip_rdcosty =
-      RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse);
-  const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
-  if (min_rdcosty > ref_best_rd) {
-    const int64_t tokenonly_rdy =
-        AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
-               RDCOST(x->rdmult, 0, rd_stats_y->sse));
-    // Invalidate rd_stats_y to skip the rest of the motion modes search
-    if (tokenonly_rdy -
-            (tokenonly_rdy >> cpi->sf.inter_sf.prune_motion_mode_level) >
-        rd_thresh) {
-      av1_invalid_rd_stats(rd_stats_y);
-    }
-    return 0;
-  }
+  const int64_t non_skip_txfm_rdcosty =
+      RDCOST(x->rdmult, rd_stats->rate + skip_txfm_cost[0], rd_stats->dist);
+  const int64_t skip_txfm_rdcosty =
+      RDCOST(x->rdmult, mode_rate + skip_txfm_cost[1], rd_stats->sse);
+  const int64_t min_rdcosty = AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty);
+  if (min_rdcosty > ref_best_rd) return 0;
 
   av1_init_rd_stats(rd_stats_uv);
   const int num_planes = av1_num_planes(cm);
@@ -3563,8 +3867,8 @@
     // Calculate best rd cost possible for chroma
     if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma &&
         (ref_best_chroma_rd != INT64_MAX)) {
-      ref_best_chroma_rd =
-          (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty));
+      ref_best_chroma_rd = (ref_best_chroma_rd -
+                            AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty));
     }
     const int is_cost_valid_uv =
         av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
@@ -3572,30 +3876,30 @@
     av1_merge_rd_stats(rd_stats, rd_stats_uv);
   }
 
-  int choose_skip = rd_stats->skip;
-  if (!choose_skip && !xd->lossless[mbmi->segment_id]) {
-    const int64_t rdcost_no_skip = RDCOST(
-        x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0],
+  int choose_skip_txfm = rd_stats->skip_txfm;
+  if (!choose_skip_txfm && !xd->lossless[mbmi->segment_id]) {
+    const int64_t rdcost_no_skip_txfm = RDCOST(
+        x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_txfm_cost[0],
         rd_stats->dist);
-    const int64_t rdcost_skip =
-        RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse);
-    if (rdcost_no_skip >= rdcost_skip) choose_skip = 1;
+    const int64_t rdcost_skip_txfm =
+        RDCOST(x->rdmult, skip_txfm_cost[1], rd_stats->sse);
+    if (rdcost_no_skip_txfm >= rdcost_skip_txfm) choose_skip_txfm = 1;
   }
-  if (choose_skip) {
+  if (choose_skip_txfm) {
     rd_stats_y->rate = 0;
     rd_stats_uv->rate = 0;
-    rd_stats->rate = mode_rate + skip_flag_cost[1];
+    rd_stats->rate = mode_rate + skip_txfm_cost[1];
     rd_stats->dist = rd_stats->sse;
     rd_stats_y->dist = rd_stats_y->sse;
     rd_stats_uv->dist = rd_stats_uv->sse;
-    mbmi->skip = 1;
-    if (rd_stats->skip) {
+    mbmi->skip_txfm = 1;
+    if (rd_stats->skip_txfm) {
       const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
       if (tmprd > ref_best_rd) return 0;
     }
   } else {
-    rd_stats->rate += skip_flag_cost[0];
-    mbmi->skip = 0;
+    rd_stats->rate += skip_txfm_cost[0];
+    mbmi->skip_txfm = 0;
   }
 
   return 1;
diff --git a/av1/encoder/tx_search.h b/av1/encoder/tx_search.h
index 82d5671..617f354 100644
--- a/av1/encoder/tx_search.h
+++ b/av1/encoder/tx_search.h
@@ -35,39 +35,161 @@
 
 static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize,
                                    TX_SIZE tx_size) {
-  assert(bsize == x->e_mbd.mi[0]->sb_type);
-  if (x->tx_mode_search_type != TX_MODE_SELECT || !block_signals_txsize(bsize))
+  assert(bsize == x->e_mbd.mi[0]->bsize);
+  if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT ||
+      !block_signals_txsize(bsize))
     return 0;
 
   const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
   const int depth = tx_size_to_depth(tx_size, bsize);
   const MACROBLOCKD *const xd = &x->e_mbd;
   const int tx_size_ctx = get_tx_size_context(xd);
-  return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+  return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth];
 }
 
+/*!\brief Transform type search for luma macroblock with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and return the transform coefficients RD
+ * cost of current luma macroblock with the given uniform transform size.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \param[in]    bs             Size of the current macroblock
+ * \param[in]    tx_size        The given transform size
+ * \param[in]    ftxs_mode      Transform search mode specifying desired speed
+                                and quality tradeoff
+ * \param[in]    skip_trellis   Binary flag indicating if trellis optimization
+                                should be skipped
+ * \return       An int64_t value that is the best RD cost found.
+ */
 int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                              RD_STATS *rd_stats, int64_t ref_best_rd,
                              BLOCK_SIZE bs, TX_SIZE tx_size,
                              FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
 
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for best transform size and type for luma inter blocks. The transform
+ * block partitioning can be recursive resulting in non-uniform transform sizes.
+ * The best transform size and type, if found, will be saved in the MB_MODE_INFO
+ * structure, and the corresponding RD stats will be saved in rd_stats.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \return       Nothing is returned. The selected transform size and type will
+                 be saved in the MB_MODE_INFO structure
+ */
 void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
                                          RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                          int64_t ref_best_rd);
 
+/*!\brief Uniform transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for the best transform size and type for current macroblock block,
+ * with the assumption that all the transform blocks have a uniform size
+ * (VP9 style). The selected transform size and type will be saved in the
+ * MB_MODE_INFO structure; the corresponding RD stats will be saved in rd_stats.
+ * This function may be used for both intra and inter predicted blocks.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bs             Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \return       Nothing is returned. The selected transform size and type will
+                 be saved in the MB_MODE_INFO structure
+ */
 void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bs,
                                        int64_t ref_best_rd);
 
+/*!\brief Chroma block transform search.
+ *
+ * \ingroup transform_search
+ * Calculate the transform coefficient RD cost for the given chroma macroblock
+ * If the current mode is intra, then this function will compute the predictor.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \return       An integer value is returned. 0: early termination triggered,
+                 no valid rd cost available; 1: rd cost values are valid.
+ */
 int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
                   BLOCK_SIZE bsize, int64_t ref_best_rd);
 
+/*!\brief Transform type search with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and calculate the transform coefficients
+ * RD cost of the current transform block with the specified (uniform) transform
+ * size and plane. The RD results will be saved in rd_stats.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \param[in]    current_rd     Current RD cost for this block so far
+ * \param[in]    plane          Plane index
+ * \param[in]    plane_bsize    Size of the current macroblock considering
+                                sup-sampling
+ * \param[in]    tx_size        The given transform size
+ * \param[in]    ftxs_mode      Transform search mode specifying desired speed
+                                and quality tradeoff
+ * \param[in]    skip_trellis   Binary flag indicating if trellis optimization
+                                should be skipped
+ *
+ * \return       Nothing is returned. The RD results will be saved in rd_stats.
+ */
 void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                           RD_STATS *rd_stats, int64_t ref_best_rd,
-                          int64_t this_rd, int plane, BLOCK_SIZE plane_bsize,
-                          TX_SIZE tx_size, int use_fast_coef_costing,
-                          FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
+                          int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+                          int skip_trellis);
 
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * This function combines y and uv planes' transform search processes together
+ * for inter-predicted blocks (including IntraBC), when the prediction is
+ * already generated. It first does subtraction to obtain the prediction error.
+ * Then it calls
+ * av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
+ * av1_txfm_uvrd sequentially and handles possible early terminations.
+ * The RD metrics are calculated and stored in rd_stats/_y/_uv.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    rd_stats       Pointer to struct to keep track of the overal RD
+                                stats
+ * \param[in]    rd_stats_y     Pointer to struct to keep track of the RD
+                                stats for the luma plane
+ * \param[in]    rd_stats_uv    Pointer to struct to keep track of the RD
+                                stats for the chroma planes
+ * \param[in]    mode_rate      Rate cost to encode the prediction mode info. of
+                                the current macroblock
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ *
+ * \return       An integer value is returned indicating if a valid transform
+                 candidate is found (1) or not (0).
+ */
 int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                     RD_STATS *rd_stats, RD_STATS *rd_stats_y,
                     RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd);
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index e3cb1fa..c6d0f05 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -130,7 +130,7 @@
       cpi->common.mi_params.mi_rows > mi_row) {
     set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
                           mi_row, mi_col);
-    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->bsize = bsize;
   }
 }
 
@@ -326,30 +326,29 @@
 }
 
 // TODO(kyslov) Bring back threshold adjustment based on content state
-static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+static int64_t scale_part_thresh_content(int64_t threshold_base, int speed,
                                          int width, int height,
-                                         int content_state) {
+                                         int non_reference_frame) {
   (void)width;
   (void)height;
-  (void)content_state;
+  int64_t threshold = threshold_base;
+  if (non_reference_frame) threshold = (3 * threshold) >> 1;
   if (speed >= 8) {
-    return (5 * threshold_base) >> 2;
+    return (5 * threshold) >> 2;
   }
-  return threshold_base;
+  return threshold;
 }
 
-// Set the variance split thresholds for following the block sizes:
-// 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
-// 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is
-// currently only used on key frame.
 static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
-                                          int q, int content_state) {
+                                          int q, int content_lowsumdiff,
+                                          int segment_id) {
   AV1_COMMON *const cm = &cpi->common;
   const int is_key_frame = frame_is_intra_only(cm);
   const int threshold_multiplier = is_key_frame ? 40 : 1;
   int64_t threshold_base =
       (int64_t)(threshold_multiplier *
                 cpi->enc_quant_dequant_params.dequants.y_dequant_QTX[q][1]);
+  const int current_qindex = cm->quant_params.base_qindex;
 
   if (is_key_frame) {
     thresholds[0] = threshold_base;
@@ -358,9 +357,26 @@
     thresholds[3] = threshold_base >> 2;
     thresholds[4] = threshold_base << 2;
   } else {
+    // Increase partition thresholds for noisy content. Apply it only for
+    // superblocks where sumdiff is low, as we assume the sumdiff of superblock
+    // whose only change is due to noise will be low (i.e, noise will average
+    // out over large block).
+    if (cpi->noise_estimate.enabled && content_lowsumdiff &&
+        (cm->width * cm->height > 640 * 480) &&
+        cm->current_frame.frame_number > 60) {
+      NOISE_LEVEL noise_level =
+          av1_noise_estimate_extract_level(&cpi->noise_estimate);
+      if (noise_level == kHigh)
+        threshold_base = (5 * threshold_base) >> 1;
+      else if (noise_level == kMedium &&
+               !cpi->sf.rt_sf.force_large_partition_blocks)
+        threshold_base = (5 * threshold_base) >> 2;
+    }
+
     // Increase base variance threshold based on content_state/sum_diff level.
-    threshold_base = scale_part_thresh_sumdiff(
-        threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state);
+    threshold_base =
+        scale_part_thresh_content(threshold_base, cpi->oxcf.speed, cm->width,
+                                  cm->height, cpi->svc.non_reference_frame);
 
     thresholds[0] = threshold_base >> 1;
     thresholds[1] = threshold_base;
@@ -368,19 +384,18 @@
     if (cm->width >= 1280 && cm->height >= 720)
       thresholds[3] = thresholds[3] << 1;
     if (cm->width * cm->height <= 352 * 288) {
-      int last_qindex = cpi->rc.last_q[INTER_FRAME];
-      if (last_qindex >= QINDEX_HIGH_THR) {
+      if (current_qindex >= QINDEX_HIGH_THR) {
         threshold_base = (5 * threshold_base) >> 1;
         thresholds[1] = threshold_base >> 3;
         thresholds[2] = threshold_base << 2;
         thresholds[3] = threshold_base << 5;
-      } else if (last_qindex < QINDEX_LOW_THR) {
+      } else if (current_qindex < QINDEX_LOW_THR) {
         thresholds[1] = threshold_base >> 3;
         thresholds[2] = threshold_base >> 1;
         thresholds[3] = threshold_base << 3;
       } else {
-        int64_t qi_diff_low = last_qindex - QINDEX_LOW_THR;
-        int64_t qi_diff_high = QINDEX_HIGH_THR - last_qindex;
+        int64_t qi_diff_low = current_qindex - QINDEX_LOW_THR;
+        int64_t qi_diff_high = QINDEX_HIGH_THR - current_qindex;
         int64_t threshold_diff = QINDEX_HIGH_THR - QINDEX_LOW_THR;
         int64_t threshold_base_high = (5 * threshold_base) >> 1;
 
@@ -403,26 +418,45 @@
     } else {
       thresholds[2] = (5 * threshold_base) >> 1;
     }
+    if (cpi->sf.rt_sf.force_large_partition_blocks) {
+      if (cm->width * cm->height <= 352 * 288) {
+        thresholds[1] <<= 2;
+        thresholds[2] <<= 5;
+        thresholds[3] = INT32_MAX;
+      } else if (cm->width * cm->height > 640 * 480 && segment_id == 0) {
+        thresholds[0] = (3 * thresholds[0]) >> 1;
+        thresholds[3] = INT32_MAX;
+        if (current_qindex >= QINDEX_LARGE_BLOCK_THR) {
+          thresholds[1] <<= 1;
+          thresholds[2] <<= 1;
+        }
+      } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0) {
+        thresholds[1] <<= 2;
+        thresholds[2] <<= 5;
+        thresholds[3] = INT32_MAX;
+      }
+    }
   }
 }
 
 // Set temporal variance low flag for superblock 64x64.
 // Only first 25 in the array are used in this case.
 static AOM_INLINE void set_low_temp_var_flag_64x64(
-    CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd,
-    VP64x64 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
-  if (xd->mi[0]->sb_type == BLOCK_64X64) {
+    CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+    MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col,
+    int mi_row) {
+  if (xd->mi[0]->bsize == BLOCK_64X64) {
     if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
-      x->variance_low[0] = 1;
-  } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+      part_info->variance_low[0] = 1;
+  } else if (xd->mi[0]->bsize == BLOCK_64X32) {
     for (int i = 0; i < 2; i++) {
       if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
-        x->variance_low[i + 1] = 1;
+        part_info->variance_low[i + 1] = 1;
     }
-  } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+  } else if (xd->mi[0]->bsize == BLOCK_32X64) {
     for (int i = 0; i < 2; i++) {
       if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
-        x->variance_low[i + 3] = 1;
+        part_info->variance_low[i + 3] = 1;
     }
   } else {
     static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
@@ -437,20 +471,20 @@
 
       if (*this_mi == NULL) continue;
 
-      if ((*this_mi)->sb_type == BLOCK_32X32) {
+      if ((*this_mi)->bsize == BLOCK_32X32) {
         int64_t threshold_32x32 = (5 * thresholds[1]) >> 3;
         if (vt->split[i].part_variances.none.variance < threshold_32x32)
-          x->variance_low[i + 5] = 1;
+          part_info->variance_low[i + 5] = 1;
       } else {
         // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
         // inside.
-        if ((*this_mi)->sb_type == BLOCK_16X16 ||
-            (*this_mi)->sb_type == BLOCK_32X16 ||
-            (*this_mi)->sb_type == BLOCK_16X32) {
+        if ((*this_mi)->bsize == BLOCK_16X16 ||
+            (*this_mi)->bsize == BLOCK_32X16 ||
+            (*this_mi)->bsize == BLOCK_16X32) {
           for (int j = 0; j < 4; j++) {
             if (vt->split[i].split[j].part_variances.none.variance <
                 (thresholds[2] >> 8))
-              x->variance_low[(i << 2) + j + 9] = 1;
+              part_info->variance_low[(i << 2) + j + 9] = 1;
           }
         }
       }
@@ -459,20 +493,21 @@
 }
 
 static AOM_INLINE void set_low_temp_var_flag_128x128(
-    CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd,
-    VP128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
-  if (xd->mi[0]->sb_type == BLOCK_128X128) {
+    CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+    MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col,
+    int mi_row) {
+  if (xd->mi[0]->bsize == BLOCK_128X128) {
     if (vt->part_variances.none.variance < (thresholds[0] >> 1))
-      x->variance_low[0] = 1;
-  } else if (xd->mi[0]->sb_type == BLOCK_128X64) {
+      part_info->variance_low[0] = 1;
+  } else if (xd->mi[0]->bsize == BLOCK_128X64) {
     for (int i = 0; i < 2; i++) {
       if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
-        x->variance_low[i + 1] = 1;
+        part_info->variance_low[i + 1] = 1;
     }
-  } else if (xd->mi[0]->sb_type == BLOCK_64X128) {
+  } else if (xd->mi[0]->bsize == BLOCK_64X128) {
     for (int i = 0; i < 2; i++) {
       if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
-        x->variance_low[i + 3] = 1;
+        part_info->variance_low[i + 3] = 1;
     }
   } else {
     static const int idx64[4][2] = {
@@ -488,19 +523,19 @@
           mi_params->mi_rows <= mi_row + idx64[i][0])
         continue;
       const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
-      if ((*mi_64)->sb_type == BLOCK_64X64) {
+      if ((*mi_64)->bsize == BLOCK_64X64) {
         if (vt->split[i].part_variances.none.variance < threshold_64x64)
-          x->variance_low[5 + i] = 1;
-      } else if ((*mi_64)->sb_type == BLOCK_64X32) {
+          part_info->variance_low[5 + i] = 1;
+      } else if ((*mi_64)->bsize == BLOCK_64X32) {
         for (int j = 0; j < 2; j++)
           if (vt->split[i].part_variances.horz[j].variance <
               (threshold_64x64 >> 1))
-            x->variance_low[9 + (i << 1) + j] = 1;
-      } else if ((*mi_64)->sb_type == BLOCK_32X64) {
+            part_info->variance_low[9 + (i << 1) + j] = 1;
+      } else if ((*mi_64)->bsize == BLOCK_32X64) {
         for (int j = 0; j < 2; j++)
           if (vt->split[i].part_variances.vert[j].variance <
               (threshold_64x64 >> 1))
-            x->variance_low[17 + (i << 1) + j] = 1;
+            part_info->variance_low[17 + (i << 1) + j] = 1;
       } else {
         for (int k = 0; k < 4; k++) {
           const int idx_str1 = mi_params->mi_stride * idx32[k][0] + idx32[k][1];
@@ -511,22 +546,22 @@
               mi_params->mi_rows <= mi_row + idx64[i][0] + idx32[k][0])
             continue;
           const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
-          if ((*mi_32)->sb_type == BLOCK_32X32) {
+          if ((*mi_32)->bsize == BLOCK_32X32) {
             if (vt->split[i].split[k].part_variances.none.variance <
                 threshold_32x32)
-              x->variance_low[25 + (i << 2) + k] = 1;
+              part_info->variance_low[25 + (i << 2) + k] = 1;
           } else {
             // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
             // inside.
-            if ((*mi_32)->sb_type == BLOCK_16X16 ||
-                (*mi_32)->sb_type == BLOCK_32X16 ||
-                (*mi_32)->sb_type == BLOCK_16X32) {
+            if ((*mi_32)->bsize == BLOCK_16X16 ||
+                (*mi_32)->bsize == BLOCK_32X16 ||
+                (*mi_32)->bsize == BLOCK_16X32) {
               for (int j = 0; j < 4; j++) {
                 if (vt->split[i]
                         .split[k]
                         .split[j]
                         .part_variances.none.variance < (thresholds[3] >> 8))
-                  x->variance_low[41 + (i << 4) + (k << 2) + j] = 1;
+                  part_info->variance_low[41 + (i << 4) + (k << 2) + j] = 1;
               }
             }
           }
@@ -537,9 +572,9 @@
 }
 
 static AOM_INLINE void set_low_temp_var_flag(
-    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, VP128x128 *vt,
-    int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col,
-    int mi_row) {
+    AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd,
+    VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition,
+    int mi_col, int mi_row) {
   AV1_COMMON *const cm = &cpi->common;
   const int mv_thr = cm->width > 640 ? 8 : 4;
   // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and
@@ -555,21 +590,21 @@
         xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
     const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
     if (is_small_sb)
-      set_low_temp_var_flag_64x64(&cm->mi_params, x, xd, &(vt->split[0]),
-                                  thresholds, mi_col, mi_row);
+      set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd,
+                                  &(vt->split[0]), thresholds, mi_col, mi_row);
     else
-      set_low_temp_var_flag_128x128(&cm->mi_params, x, xd, vt, thresholds,
-                                    mi_col, mi_row);
+      set_low_temp_var_flag_128x128(&cm->mi_params, part_info, xd, vt,
+                                    thresholds, mi_col, mi_row);
   }
 }
 
 void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
-                                           int content_state) {
+                                           int content_lowsumdiff) {
   SPEED_FEATURES *const sf = &cpi->sf;
   if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) {
     return;
   } else {
-    set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_state);
+    set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0);
     // The threshold below is not changed locally.
     cpi->vbp_info.threshold_minmax = 15 + (q >> 3);
   }
@@ -581,7 +616,7 @@
   int i;
   MACROBLOCKD *xd = &x->e_mbd;
 
-  if (is_key_frame || cpi->oxcf.monochrome) return;
+  if (is_key_frame || cpi->oxcf.tool_cfg.enable_monochrome) return;
 
   for (i = 1; i <= 2; ++i) {
     unsigned int uv_sad = UINT_MAX;
@@ -598,181 +633,34 @@
   }
 }
 
-// This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for down-sampled inputs.
-// TODO(kyslov): lot of things. Bring back noise estimation, brush up partition
-// selection and most of all - retune the thresholds
-int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
-                                      ThreadData *td, MACROBLOCK *x, int mi_row,
-                                      int mi_col) {
-  AV1_COMMON *const cm = &cpi->common;
+static void fill_variance_tree_leaves(
+    AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, VP16x16 *vt2,
+    unsigned char *force_split, int avg_16x16[][4], int maxvar_16x16[][4],
+    int minvar_16x16[][4], int *variance4x4downsample, int64_t *thresholds,
+    uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride) {
+  AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
-
-  int i, j, k, m;
-  VP128x128 *vt;
-  VP16x16 *vt2 = NULL;
-  unsigned char force_split[85];
-  int avg_32x32;
-  int max_var_32x32[4];
-  int min_var_32x32[4];
-  int var_32x32;
-  int var_64x64;
-  int min_var_64x64 = INT_MAX;
-  int max_var_64x64 = 0;
-  int avg_16x16[4][4];
-  int maxvar_16x16[4][4];
-  int minvar_16x16[4][4];
-  int64_t threshold_4x4avg;
-  int content_state = 0;
-  uint8_t *s;
-  const uint8_t *d;
-  int sp;
-  int dp;
-  // TODO(kyslov) Bring back compute_minmax_variance with content type detection
-  int compute_minmax_variance = 0;
-  int is_key_frame = frame_is_intra_only(cm);
-  int pixels_wide = 128, pixels_high = 128;
-  assert(cm->seq_params.sb_size == BLOCK_64X64 ||
-         cm->seq_params.sb_size == BLOCK_128X128);
+  const int is_key_frame = frame_is_intra_only(cm);
   const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
   const int num_64x64_blocks = is_small_sb ? 1 : 4;
-
-  unsigned int y_sad = UINT_MAX;
-  unsigned int y_sad_g = UINT_MAX;
-  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
-
-  // Ref frame used in partitioning.
-  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
-
-  CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
-
-  vt->split = td->vt64x64;
-
-  int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
-                            vbp_thresholds[2], vbp_thresholds[3],
-                            vbp_thresholds[4] };
-
-  const int low_res = (cm->width <= 352 && cm->height <= 288);
-  int variance4x4downsample[64];
-  int segment_id;
-  const int num_planes = av1_num_planes(cm);
-
-  segment_id = xd->mi[0]->segment_id;
-
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-      cyclic_refresh_segment_id_boosted(segment_id) &&
-      cpi->sf.rt_sf.use_nonrd_pick_mode) {
-    int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
-    set_vbp_thresholds(cpi, thresholds, q, content_state);
-  } else {
-    set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
-                       content_state);
-  }
+  // TODO(kyslov) Bring back compute_minmax_variance with content type detection
+  const int compute_minmax_variance = 0;
+  const int segment_id = xd->mi[0]->segment_id;
+  int pixels_wide = 128, pixels_high = 128;
 
   if (is_small_sb) {
     pixels_wide = 64;
     pixels_high = 64;
   }
-
-  // For non keyframes, disable 4x4 average for low resolution when speed = 8
-  threshold_4x4avg = INT64_MAX;
-
   if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
-
-  s = x->plane[0].src.buf;
-  sp = x->plane[0].src.stride;
-
-  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
-  // 5-20 for the 16x16 blocks.
-  force_split[0] = 0;
-  memset(x->variance_low, 0, sizeof(x->variance_low));
-
-  if (!is_key_frame) {
-    // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
-    // is!!
-    MB_MODE_INFO *mi = xd->mi[0];
-    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
-    assert(yv12 != NULL);
-    const YV12_BUFFER_CONFIG *yv12_g = NULL;
-
-    // For non-SVC GOLDEN is another temporal reference. Check if it should be
-    // used as reference for partitioning.
-    if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
-        cpi->sf.rt_sf.use_nonrd_pick_mode) {
-      yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
-      if (yv12_g && yv12_g != yv12) {
-        av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
-                             get_ref_scale_factors(cm, GOLDEN_FRAME),
-                             num_planes);
-        y_sad_g = cpi->fn_ptr[bsize].sdf(
-            x->plane[0].src.buf, x->plane[0].src.stride,
-            xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
-      }
-    }
-
-    av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
-                         get_ref_scale_factors(cm, LAST_FRAME), num_planes);
-    mi->ref_frame[0] = LAST_FRAME;
-    mi->ref_frame[1] = NONE_FRAME;
-    mi->sb_type = cm->seq_params.sb_size;
-    mi->mv[0].as_int = 0;
-    mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
-    if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
-      if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
-        const MV dummy_mv = { 0, 0 };
-        y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size,
-                                              mi_row, mi_col, &dummy_mv);
-      }
-    }
-    if (y_sad == UINT_MAX) {
-      y_sad = cpi->fn_ptr[bsize].sdf(
-          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
-          xd->plane[0].pre[0].stride);
-    }
-
-    // Pick the ref frame for partitioning, use golden frame only if its
-    // lower sad.
-    if (y_sad_g < 0.9 * y_sad) {
-      av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
-                           get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
-      mi->ref_frame[0] = GOLDEN_FRAME;
-      mi->mv[0].as_int = 0;
-      y_sad = y_sad_g;
-      ref_frame_partition = GOLDEN_FRAME;
-      x->nonrd_prune_ref_frame_search = 0;
-    } else {
-      x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
-      ref_frame_partition = LAST_FRAME;
-      x->nonrd_prune_ref_frame_search =
-          cpi->sf.rt_sf.nonrd_prune_ref_frame_search;
-    }
-
-    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
-                                  cm->seq_params.sb_size, AOM_PLANE_Y,
-                                  AOM_PLANE_Y);
-
-    d = xd->plane[0].dst.buf;
-    dp = xd->plane[0].dst.stride;
-  } else {
-    d = AV1_VAR_OFFS;
-    dp = 0;
-  }
-
-  if (low_res && threshold_4x4avg < INT64_MAX)
-    CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2)));
-  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
-  // for splits.
-  for (m = 0; m < num_64x64_blocks; m++) {
+  for (int m = 0; m < num_64x64_blocks; m++) {
     const int x64_idx = ((m & 1) << 6);
     const int y64_idx = ((m >> 1) << 6);
     const int m2 = m << 2;
     force_split[m + 1] = 0;
-    max_var_32x32[m] = 0;
-    min_var_32x32[m] = INT_MAX;
-    for (i = 0; i < 4; i++) {
+
+    for (int i = 0; i < 4; i++) {
       const int x32_idx = x64_idx + ((i & 1) << 5);
       const int y32_idx = y64_idx + ((i >> 1) << 5);
       const int i2 = (m2 + i) << 2;
@@ -780,7 +668,7 @@
       avg_16x16[m][i] = 0;
       maxvar_16x16[m][i] = 0;
       minvar_16x16[m][i] = INT_MAX;
-      for (j = 0; j < 4; j++) {
+      for (int j = 0; j < 4; j++) {
         const int x16_idx = x32_idx + ((j & 1) << 4);
         const int y16_idx = y32_idx + ((j >> 1) << 4);
         const int split_index = 21 + i2 + j;
@@ -788,7 +676,8 @@
         force_split[split_index] = 0;
         variance4x4downsample[i2 + j] = 0;
         if (!is_key_frame) {
-          fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+          fill_variance_8x8avg(src, src_stride, dst, dst_stride, x16_idx,
+                               y16_idx, vst,
 #if CONFIG_AV1_HIGHBITDEPTH
                                xd->cur_buf->flags,
 #endif
@@ -814,16 +703,17 @@
             force_split[5 + m2 + i] = 1;
             force_split[m + 1] = 1;
             force_split[0] = 1;
-          } else if (compute_minmax_variance &&
+          } else if (!cyclic_refresh_segment_id_boosted(segment_id) &&
+                     compute_minmax_variance &&
                      vt->split[m]
                              .split[i]
                              .split[j]
-                             .part_variances.none.variance > thresholds[2] &&
-                     !cyclic_refresh_segment_id_boosted(segment_id)) {
+                             .part_variances.none.variance > thresholds[2]) {
             // We have some nominal amount of 16x16 variance (based on average),
             // compute the minmax over the 8x8 sub-blocks, and if above
             // threshold, force split to 8x8 block for this 16x16 block.
-            int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+            int minmax = compute_minmax_8x8(src, src_stride, dst, dst_stride,
+                                            x16_idx, y16_idx,
 #if CONFIG_AV1_HIGHBITDEPTH
                                             xd->cur_buf->flags,
 #endif
@@ -841,11 +731,12 @@
           force_split[split_index] = 0;
           // Go down to 4x4 down-sampling for variance.
           variance4x4downsample[i2 + j] = 1;
-          for (k = 0; k < 4; k++) {
+          for (int k = 0; k < 4; k++) {
             int x8_idx = x16_idx + ((k & 1) << 3);
             int y8_idx = y16_idx + ((k >> 1) << 3);
             VP8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
-            fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+            fill_variance_4x4avg(src, src_stride, dst, dst_stride, x8_idx,
+                                 y8_idx, vst2,
 #if CONFIG_AV1_HIGHBITDEPTH
                                  xd->cur_buf->flags,
 #endif
@@ -855,10 +746,197 @@
       }
     }
   }
+}
 
-  // Fill the rest of the variance tree by summing split partition values.
+static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
+                         unsigned int *y_sad_g,
+                         MV_REFERENCE_FRAME *ref_frame_partition, int mi_row,
+                         int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int num_planes = av1_num_planes(cm);
+  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+  // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
+  // is!!
+  MB_MODE_INFO *mi = xd->mi[0];
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  assert(yv12 != NULL);
+  const YV12_BUFFER_CONFIG *yv12_g = NULL;
+
+  // For non-SVC GOLDEN is another temporal reference. Check if it should be
+  // used as reference for partitioning.
+  if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
+      cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+    if (yv12_g && yv12_g != yv12) {
+      av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
+      *y_sad_g = cpi->fn_ptr[bsize].sdf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+          xd->plane[0].pre[0].stride);
+    }
+  }
+
+  av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                       get_ref_scale_factors(cm, LAST_FRAME), num_planes);
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE_FRAME;
+  mi->bsize = cm->seq_params.sb_size;
+  mi->mv[0].as_int = 0;
+  mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+  if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
+    if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+      const MV dummy_mv = { 0, 0 };
+      *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size,
+                                             mi_row, mi_col, &dummy_mv);
+    }
+  }
+  if (*y_sad == UINT_MAX) {
+    *y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                    xd->plane[0].pre[0].buf,
+                                    xd->plane[0].pre[0].stride);
+  }
+
+  // Pick the ref frame for partitioning, use golden frame only if its
+  // lower sad.
+  if (*y_sad_g < 0.9 * *y_sad) {
+    av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                         get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
+    mi->ref_frame[0] = GOLDEN_FRAME;
+    mi->mv[0].as_int = 0;
+    *y_sad = *y_sad_g;
+    *ref_frame_partition = GOLDEN_FRAME;
+    x->nonrd_prune_ref_frame_search = 0;
+  } else {
+    *ref_frame_partition = LAST_FRAME;
+    x->nonrd_prune_ref_frame_search =
+        cpi->sf.rt_sf.nonrd_prune_ref_frame_search;
+  }
+
+  set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
+                                cm->seq_params.sb_size, AOM_PLANE_Y,
+                                AOM_PLANE_Y);
+}
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
+
+  int i, j, k, m;
+  VP128x128 *vt;
+  VP16x16 *vt2 = NULL;
+  unsigned char force_split[85];
+  int avg_32x32;
+  int avg_64x64;
+  int max_var_32x32[4];
+  int min_var_32x32[4];
+  int var_32x32;
+  int var_64x64;
+  int min_var_64x64 = INT_MAX;
+  int max_var_64x64 = 0;
+  int avg_16x16[4][4];
+  int maxvar_16x16[4][4];
+  int minvar_16x16[4][4];
+  int64_t threshold_4x4avg;
+  uint8_t *s;
+  const uint8_t *d;
+  int sp;
+  int dp;
+
+  int is_key_frame =
+      (frame_is_intra_only(cm) ||
+       (cpi->use_svc &&
+        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+
+  assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+         cm->seq_params.sb_size == BLOCK_128X128);
+  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  const int num_64x64_blocks = is_small_sb ? 1 : 4;
+
+  unsigned int y_sad = UINT_MAX;
+  unsigned int y_sad_g = UINT_MAX;
+  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
+  CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
+
+  vt->split = td->vt64x64;
+
+  int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
+                            vbp_thresholds[2], vbp_thresholds[3],
+                            vbp_thresholds[4] };
+
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  int variance4x4downsample[64];
+  const int segment_id = xd->mi[0]->segment_id;
+
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+      cyclic_refresh_segment_id_boosted(segment_id) &&
+      cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+    set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff, 1);
+  } else {
+    set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
+                       x->content_state_sb.low_sumdiff, 0);
+  }
+
+  // For non keyframes, disable 4x4 average for low resolution when speed = 8
+  threshold_4x4avg = INT64_MAX;
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+  // 5-20 for the 16x16 blocks.
+  force_split[0] = 0;
+  memset(x->part_search_info.variance_low, 0,
+         sizeof(x->part_search_info.variance_low));
+
+  // Check if LAST frame is NULL or if the resolution of LAST is
+  // different than the current frame resolution, and if so, treat this frame
+  // as a key frame, for the purpose of the superblock partitioning.
+  // LAST == NULL can happen in cases where enhancement spatial layers are
+  // enabled dyanmically and the only reference is the spatial(GOLDEN).
+  // TODO(marpan): Check se of scaled references for the different resoln.
+  if (!frame_is_intra_only(cm)) {
+    const YV12_BUFFER_CONFIG *const ref =
+        get_ref_frame_yv12_buf(cm, LAST_FRAME);
+    if (ref == NULL || ref->y_crop_height != cm->height ||
+        ref->y_crop_width != cm->width) {
+      is_key_frame = 1;
+    }
+  }
+
+  if (!is_key_frame) {
+    setup_planes(cpi, x, &y_sad, &y_sad_g, &ref_frame_partition, mi_row,
+                 mi_col);
+    d = xd->plane[0].dst.buf;
+    dp = xd->plane[0].dst.stride;
+  } else {
+    d = AV1_VAR_OFFS;
+    dp = 0;
+  }
+
+  if (low_res && threshold_4x4avg < INT64_MAX)
+    CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2)));
+  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+  // for splits.
+  fill_variance_tree_leaves(cpi, x, vt, vt2, force_split, avg_16x16,
+                            maxvar_16x16, minvar_16x16, variance4x4downsample,
+                            thresholds, s, sp, d, dp);
+
+  avg_64x64 = 0;
   for (m = 0; m < num_64x64_blocks; ++m) {
     avg_32x32 = 0;
+    max_var_32x32[m] = 0;
+    min_var_32x32[m] = INT_MAX;
     const int m2 = m << 2;
     for (i = 0; i < 4; i++) {
       const int i2 = (m2 + i) << 2;
@@ -926,6 +1004,7 @@
           (max_var_32x32[m] - min_var_32x32[m]) > 3 * (thresholds[1] >> 3) &&
           max_var_32x32[m] > thresholds[1] >> 1)
         force_split[1 + m] = 1;
+      avg_64x64 += var_64x64;
     }
     if (is_small_sb) force_split[0] = 1;
   }
@@ -934,6 +1013,10 @@
     fill_variance_tree(vt, BLOCK_128X128);
     get_variance(&vt->part_variances.none);
     if (!is_key_frame &&
+        vt->part_variances.none.variance > (9 * avg_64x64) >> 5)
+      force_split[0] = 1;
+
+    if (!is_key_frame &&
         (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) &&
         max_var_64x64 > thresholds[0] >> 1)
       force_split[0] = 1;
@@ -995,8 +1078,8 @@
   }
 
   if (cpi->sf.rt_sf.short_circuit_low_temp_var) {
-    set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition,
-                          mi_col, mi_row);
+    set_low_temp_var_flag(cpi, &x->part_search_info, xd, vt, thresholds,
+                          ref_frame_partition, mi_col, mi_row);
   }
   chroma_check(cpi, x, bsize, y_sad, is_key_frame);
 
diff --git a/av1/encoder/var_based_part.h b/av1/encoder/var_based_part.h
index a80e25c..89e44e8 100644
--- a/av1/encoder/var_based_part.h
+++ b/av1/encoder/var_based_part.h
@@ -30,10 +30,61 @@
 #define QINDEX_HIGH_THR \
   220  // Use high qindex variance partition thresholds when qindex is above
        // this threshold
+#define QINDEX_LARGE_BLOCK_THR \
+  100  // Use increased thresholds for midres for speed 9 when qindex is above
+       // this threshold
 
+/*!\brief Set the thresholds for variance based partition.
+ *
+ * Set the variance split thresholds for following the block sizes:
+ * 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+ * 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is
+ * currently only used on key frame. The thresholds are based om Q, resolution,
+ * noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]      cpi                Top level encoder structure
+ * \param[in]      q                  q index
+ * \param[in]      content_lowsumdiff Low sumdiff flag for superblock
+ *
+ * \return Returns the set of thresholds in \c cpi->vbp_info.thresholds.
+ */
 void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
-                                           int content_state);
+                                           int content_lowsumdiff);
 
+/*!\brief Variance based partition selection.
+ *
+ * Select the partitioning based on the variance of the residual signal,
+ * residual generated as the difference between the source and prediction.
+ * The prediction is the reconstructed LAST or reconstructed GOLDEN, whichever
+ * has lower y sad. For LAST, option exists (speed feature) to use motion
+ * compensation based on superblock motion via int_pro_motion_estimation. For
+ * key frames reference is fixed 128 level, so variance is the source variance.
+ * The variance is computed for downsampled inputs (8x8 or 4x4 downsampled),
+ * and selection is done top-down via as set of partition thresholds. defined
+ * for each block level, and set based on Q, resolution, noise level, and
+ * content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       tile         Pointer to TileInfo
+ * \param[in]       td           Pointer to ThreadData
+ * \param[in]       x            Pointer to MACROBLOCK
+ * \param[in]       mi_row       Row coordinate of the superblock in a step
+ size of MI_SIZE
+ * \param[in]       mi_col       Column coordinate of the super block in a step
+ size of MI_SIZE
+ *
+ * \return Returns the partition in \c xd->mi[0]->sb_type. Also sets the low
+ * temporal variance flag and the color sensitivity flag (both used in
+ * nonrd_pickmode).
+ */
 int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
                                       ThreadData *td, MACROBLOCK *x, int mi_row,
                                       int mi_col);
diff --git a/av1/encoder/x86/av1_k_means_avx2.c b/av1/encoder/x86/av1_k_means_avx2.c
new file mode 100644
index 0000000..a96ed2e
--- /dev/null
+++ b/av1/encoder/x86/av1_k_means_avx2.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>  // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_calc_indices_dim1_avx2(const int *data, const int *centroids,
+                                uint8_t *indices, int n, int k) {
+  __m256i dist[PALETTE_MAX_SIZE];
+  __m256i v_zero = _mm256_setzero_si256();
+
+  for (int i = 0; i < n; i += 8) {
+    __m256i ind = _mm256_loadu_si256((__m256i *)data);
+    for (int j = 0; j < k; j++) {
+      __m256i cent = _mm256_set1_epi32((uint32_t)centroids[j]);
+      __m256i d1 = _mm256_sub_epi32(ind, cent);
+      dist[j] = _mm256_mullo_epi32(d1, d1);
+    }
+
+    ind = _mm256_setzero_si256();
+    for (int j = 1; j < k; j++) {
+      __m256i cmp = _mm256_cmpgt_epi32(dist[0], dist[j]);
+      __m256i dist1 = _mm256_andnot_si256(cmp, dist[0]);
+      __m256i dist2 = _mm256_and_si256(cmp, dist[j]);
+      dist[0] = _mm256_or_si256(dist1, dist2);
+      __m256i ind1 = _mm256_set1_epi32(j);
+      ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
+                            _mm256_and_si256(cmp, ind1));
+    }
+
+    __m256i p1 = _mm256_packus_epi32(ind, v_zero);
+    __m256i px = _mm256_permute4x64_epi64(p1, 0x58);
+    __m256i p2 = _mm256_packus_epi16(px, v_zero);
+    __m128i d1 = _mm256_extracti128_si256(p2, 0);
+
+    _mm_storel_epi64((__m128i *)indices, d1);
+
+    indices += 8;
+    data += 8;
+  }
+}
diff --git a/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/av1/encoder/x86/av1_ssim_opt_x86_64.asm
index faa2a23..6187581 100644
--- a/av1/encoder/x86/av1_ssim_opt_x86_64.asm
+++ b/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -67,7 +67,7 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(av1_ssim_parms_16x16_sse2) PRIVATE
+globalsym(av1_ssim_parms_16x16_sse2)
 sym(av1_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -157,7 +157,7 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(av1_ssim_parms_8x8_sse2) PRIVATE
+globalsym(av1_ssim_parms_8x8_sse2)
 sym(av1_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/av1/encoder/x86/highbd_temporal_filter_avx2.c b/av1/encoder/x86/highbd_temporal_filter_avx2.c
new file mode 100644
index 0000000..b5477ec
--- /dev/null
+++ b/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+  { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+  { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+  { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+  { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
+
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint32_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint16_t *src1 = frame1;
+  const uint16_t *src2 = frame2;
+  uint32_t *dst = frame_sse + 2;
+  for (int i = 0; i < block_height; i++) {
+    __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+    __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+    __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+    __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+    __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+    __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+    __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+    __m256i diff_lo =
+        _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+    __m256i diff_hi =
+        _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+    _mm256_storeu_si256((__m256i *)dst, diff_lo);
+    dst += 8;
+    _mm256_storeu_si256((__m256i *)dst, diff_hi);
+
+    src1 += stride, src2 += stride2;
+    dst += sse_stride - 8;
+  }
+}
+
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint32_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint16_t *src1 = frame1;
+  const uint16_t *src2 = frame2;
+  uint32_t *dst = frame_sse + 2;
+  for (int i = 0; i < block_height; i++) {
+    __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+    __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+    __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+    __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+    __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+    __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+    __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+    __m256i diff_lo =
+        _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+    __m256i diff_hi =
+        _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+    _mm256_storeu_si256((__m256i *)dst, diff_lo);
+    _mm256_storeu_si256((__m256i *)(dst + 8), diff_hi);
+
+    v_src1 = _mm256_loadu_si256((__m256i *)(src1 + 16));
+    v_src2 = _mm256_loadu_si256((__m256i *)(src2 + 16));
+    v_diff = _mm256_sub_epi16(v_src1, v_src2);
+    v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+    v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+    v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+    v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+    diff_lo =
+        _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+    diff_hi =
+        _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+    _mm256_storeu_si256((__m256i *)(dst + 16), diff_lo);
+    _mm256_storeu_si256((__m256i *)(dst + 24), diff_hi);
+
+    src1 += stride;
+    src2 += stride2;
+    dst += sse_stride;
+  }
+}
+
+static AOM_FORCE_INLINE void xx_load_and_pad_left(uint32_t *src,
+                                                  __m256i *v256tmp) {
+  *v256tmp = _mm256_loadu_si256((__m256i *)src);
+  // For the first column, replicate the first element twice to the left
+  __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0xEA);
+  *v256tmp = _mm256_inserti128_si256(*v256tmp,
+                                     _mm256_extracti128_si256(v256tmp1, 0), 0);
+}
+
+static AOM_FORCE_INLINE void xx_load_and_pad_right(uint32_t *src,
+                                                   __m256i *v256tmp) {
+  *v256tmp = _mm256_loadu_si256((__m256i *)src);
+  // For the last column, replicate the last element twice to the right
+  __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0x54);
+  *v256tmp = _mm256_inserti128_si256(*v256tmp,
+                                     _mm256_extracti128_si256(v256tmp1, 1), 1);
+}
+
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+  // Mask the required 5 values inside the vector
+  __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+  __m128i v128a, v128b;
+  // Extract 256b as two 128b registers A and B
+  v128a = _mm256_castsi256_si128(vtmp);
+  v128b = _mm256_extracti128_si256(vtmp, 1);
+  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+  v128a = _mm_add_epi32(v128a, v128b);
+  // B = [A2+B2, A3+B3, 0, 0]
+  v128b = _mm_srli_si128(v128a, 8);
+  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+  v128a = _mm_add_epi32(v128a, v128b);
+  // B = [A1+B1+A3+B3, 0, 0, 0]
+  v128b = _mm_srli_si128(v128a, 4);
+  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+  v128a = _mm_add_epi32(v128a, v128b);
+  return _mm_extract_epi32(v128a, 0);
+}
+
+static void highbd_apply_temporal_filter(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_sse[BH][BW];
+
+  if (block_width == 32) {
+    get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  } else {
+    get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  }
+
+  __m256i vsrc[5];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  int col;
+  uint32_t *src = frame_sse;
+  for (int i = 2; i < 5; i++) {
+    xx_load_and_pad_left(src, &vsrc[i]);
+    src += SSE_STRIDE;
+  }
+
+  // Copy first row to first 2 vectors
+  vsrc[0] = vsrc[2];
+  vsrc[1] = vsrc[2];
+
+  for (int row = 0; row < block_height - 3; row++) {
+    __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+    __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+    __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+    __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+    for (int i = 0; i < 4; i++) {
+      vsrc[i] = vsrc[i + 1];
+    }
+
+    xx_load_and_pad_left(src, &vsrc[4]);
+    src += SSE_STRIDE;
+
+    acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+    acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+    acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+    acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+  }
+  for (int row = block_height - 3; row < block_height; row++) {
+    __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+    __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+    __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+    __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+    for (int i = 0; i < 4; i++) {
+      vsrc[i] = vsrc[i + 1];
+    }
+
+    acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+    acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+    acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+    acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+  }
+  for (col = 4; col < block_width - 4; col += 4) {
+    src = frame_sse + col;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      vsrc[i] = _mm256_loadu_si256((__m256i *)src);
+      src += SSE_STRIDE;
+    }
+
+    // Copy first row to first 2 vectors
+    vsrc[0] = vsrc[2];
+    vsrc[1] = vsrc[2];
+
+    for (int row = 0; row < block_height - 3; row++) {
+      __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+      __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+      __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+      __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+      for (int i = 0; i < 4; i++) {
+        vsrc[i] = vsrc[i + 1];
+      }
+
+      vsrc[4] = _mm256_loadu_si256((__m256i *)src);
+
+      src += SSE_STRIDE;
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+    }
+    for (int row = block_height - 3; row < block_height; row++) {
+      __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+      __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+      __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+      __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+      for (int i = 0; i < 4; i++) {
+        vsrc[i] = vsrc[i + 1];
+      }
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+    }
+  }
+
+  src = frame_sse + col;
+
+  // Load and pad(for first and last col) 3 rows from the top
+  for (int i = 2; i < 5; i++) {
+    xx_load_and_pad_right(src, &vsrc[i]);
+    src += SSE_STRIDE;
+  }
+
+  // Copy first row to first 2 vectors
+  vsrc[0] = vsrc[2];
+  vsrc[1] = vsrc[2];
+
+  for (int row = 0; row < block_height - 3; row++) {
+    __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+    __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+    __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+    __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+    for (int i = 0; i < 4; i++) {
+      vsrc[i] = vsrc[i + 1];
+    }
+
+    xx_load_and_pad_right(src, &vsrc[4]);
+    src += SSE_STRIDE;
+
+    acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+    acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+    acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+    acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+  }
+  for (int row = block_height - 3; row < block_height; row++) {
+    __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+    __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+    __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+    __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+    for (int i = 0; i < 4; i++) {
+      vsrc[i] = vsrc[i + 1];
+    }
+
+    acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+    acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+    acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+    acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+  }
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+      // Scale down the difference for high bit depth input.
+      diff_sse >>= ((bd - 8) * 2);
+
+      const double window_error = diff_sse * inv_num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          weight_factor * window_error + block_error * inv_factor;
+
+      double scaled_error =
+          combined_error * d_factor[subblock_idx] * decay_factor;
+      scaled_error = AOMMIN(scaled_error, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+
+void av1_highbd_apply_temporal_filter_avx2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Decay factors for non-local mean approach.
+  // Smaller q -> smaller filtering weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+  uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint16_t *ref =
+        CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    highbd_apply_temporal_filter(
+        ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+        subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+        luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+        weight_factor, d_factor);
+    plane_offset += plane_h * plane_w;
+  }
+}
diff --git a/av1/encoder/x86/highbd_temporal_filter_sse2.c b/av1/encoder/x86/highbd_temporal_filter_sse2.c
new file mode 100644
index 0000000..bbb3771
--- /dev/null
+++ b/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding for 4 samples
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+  { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
+
+static void get_squared_error(const uint16_t *frame1, const unsigned int stride,
+                              const uint16_t *frame2,
+                              const unsigned int stride2, const int block_width,
+                              const int block_height, uint32_t *frame_sse,
+                              const unsigned int dst_stride) {
+  const uint16_t *src1 = frame1;
+  const uint16_t *src2 = frame2;
+  uint32_t *dst = frame_sse;
+
+  for (int i = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j += 8) {
+      __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+      __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+      __m128i vdiff = _mm_sub_epi16(vsrc1, vsrc2);
+      __m128i vmullo = _mm_mullo_epi16(vdiff, vdiff);
+      __m128i vmullh = _mm_mulhi_epi16(vdiff, vdiff);
+
+      __m128i vres1 = _mm_unpacklo_epi16(vmullo, vmullh);
+      __m128i vres2 = _mm_unpackhi_epi16(vmullo, vmullh);
+
+      _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+      _mm_storeu_si128((__m128i *)(dst + j + 6), vres2);
+    }
+
+    src1 += stride;
+    src2 += stride2;
+    dst += dst_stride;
+  }
+}
+
+static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col,
+                            int block_width) {
+  __m128i vtmp1 = _mm_loadu_si128((__m128i *)src);
+  __m128i vtmp2 = _mm_loadu_si128((__m128i *)(src + 4));
+  // For the first column, replicate the first element twice to the left
+  dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+  // For the last column, replicate the last element twice to the right
+  dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
+}
+
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+  __m128i veca, vecb;
+  // Mask and obtain the required 5 values inside the vector
+  veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+  vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+  veca = _mm_add_epi32(veca, vecb);
+  // B = [A2+B2, A3+B3, 0, 0]
+  vecb = _mm_srli_si128(veca, 8);
+  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+  veca = _mm_add_epi32(veca, vecb);
+  // B = [A1+B1+A3+B3, 0, 0, 0]
+  vecb = _mm_srli_si128(veca, 4);
+  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+  veca = _mm_add_epi32(veca, vecb);
+  return _mm_cvtsi128_si32(veca);
+}
+
+static void highbd_apply_temporal_filter(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_sse[BH][BW];
+
+  get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+                    frame_sse, SSE_STRIDE);
+
+  __m128i vsrc[5][2];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  for (int col = 0; col < block_width; col += 4) {
+    uint32_t *src = frame_sse + col;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      xx_load_and_pad(src, vsrc[i], col, block_width);
+      src += SSE_STRIDE;
+    }
+
+    // Padding for top 2 rows
+    vsrc[0][0] = vsrc[2][0];
+    vsrc[0][1] = vsrc[2][1];
+    vsrc[1][0] = vsrc[2][0];
+    vsrc[1][1] = vsrc[2][1];
+
+    for (int row = 0; row < block_height - 3; row++) {
+      __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+      __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+      __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+      __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+      __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+      __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+      __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+      __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+      vsrc[0][0] = vsrc[1][0];
+      vsrc[0][1] = vsrc[1][1];
+      vsrc[1][0] = vsrc[2][0];
+      vsrc[1][1] = vsrc[2][1];
+      vsrc[2][0] = vsrc[3][0];
+      vsrc[2][1] = vsrc[3][1];
+      vsrc[3][0] = vsrc[4][0];
+      vsrc[3][1] = vsrc[4][1];
+
+      // Load next row
+      xx_load_and_pad(src, vsrc[4], col, block_width);
+      src += SSE_STRIDE;
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+    }
+    for (int row = block_height - 3; row < block_height; row++) {
+      __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+      __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+      __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+      __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+      __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+      __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+      __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+      __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+      vsrc[0][0] = vsrc[1][0];
+      vsrc[0][1] = vsrc[1][1];
+      vsrc[1][0] = vsrc[2][0];
+      vsrc[1][1] = vsrc[2][1];
+      vsrc[2][0] = vsrc[3][0];
+      vsrc[2][1] = vsrc[3][1];
+      vsrc[3][0] = vsrc[4][0];
+      vsrc[3][1] = vsrc[4][1];
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+    }
+  }
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+      // Scale down the difference for high bit depth input.
+      diff_sse >>= ((bd - 8) * 2);
+
+      const double window_error = diff_sse * inv_num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          weight_factor * window_error + block_error * inv_factor;
+
+      double scaled_error =
+          combined_error * d_factor[subblock_idx] * decay_factor;
+      scaled_error = AOMMIN(scaled_error, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+
+void av1_highbd_apply_temporal_filter_sse2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Decay factors for non-local mean approach.
+  // Smaller q -> smaller filtering weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+  uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint16_t *ref =
+        CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    highbd_apply_temporal_filter(
+        ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+        subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+        luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+        weight_factor, d_factor);
+    plane_offset += plane_h * plane_w;
+  }
+}
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index f8703a2..32438d5 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -10,6 +10,7 @@
  */
 
 #include <immintrin.h>  // AVX2
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_dsp/x86/transpose_sse2.h"
@@ -49,7 +50,7 @@
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -181,8 +182,7 @@
 
         // Load two u16 values from dgd_ijkl combined as a u32,
         // then broadcast to 8x u32 slots of a 256
-        const __m256i dgd_ijkl =
-            _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
         // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
 
         acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -285,8 +285,7 @@
 
         // Load two u16 values from dgd_ijkl combined as a u32,
         // then broadcast to 8x u32 slots of a 256
-        const __m256i dgd_ijkl =
-            _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
         // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16
 
         acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -406,7 +405,7 @@
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -861,6 +860,229 @@
   }
 }
 
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h00, h01, h11, c0, c1;
+  const __m256i zero = _mm256_setzero_si256();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f1 = _mm256_sub_epi32(f1, d);
+      f2 = _mm256_sub_epi32(f2, d);
+
+      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f1, 32));
+      h00 = _mm256_add_epi64(h00, h00_even);
+      h00 = _mm256_add_epi64(h00, h00_odd);
+
+      const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+      const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h01 = _mm256_add_epi64(h01, h01_even);
+      h01 = _mm256_add_epi64(h01, h01_odd);
+
+      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h11 = _mm256_add_epi64(h11, h11_even);
+      h11 = _mm256_add_epi64(h11, h11_odd);
+
+      const __m256i c0_even = _mm256_mul_epi32(f1, s);
+      const __m256i c0_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+      c0 = _mm256_add_epi64(c0, c0_even);
+      c0 = _mm256_add_epi64(c0, c0_odd);
+
+      const __m256i c1_even = _mm256_mul_epi32(f2, s);
+      const __m256i c1_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+      c1 = _mm256_add_epi64(c1, c1_even);
+      c1 = _mm256_add_epi64(c1, c1_odd);
+    }
+  }
+
+  __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+  const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+  c_low = _mm256_add_epi64(c_low, c_high);
+  const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+                                         _mm256_castsi256_si128(c_low));
+
+  __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+  const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+  h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+  const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+                                           _mm256_castsi256_si128(h0x_low));
+
+  // Using the symmetric properties of H,  calculations of H[1][0] are not
+  // needed.
+  __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+  const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+  h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+  const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+                                           _mm256_castsi256_si128(h1x_low));
+
+  xx_storeu_128(C, c_128bit);
+  xx_storeu_128(H[0], h0x_128bit);
+  xx_storeu_128(H[1], h1x_128bit);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h00, c0;
+  const __m256i zero = _mm256_setzero_si256();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f1 = _mm256_sub_epi32(f1, d);
+
+      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f1, 32));
+      h00 = _mm256_add_epi64(h00, h00_even);
+      h00 = _mm256_add_epi64(h00, h00_odd);
+
+      const __m256i c0_even = _mm256_mul_epi32(f1, s);
+      const __m256i c0_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+      c0 = _mm256_add_epi64(c0, c0_even);
+      c0 = _mm256_add_epi64(c0, c0_odd);
+    }
+  }
+  const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+                                           _mm256_castsi256_si128(h00));
+  const __m128i h00_val =
+      _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+  const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+                                          _mm256_castsi256_si128(c0));
+  const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h11, c1;
+  const __m256i zero = _mm256_setzero_si256();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f2 = _mm256_sub_epi32(f2, d);
+
+      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h11 = _mm256_add_epi64(h11, h11_even);
+      h11 = _mm256_add_epi64(h11, h11_odd);
+
+      const __m256i c1_even = _mm256_mul_epi32(f2, s);
+      const __m256i c1_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+      c1 = _mm256_add_epi64(c1, c1_even);
+      c1 = _mm256_add_epi64(c1, c1_odd);
+    }
+  }
+
+  const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+                                           _mm256_castsi256_si128(h11));
+  const __m128i h11_val =
+      _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+  const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+                                          _mm256_castsi256_si128(c1));
+  const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width,
+                                       int height, int src_stride,
+                                       const uint8_t *dat8, int dat_stride,
+                                       int32_t *flt0, int flt0_stride,
+                                       int32_t *flt1, int flt1_stride,
+                                       int64_t H[2][2], int64_t C[2],
+                                       const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                        dat_stride, flt0, flt0_stride, flt1,
+                                        flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
 #if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_pixel_proj_error_avx2(
     const uint8_t *src8, int width, int height, int src_stride,
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
index a2f65a5..e0e3738 100644
--- a/av1/encoder/x86/pickrst_sse4.c
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -624,6 +624,429 @@
   return err;
 }
 
+// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
+// C and H need to be computed.
+static AOM_INLINE void calc_proj_params_r0_r1_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h00, h01, h11, c0, c1;
+  const __m128i zero = _mm_setzero_si128();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+      const __m128i s_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+      __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+      __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i h01_even = _mm_mul_epi32(f1, f2);
+      const __m128i h01_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+      h01 = _mm_add_epi64(h01, h01_even);
+      h01 = _mm_add_epi64(h01, h01_odd);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+  const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+  c_low = _mm_add_epi64(c_low, c_high);
+
+  __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+  const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+  h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Using the symmetric properties of H,  calculations of H[1][0] are not
+  // needed.
+  __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+  const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+  h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+  xx_storeu_128(C, c_low);
+  xx_storeu_128(H[0], h0x_low);
+  xx_storeu_128(H[1], h1x_low);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h00, c0;
+  const __m128i zero = _mm_setzero_si128();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+      const __m128i s_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+      __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+    }
+  }
+  const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+  const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h11, c1;
+  const __m128i zero = _mm_setzero_si128();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+      const __m128i s_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+      __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+  const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_c.
+void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height,
+                                 int src_stride, const uint8_t *dat8,
+                                 int dat_stride, int32_t *flt0, int flt0_stride,
+                                 int32_t *flt1, int flt1_stride,
+                                 int64_t H[2][2], int64_t C[2],
+                                 const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_sse4_1(src8, width, height, src_stride, dat8,
+                                  dat_stride, flt0, flt0_stride, flt1,
+                                  flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_sse4_1(src8, width, height, src_stride, dat8,
+                               dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_sse4_1(src8, width, height, src_stride, dat8,
+                               dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m128i h00, h01, h11, c0, c1;
+  const __m128i zero = _mm_setzero_si128();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m128i s_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+      __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i h01_even = _mm_mul_epi32(f1, f2);
+      const __m128i h01_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+      h01 = _mm_add_epi64(h01, h01_even);
+      h01 = _mm_add_epi64(h01, h01_odd);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+  const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+  c_low = _mm_add_epi64(c_low, c_high);
+
+  __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+  const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+  h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Using the symmetric properties of H,  calculations of H[1][0] are not
+  // needed.
+  __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+  const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+  h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+  xx_storeu_128(C, c_low);
+  xx_storeu_128(H[0], h0x_low);
+  xx_storeu_128(H[1], h1x_low);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m128i h00, c0;
+  const __m128i zero = _mm_setzero_si128();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m128i s_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+    }
+  }
+  const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+  const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m128i h11, c1;
+  const __m128i zero = _mm_setzero_si128();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m128i s_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+  const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width,
+                                         int height, int src_stride,
+                                         const uint8_t *dat8, int dat_stride,
+                                         int32_t *flt0, int flt0_stride,
+                                         int32_t *flt1, int flt1_stride,
+                                         int64_t H[2][2], int64_t C[2],
+                                         const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+                                          dat_stride, flt0, flt0_stride, flt1,
+                                          flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
 #if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_pixel_proj_error_sse4_1(
     const uint8_t *src8, int width, int height, int src_stride,
diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c
index f588bad..fefc036 100644
--- a/av1/encoder/x86/rdopt_avx2.c
+++ b/av1/encoder/x86/rdopt_avx2.c
@@ -11,6 +11,7 @@
 
 #include <assert.h>
 #include <immintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_ports/system_state.h"
 
@@ -31,8 +32,8 @@
   //                      [ m n o p ]
 
   const __m256i pixels = _mm256_set_epi64x(
-      *(uint64_t *)&diff[0 * stride], *(uint64_t *)&diff[1 * stride],
-      *(uint64_t *)&diff[2 * stride], *(uint64_t *)&diff[3 * stride]);
+      loadu_uint64(&diff[0 * stride]), loadu_uint64(&diff[1 * stride]),
+      loadu_uint64(&diff[2 * stride]), loadu_uint64(&diff[3 * stride]));
   // pixels = [d c b a h g f e] [l k j i p o n m] as i16
 
   const __m256i slli = _mm256_slli_epi64(pixels, 16);
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index a11f791..72914e1 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -127,23 +127,17 @@
   return _mm_extract_epi32(v128a, 0);
 }
 
-static void apply_temporal_filter_planewise(
+static void apply_temporal_filter(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
-    const double sigma, const int decay_control, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
-    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
-  assert(((block_width == 32) && (block_height == 32)) ||
-         ((block_width == 16) && (block_height == 16)));
-  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint16_t *frame_sse, uint32_t *luma_sse_sum,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
 
   uint32_t acc_5x5_sse[BH][BW];
-  const double h = decay_control * (0.7 + log(sigma + 1.0));
-  const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
-  uint16_t *frame_sse =
-      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
 
   if (block_width == 32) {
     get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
@@ -184,7 +178,7 @@
       }
 
       // Load next row to the last element
-      if (row <= block_width - 4) {
+      if (row <= block_height - 4) {
         vsrc[4] = xx_load_and_pad(src, col, block_width);
         src += SSE_STRIDE;
       } else {
@@ -201,84 +195,113 @@
   for (int i = 0, k = 0; i < block_height; i++) {
     for (int j = 0; j < block_width; j++, k++) {
       const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
-      int diff_sse = acc_5x5_sse[i][j];
-      int num_ref_pixels =
-          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
-
-      // Filter U-plane and V-plane using Y-plane. This is because motion
-      // search is only done on Y-plane, so the information from Y-plane will
-      // be more accurate.
-      if (plane != PLANE_TYPE_Y) {
-        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-            const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-            const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-            diff_sse += luma_sq_error[yy * SSE_STRIDE + xx];
-            ++num_ref_pixels;
-          }
-        }
-      }
-
-      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const double window_error = diff_sse * inv_num_ref_pixels;
       const int subblock_idx =
           (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error =
-          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          weight_factor * window_error + block_error * inv_factor;
 
-      const double scaled_diff =
-          AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
-      const int adjusted_weight =
-          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+      double scaled_error =
+          combined_error * d_factor[subblock_idx] * decay_factor;
+      scaled_error = AOMMIN(scaled_error, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
-      count[k] += adjusted_weight;
-      accumulator[k] += adjusted_weight * pixel_value;
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
     }
   }
 }
 
-void av1_apply_temporal_filter_planewise_avx2(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+void av1_apply_temporal_filter_avx2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
-  if (is_high_bitdepth) {
-    assert(0 && "Only support low bit-depth with avx2!");
-  }
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
+  assert(!is_high_bitdepth && "Only support low bit-depth with avx2!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y;
-  const int decay_control = frame_height >= 720 ? 4 : 3;
+  (void)is_high_bitdepth;
 
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  uint16_t luma_sq_error[SSE_STRIDE * BH];
-  uint16_t *chroma_sq_error =
-      (num_planes > 0)
-          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
-          : NULL;
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Decay factors for non-local mean approach.
+  // Smaller q -> smaller filtering weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
 
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
     const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
     const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1];
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
 
-    const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
+    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
     const int ss_x_shift =
-        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
     const int ss_y_shift =
-        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
 
-    apply_temporal_filter_planewise(
-        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
-        noise_levels[plane], decay_control, use_subblock, block_mse,
-        subblock_mses, q_factor, accum + mb_pels * plane,
-        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
-        ss_x_shift, ss_y_shift);
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+            }
+          }
+        }
+      }
+    }
+
+    apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+                          plane_w, plane_h, subblock_mses, accum + plane_offset,
+                          count + plane_offset, frame_sse, luma_sse_sum,
+                          inv_num_ref_pixels, decay_factor, inv_factor,
+                          weight_factor, d_factor);
+    plane_offset += plane_h * plane_w;
   }
-  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/av1/encoder/x86/temporal_filter_constants.h b/av1/encoder/x86/temporal_filter_constants.h
deleted file mode 100644
index 7cd61d7..0000000
--- a/av1/encoder/x86/temporal_filter_constants.h
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
-#define AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
-
-// Division using multiplication and shifting. The C implementation does:
-// modifier *= 3;
-// modifier /= index;
-// where 'modifier' is a set of summed values and 'index' is the number of
-// summed values.
-//
-// This equation works out to (m * 3) / i which reduces to:
-// m * 3/4
-// m * 1/2
-// m * 1/3
-//
-// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
-// m * C / 65536
-// we can create a C to replicate the division.
-//
-// m * 49152 / 65536 = m * 3/4
-// m * 32758 / 65536 = m * 1/2
-// m * 21846 / 65536 = m * 0.3333
-//
-// These are loaded using an instruction expecting int16_t values but are used
-// with _mm_mulhi_epu16(), which treats them as unsigned.
-#define NEIGHBOR_CONSTANT_4 (int16_t)49152
-#define NEIGHBOR_CONSTANT_5 (int16_t)39322
-#define NEIGHBOR_CONSTANT_6 (int16_t)32768
-#define NEIGHBOR_CONSTANT_7 (int16_t)28087
-#define NEIGHBOR_CONSTANT_8 (int16_t)24576
-#define NEIGHBOR_CONSTANT_9 (int16_t)21846
-#define NEIGHBOR_CONSTANT_10 (int16_t)19661
-#define NEIGHBOR_CONSTANT_11 (int16_t)17874
-#define NEIGHBOR_CONSTANT_13 (int16_t)15124
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
-};
-
-static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
-  TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
-  TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4
-};
-
-#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U
-#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U
-#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U
-#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U
-#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U
-#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U
-#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U
-#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U
-#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
-};
-
-static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4
-    };
-
-#define DIST_STRIDE ((BW) + 2)
-#endif  // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 98a6b82..d70792c 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -102,23 +102,17 @@
   return _mm_cvtsi128_si32(veca);
 }
 
-static void apply_temporal_filter_planewise(
+static void apply_temporal_filter(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
-    const double sigma, const int decay_control, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
-    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
-  assert(((block_width == 32) && (block_height == 32)) ||
-         ((block_width == 16) && (block_height == 16)));
-  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint16_t *frame_sse, uint32_t *luma_sse_sum,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
 
   uint32_t acc_5x5_sse[BH][BW];
-  const double h = decay_control * (0.7 + log(sigma + 1.0));
-  const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
-  uint16_t *frame_sse =
-      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
 
   get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
                     frame_sse, SSE_STRIDE);
@@ -178,85 +172,113 @@
   for (int i = 0, k = 0; i < block_height; i++) {
     for (int j = 0; j < block_width; j++, k++) {
       const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
-      int diff_sse = acc_5x5_sse[i][j];
-      int num_ref_pixels =
-          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
-
-      // Filter U-plane and V-plane using Y-plane. This is because motion
-      // search is only done on Y-plane, so the information from Y-plane will
-      // be more accurate.
-      if (plane != PLANE_TYPE_Y) {
-        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-            const int yy = (i << ss_y_shift) + ii;      // Y-coord on Y-plane.
-            const int xx = (j << ss_x_shift) + jj + 2;  // X-coord on Y-plane.
-            const int ww = SSE_STRIDE;                  // Stride of Y-plane.
-            diff_sse += luma_sq_error[yy * ww + xx];
-            ++num_ref_pixels;
-          }
-        }
-      }
-
-      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const double window_error = diff_sse * inv_num_ref_pixels;
       const int subblock_idx =
           (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error =
-          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          weight_factor * window_error + block_error * inv_factor;
 
-      const double scaled_diff =
-          AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
-      const int adjusted_weight =
-          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+      double scaled_error =
+          combined_error * d_factor[subblock_idx] * decay_factor;
+      scaled_error = AOMMIN(scaled_error, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
-      count[k] += adjusted_weight;
-      accumulator[k] += adjusted_weight * pixel_value;
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
     }
   }
 }
 
-void av1_apply_temporal_filter_planewise_sse2(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+void av1_apply_temporal_filter_sse2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
-  if (is_high_bitdepth) {
-    assert(0 && "Only support low bit-depth with sse2!");
-  }
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(!is_high_bitdepth && "Only support low bit-depth with sse2!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y;
-  const int decay_control = frame_height >= 720 ? 4 : 3;
+  (void)is_high_bitdepth;
 
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  uint16_t luma_sq_error[SSE_STRIDE * BH];
-  uint16_t *chroma_sq_error =
-      (num_planes > 0)
-          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
-          : NULL;
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Decay factors for non-local mean approach.
+  // Smaller q -> smaller filtering weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
 
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
     const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
     const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1];
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
 
-    const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
+    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
     const int ss_x_shift =
-        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
     const int ss_y_shift =
-        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
 
-    apply_temporal_filter_planewise(
-        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
-        noise_levels[plane], decay_control, use_subblock, block_mse,
-        subblock_mses, q_factor, accum + mb_pels * plane,
-        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
-        ss_x_shift, ss_y_shift);
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+                          plane_w, plane_h, subblock_mses, accum + plane_offset,
+                          count + plane_offset, frame_sse, luma_sse_sum,
+                          inv_num_ref_pixels, decay_factor, inv_factor,
+                          weight_factor, d_factor);
+    plane_offset += plane_h * plane_w;
   }
-  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/av1/encoder/x86/temporal_filter_sse4.c b/av1/encoder/x86/temporal_filter_sse4.c
deleted file mode 100644
index e3f9f5f..0000000
--- a/av1/encoder/x86/temporal_filter_sse4.c
+++ /dev/null
@@ -1,2044 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/av1_rtcd.h"
-#include "aom/aom_integer.h"
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/temporal_filter.h"
-#include "av1/encoder/x86/temporal_filter_constants.h"
-
-//////////////////////////
-// Low bit-depth Begins //
-//////////////////////////
-
-// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
-// difference squared, and store as unsigned 16-bit integer to dst.
-static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
-                                uint16_t *dst) {
-  const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
-  const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
-  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
-
-  __m128i dist_first;
-
-  dist_first = _mm_sub_epi16(a_first, b_first);
-  dist_first = _mm_mullo_epi16(dist_first, dist_first);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-}
-
-static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
-                                 uint16_t *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
-  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
-  const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);
-  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
-  const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
-
-  __m128i dist_first, dist_second;
-
-  dist_first = _mm_sub_epi16(a_first, b_first);
-  dist_second = _mm_sub_epi16(a_second, b_second);
-  dist_first = _mm_mullo_epi16(dist_first, dist_first);
-  dist_second = _mm_mullo_epi16(dist_second, dist_second);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-  _mm_storeu_si128((__m128i *)(dst + 8), dist_second);
-}
-
-static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
-  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
-}
-
-static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
-                                __m128i *reg_second) {
-  read_dist_8(dist, reg_first);
-  read_dist_8(dist + 8, reg_second);
-}
-
-// Average the value based on the number of values summed (9 for pixels away
-// from the border, 4 for pixels in corners, and 6 for other edge values).
-//
-// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
-// by weight.
-static __m128i average_8(__m128i sum, const __m128i *mul_constants,
-                         const int strength, const int rounding,
-                         const int weight) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 = _mm_set1_epi16(weight);
-  const __m128i sixteen = _mm_set1_epi16(16);
-
-  // modifier * 3 / index;
-  sum = _mm_mulhi_epu16(sum, *mul_constants);
-
-  sum = _mm_adds_epu16(sum, rounding_u16);
-  sum = _mm_srl_epi16(sum, strength_u128);
-
-  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
-  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
-  // So this needs to use the epu16 version which did not come until SSE4.
-  sum = _mm_min_epu16(sum, sixteen);
-
-  sum = _mm_sub_epi16(sixteen, sum);
-
-  return _mm_mullo_epi16(sum, weight_u16);
-}
-
-static __m128i average_4_4(__m128i sum, const __m128i *mul_constants,
-                           const int strength, const int rounding,
-                           const int weight_0, const int weight_1) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 =
-      _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1,
-                     weight_1, weight_1);
-  const __m128i sixteen = _mm_set1_epi16(16);
-
-  // modifier * 3 / index;
-  sum = _mm_mulhi_epu16(sum, *mul_constants);
-
-  sum = _mm_adds_epu16(sum, rounding_u16);
-  sum = _mm_srl_epi16(sum, strength_u128);
-
-  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
-  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
-  // So this needs to use the epu16 version which did not come until SSE4.
-  sum = _mm_min_epu16(sum, sixteen);
-
-  sum = _mm_sub_epi16(sixteen, sum);
-
-  return _mm_mullo_epi16(sum, weight_u16);
-}
-
-static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
-                              const __m128i *mul_constants_0,
-                              const __m128i *mul_constants_1,
-                              const int strength, const int rounding,
-                              const int weight) {
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 = _mm_set1_epi16(weight);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  __m128i input_0, input_1;
-
-  input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0);
-  input_0 = _mm_adds_epu16(input_0, rounding_u16);
-
-  input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1);
-  input_1 = _mm_adds_epu16(input_1, rounding_u16);
-
-  input_0 = _mm_srl_epi16(input_0, strength_u128);
-  input_1 = _mm_srl_epi16(input_1, strength_u128);
-
-  input_0 = _mm_min_epu16(input_0, sixteen);
-  input_1 = _mm_min_epu16(input_1, sixteen);
-  input_0 = _mm_sub_epi16(sixteen, input_0);
-  input_1 = _mm_sub_epi16(sixteen, input_1);
-
-  *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
-  *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
-}
-
-// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
-static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
-                                   uint16_t *count, uint32_t *accumulator) {
-  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
-  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
-  __m128i pred_0_u32, pred_1_u32;
-  __m128i accum_0_u32, accum_1_u32;
-
-  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
-  _mm_storeu_si128((__m128i *)count, count_u16);
-
-  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-}
-
-static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
-                                           const __m128i sum_1_u16,
-                                           const uint8_t *pred, uint16_t *count,
-                                           uint32_t *accumulator) {
-  const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
-          count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
-  __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
-          pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
-  __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
-  __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
-
-  count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
-  _mm_storeu_si128((__m128i *)count, count_0_u16);
-
-  count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
-  _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
-
-  pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
-  pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
-  pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
-  pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-  accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
-  accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-  accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
-  accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
-}
-
-// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
-// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
-static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
-  __m128i dist_reg, dist_left, dist_right;
-
-  dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
-  dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
-  dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
-
-  *sum = _mm_adds_epu16(dist_reg, dist_left);
-  *sum = _mm_adds_epu16(*sum, dist_right);
-}
-
-// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
-// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
-// the rest in sum_second.
-static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
-                              __m128i *sum_second) {
-  get_sum_8(y_dist, sum_first);
-  get_sum_8(y_dist + 8, sum_second);
-}
-
-// Read in a row of chroma values corresponds to a row of 16 luma values.
-static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
-                                           const uint16_t *v_dist,
-                                           __m128i *u_first, __m128i *u_second,
-                                           __m128i *v_first,
-                                           __m128i *v_second) {
-  if (!ss_x) {
-    // If there is no chroma subsampling in the horizontal direction, then we
-    // need to load 16 entries from chroma.
-    read_dist_16(u_dist, u_first, u_second);
-    read_dist_16(v_dist, v_first, v_second);
-  } else {  // ss_x == 1
-    // Otherwise, we only need to load 8 entries
-    __m128i u_reg, v_reg;
-
-    read_dist_8(u_dist, &u_reg);
-
-    *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
-    *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
-
-    read_dist_8(v_dist, &v_reg);
-
-    *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
-    *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
-  }
-}
-
-// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit
-// int in dst.
-static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i shift_right = _mm_srli_si128(*src, 2);
-
-  const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
-  const __m128i even = _mm_blend_epi16(*src, zero, 170);
-
-  *dst = _mm_add_epi32(even, odd);
-}
-
-// Add a row of luma distortion to 8 corresponding chroma mods.
-static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
-                                                 int ss_x, int ss_y,
-                                                 __m128i *u_mod,
-                                                 __m128i *v_mod) {
-  __m128i y_reg;
-  if (!ss_x) {
-    read_dist_8(y_dist, &y_reg);
-    if (ss_y == 1) {
-      __m128i y_tmp;
-      read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
-
-      y_reg = _mm_adds_epu16(y_reg, y_tmp);
-    }
-  } else {
-    __m128i y_first, y_second;
-    read_dist_16(y_dist, &y_first, &y_second);
-    if (ss_y == 1) {
-      __m128i y_tmp_0, y_tmp_1;
-      read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
-
-      y_first = _mm_adds_epu16(y_first, y_tmp_0);
-      y_second = _mm_adds_epu16(y_second, y_tmp_1);
-    }
-
-    hadd_epu16(&y_first, &y_first);
-    hadd_epu16(&y_second, &y_second);
-
-    y_reg = _mm_packus_epi32(y_first, y_second);
-  }
-
-  *u_mod = _mm_adds_epu16(*u_mod, y_reg);
-  *v_mod = _mm_adds_epu16(*v_mod, y_reg);
-}
-
-// Apply temporal filter to the luma components. This performs temporal
-// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
-// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void apply_temporal_filter_luma_16(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist,
-    const uint16_t *v_dist, const int16_t *const *neighbors_first,
-    const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
-    const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_first, mul_second;
-
-  __m128i sum_row_1_first, sum_row_1_second;
-  __m128i sum_row_2_first, sum_row_2_second;
-  __m128i sum_row_3_first, sum_row_3_second;
-
-  __m128i u_first, u_second;
-  __m128i v_first, v_second;
-
-  __m128i sum_row_first;
-  __m128i sum_row_second;
-
-  // Loop variables
-  unsigned int h;
-
-  assert(strength >= 0);
-  assert(strength <= 6);
-
-  assert(block_width == 16);
-
-  (void)block_width;
-
-  // First row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Add luma values
-  get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
-  get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
-  sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
-
-  // Add chroma values
-  read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
-                          &v_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    sum_row_first =
-        average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-    sum_row_second =
-        average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-  } else {
-    average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-               strength, rounding, weight);
-  }
-  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                          y_accum);
-
-  y_src += y_src_stride;
-  y_pre += y_pre_stride;
-  y_count += y_pre_stride;
-  y_accum += y_pre_stride;
-  y_dist += DIST_STRIDE;
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-
-  // Then all the rows except the last one
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
-
-  for (h = 1; h < block_height - 1; ++h) {
-    // Move the weight to bottom half
-    if (!use_whole_blk && h == block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-    // Shift the rows up
-    sum_row_1_first = sum_row_2_first;
-    sum_row_1_second = sum_row_2_second;
-    sum_row_2_first = sum_row_3_first;
-    sum_row_2_second = sum_row_3_second;
-
-    // Add luma values to the modifier
-    sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
-    sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
-
-    get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-    sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
-
-    // Add chroma values to the modifier
-    if (ss_y == 0 || h % 2 == 0) {
-      // Only calculate the new chroma distortion if we are at a pixel that
-      // corresponds to a new chroma row
-      read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
-                              &v_first, &v_second);
-
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
-      u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
-      v_dist += DIST_STRIDE;
-    }
-
-    sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-    sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      sum_row_first =
-          average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-      sum_row_second =
-          average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-    } else {
-      average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-                 strength, rounding, weight);
-    }
-    accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                            y_accum);
-
-    y_src += y_src_stride;
-    y_pre += y_pre_stride;
-    y_count += y_pre_stride;
-    y_accum += y_pre_stride;
-    y_dist += DIST_STRIDE;
-  }
-
-  // The last row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Shift the rows up
-  sum_row_1_first = sum_row_2_first;
-  sum_row_1_second = sum_row_2_second;
-  sum_row_2_first = sum_row_3_first;
-  sum_row_2_second = sum_row_3_second;
-
-  // Add luma values to the modifier
-  sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
-  sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
-
-  // Add chroma values to the modifier
-  if (ss_y == 0) {
-    // Only calculate the new chroma distortion if we are at a pixel that
-    // corresponds to a new chroma row
-    read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
-                            &v_second);
-  }
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    sum_row_first =
-        average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-    sum_row_second =
-        average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-  } else {
-    average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-               strength, rounding, weight);
-  }
-  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                          y_accum);
-}
-
-// Perform temporal filter for the luma component.
-static void apply_temporal_filter_luma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
-    const uint16_t *u_dist, const uint16_t *v_dist) {
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
-  const unsigned int mid_width = block_width >> 1,
-                     last_width = block_width - blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const int16_t *const *neighbors_first;
-  const int16_t *const *neighbors_second;
-
-  if (block_width == 16) {
-    // Special Case: The blockwidth is 16 and we are operating on a row of 16
-    // chroma pixels. In this case, we can't use the usualy left-midle-right
-    // pattern. We also don't support splitting now.
-    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
-    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
-    if (use_whole_blk) {
-      apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-          bottom_weight, NULL);
-    } else {
-      apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
-  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
-      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
-      neighbors_second, top_weight, bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  for (; blk_col < mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; blk_col < last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
-  }
-
-  // Right
-  neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
-  apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
-      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
-      neighbors_second, top_weight, bottom_weight, NULL);
-}
-
-// Apply temporal filter to the chroma components. This performs temporal
-// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
-// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void apply_temporal_filter_chroma_8(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
-    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
-    const int16_t *const *neighbors, int top_weight, int bottom_weight,
-    const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul;
-
-  __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
-  __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
-
-  __m128i u_sum_row, v_sum_row;
-
-  // Loop variable
-  unsigned int h;
-
-  (void)uv_block_width;
-
-  // First row
-  mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
-
-  // Add chroma values
-  get_sum_8(u_dist, &u_sum_row_2);
-  get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
-
-  u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
-
-  get_sum_8(v_dist, &v_sum_row_2);
-  get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
-
-  v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
-
-  // Add luma values
-  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    u_sum_row =
-        average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-    v_sum_row =
-        average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-  } else {
-    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-  }
-  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-  u_count += uv_pre_stride;
-  u_accum += uv_pre_stride;
-  v_count += uv_pre_stride;
-  v_accum += uv_pre_stride;
-
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
-  y_dist += DIST_STRIDE * (1 + ss_y);
-
-  // Then all the rows except the last one
-  mul = _mm_loadu_si128((const __m128i *)neighbors[1]);
-
-  for (h = 1; h < uv_block_height - 1; ++h) {
-    // Move the weight pointer to the bottom half of the blocks
-    if (h == uv_block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-
-    // Shift the rows up
-    u_sum_row_1 = u_sum_row_2;
-    u_sum_row_2 = u_sum_row_3;
-
-    v_sum_row_1 = v_sum_row_2;
-    v_sum_row_2 = v_sum_row_3;
-
-    // Add chroma values
-    u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
-    get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
-    u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
-
-    v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
-    get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
-    v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
-
-    // Add luma values
-    add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0],
-                              blk_fw[1]);
-      v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0],
-                              blk_fw[1]);
-    } else {
-      u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-      v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-    }
-
-    accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-    accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-
-    u_src += uv_src_stride;
-    u_pre += uv_pre_stride;
-    u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
-    v_pre += uv_pre_stride;
-    v_dist += DIST_STRIDE;
-    u_count += uv_pre_stride;
-    u_accum += uv_pre_stride;
-    v_count += uv_pre_stride;
-    v_accum += uv_pre_stride;
-
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
-    y_dist += DIST_STRIDE * (1 + ss_y);
-  }
-
-  // The last row
-  mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
-
-  // Shift the rows up
-  u_sum_row_1 = u_sum_row_2;
-  u_sum_row_2 = u_sum_row_3;
-
-  v_sum_row_1 = v_sum_row_2;
-  v_sum_row_2 = v_sum_row_3;
-
-  // Add chroma values
-  u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
-  v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
-
-  // Add luma values
-  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    u_sum_row =
-        average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-    v_sum_row =
-        average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-  } else {
-    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-  }
-
-  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-}
-
-// Perform temporal filter for the chroma components.
-static void apply_temporal_filter_chroma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
-  const unsigned int uv_width = block_width >> ss_x,
-                     uv_height = block_height >> ss_y;
-
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
-  const unsigned int uv_mid_width = uv_width >> 1,
-                     uv_last_width = uv_width - uv_blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const int16_t *const *neighbors;
-
-  if (uv_width == 8) {
-    // Special Case: We are subsampling in x direction on a 16x16 block. Since
-    // we are operating on a row of 8 chroma pixels, we can't use the usual
-    // left-middle-right pattern.
-    assert(ss_x);
-
-    if (ss_y) {
-      neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
-    } else {
-      neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
-    }
-
-    if (use_whole_blk) {
-      apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          top_weight, bottom_weight, NULL);
-    } else {
-      apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
-  }
-
-  apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
-      bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  for (; uv_blk_col < uv_mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; uv_blk_col < uv_last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
-  }
-
-  // Right
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
-  }
-
-  apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
-      bottom_weight, NULL);
-}
-
-static void apply_temporal_filter_yuv(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int use_whole_blk = !use_subblock;
-  const int *blk_fw = subblock_filter_weights;
-
-  // Block information (Y-plane).
-  const unsigned int block_height = block_size_high[block_size];
-  const unsigned int block_width = block_size_wide[block_size];
-  const int mb_pels = block_height * block_width;
-  const int y_src_stride = ref_frame->y_stride;
-  const int y_pre_stride = block_width;
-  const int mb_y_src_offset =
-      mb_row * block_height * ref_frame->y_stride + mb_col * block_width;
-
-  // Block information (UV-plane).
-  const int ss_y = mbd->plane[1].subsampling_y;
-  const int ss_x = mbd->plane[1].subsampling_x;
-  const unsigned int uv_height = block_height >> ss_y;
-  const unsigned int uv_width = block_width >> ss_x;
-  const int uv_src_stride = ref_frame->uv_stride;
-  const int uv_pre_stride = block_width >> ss_x;
-  const int mb_uv_src_offset =
-      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
-
-  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
-  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
-  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
-  const uint8_t *y_pre = pred;
-  const uint8_t *u_pre = pred + mb_pels;
-  const uint8_t *v_pre = pred + mb_pels * 2;
-  uint32_t *y_accum = accum;
-  uint32_t *u_accum = accum + mb_pels;
-  uint32_t *v_accum = accum + mb_pels * 2;
-  uint16_t *y_count = count;
-  uint16_t *u_count = count + mb_pels;
-  uint16_t *v_count = count + mb_pels * 2;
-
-  const unsigned int chroma_height = block_height >> ss_y,
-                     chroma_width = block_width >> ss_x;
-
-  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
-  const int *blk_fw_ptr = blk_fw;
-
-  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
-           *v_dist_ptr = v_dist + 1;
-  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
-  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
-
-  // Loop variables
-  unsigned int row, blk_col;
-
-  assert(block_width <= BW && "block width too large");
-  assert(block_height <= BH && "block height too large");
-  assert(block_width % 16 == 0 && "block width must be multiple of 16");
-  assert(block_height % 2 == 0 && "block height must be even");
-  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
-         "invalid chroma subsampling");
-  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
-  assert(blk_fw[0] >= 0 && "filter weight must be positive");
-  assert(
-      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
-      "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
-  assert(
-      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
-      "subblock filter weight must be less than 2");
-
-  // Precompute the difference sqaured
-  for (row = 0; row < block_height; row++) {
-    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
-      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
-                    y_dist_ptr + blk_col);
-    }
-    y_src_ptr += y_src_stride;
-    y_pre_ptr += y_pre_stride;
-    y_dist_ptr += DIST_STRIDE;
-  }
-
-  for (row = 0; row < chroma_height; row++) {
-    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
-      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
-                   u_dist_ptr + blk_col);
-      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
-                   v_dist_ptr + blk_col);
-    }
-
-    u_src_ptr += uv_src_stride;
-    u_pre_ptr += uv_pre_stride;
-    u_dist_ptr += DIST_STRIDE;
-    v_src_ptr += uv_src_stride;
-    v_pre_ptr += uv_pre_stride;
-    v_dist_ptr += DIST_STRIDE;
-  }
-
-  y_dist_ptr = y_dist + 1;
-  u_dist_ptr = u_dist + 1;
-  v_dist_ptr = v_dist + 1;
-
-  apply_temporal_filter_luma(y_src, y_src_stride, y_pre, y_pre_stride, u_src,
-                             v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride,
-                             block_width, block_height, ss_x, ss_y, strength,
-                             blk_fw_ptr, use_whole_blk, y_accum, y_count,
-                             y_dist_ptr, u_dist_ptr, v_dist_ptr);
-
-  apply_temporal_filter_chroma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
-      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
-      strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
-      y_dist_ptr, u_dist_ptr, v_dist_ptr);
-}
-
-////////////////////////
-// Low bit-depth Ends //
-////////////////////////
-
-///////////////////////////
-// High bit-depth Begins //
-///////////////////////////
-
-// Compute (a-b)**2 for 8 pixels with size 16-bit
-static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
-                                       uint32_t *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
-  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
-  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
-  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
-  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
-
-  __m128i dist_first, dist_second;
-
-  dist_first = _mm_sub_epi32(a_first, b_first);
-  dist_second = _mm_sub_epi32(a_second, b_second);
-  dist_first = _mm_mullo_epi32(dist_first, dist_first);
-  dist_second = _mm_mullo_epi32(dist_second, dist_second);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
-}
-
-// Sum up three neighboring distortions for the pixels
-static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
-  __m128i dist_reg, dist_left, dist_right;
-
-  dist_reg = _mm_loadu_si128((const __m128i *)dist);
-  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
-  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
-
-  *sum = _mm_add_epi32(dist_reg, dist_left);
-  *sum = _mm_add_epi32(*sum, dist_right);
-}
-
-static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
-                                    __m128i *sum_second) {
-  highbd_get_sum_4(dist, sum_first);
-  highbd_get_sum_4(dist + 4, sum_second);
-}
-
-// Average the value based on the number of values summed (9 for pixels away
-// from the border, 4 for pixels in corners, and 6 for other edge values, plus
-// however many values from y/uv plane are).
-//
-// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
-// by weight.
-static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
-                                    const __m128i *mul_constants,
-                                    const int strength, const int rounding,
-                                    const int weight) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u32 = _mm_set1_epi32(rounding);
-  const __m128i weight_u32 = _mm_set1_epi32(weight);
-  const __m128i sixteen = _mm_set1_epi32(16);
-  const __m128i zero = _mm_setzero_si128();
-
-  // modifier * 3 / index;
-  const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
-  const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
-  const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
-  const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
-
-  const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
-  const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
-  const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
-  const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
-
-  // Now we have
-  //   mul_lo: 00 a1 00 a0
-  //   mul_hi: 00 a3 00 a2
-  // Unpack as 64 bit words to get even and odd elements
-  //   unpack_lo: 00 a2 00 a0
-  //   unpack_hi: 00 a3 00 a1
-  // Then we can shift and OR the results to get everything in 32-bits
-  const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
-  const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
-  const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
-  const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
-
-  // Round
-  *output = _mm_add_epi32(mul, rounding_u32);
-  *output = _mm_srl_epi32(*output, strength_u128);
-
-  // Multiply with the weight
-  *output = _mm_min_epu32(*output, sixteen);
-  *output = _mm_sub_epi32(sixteen, *output);
-  *output = _mm_mullo_epi32(*output, weight_u32);
-}
-
-static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
-                                    const __m128i *sum_0_u32,
-                                    const __m128i *sum_1_u32,
-                                    const __m128i *mul_constants_0,
-                                    const __m128i *mul_constants_1,
-                                    const int strength, const int rounding,
-                                    const int weight) {
-  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
-                   weight);
-  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
-                   weight);
-}
-
-// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
-static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
-                                                 const __m128i sum_second_u32,
-                                                 const uint16_t *pred,
-                                                 uint16_t *count,
-                                                 uint32_t *accumulator) {
-  // Cast down to 16-bit ints
-  const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
-  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
-
-  __m128i pred_0_u32, pred_1_u32;
-  __m128i accum_0_u32, accum_1_u32;
-
-  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
-  _mm_storeu_si128((__m128i *)count, count_u16);
-
-  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-}
-
-static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
-  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
-}
-
-static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
-                                      __m128i *reg_second) {
-  highbd_read_dist_4(dist, reg_first);
-  highbd_read_dist_4(dist + 4, reg_second);
-}
-
-static INLINE void highbd_read_chroma_dist_row_8(
-    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
-    __m128i *u_second, __m128i *v_first, __m128i *v_second) {
-  if (!ss_x) {
-    // If there is no chroma subsampling in the horizontal direction, then we
-    // need to load 8 entries from chroma.
-    highbd_read_dist_8(u_dist, u_first, u_second);
-    highbd_read_dist_8(v_dist, v_first, v_second);
-  } else {  // ss_x == 1
-    // Otherwise, we only need to load 8 entries
-    __m128i u_reg, v_reg;
-
-    highbd_read_dist_4(u_dist, &u_reg);
-
-    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
-    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
-
-    highbd_read_dist_4(v_dist, &v_reg);
-
-    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
-    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
-  }
-}
-
-static void highbd_apply_temporal_filter_luma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
-    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
-    const uint32_t *const *neighbors_second, int top_weight,
-    int bottom_weight) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_first, mul_second;
-
-  __m128i sum_row_1_first, sum_row_1_second;
-  __m128i sum_row_2_first, sum_row_2_second;
-  __m128i sum_row_3_first, sum_row_3_second;
-
-  __m128i u_first, u_second;
-  __m128i v_first, v_second;
-
-  __m128i sum_row_first;
-  __m128i sum_row_second;
-
-  // Loop variables
-  unsigned int h;
-
-  assert(strength >= 0 && strength <= 14 &&
-         "invalid adjusted temporal filter strength");
-  assert(block_width == 8);
-
-  (void)block_width;
-
-  // First row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Add luma values
-  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
-  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
-  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
-  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
-  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
-
-  // Add chroma values
-  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                &v_first, &v_second);
-
-  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
-  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-
-  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-  // Get modifier and store result
-  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
-                   weight);
-
-  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                y_accum);
-
-  y_src += y_src_stride;
-  y_pre += y_pre_stride;
-  y_count += y_pre_stride;
-  y_accum += y_pre_stride;
-  y_dist += DIST_STRIDE;
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-
-  // Then all the rows except the last one
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
-
-  for (h = 1; h < block_height - 1; ++h) {
-    // Move the weight to bottom half
-    if (!use_whole_blk && h == block_height / 2) {
-      weight = bottom_weight;
-    }
-    // Shift the rows up
-    sum_row_1_first = sum_row_2_first;
-    sum_row_1_second = sum_row_2_second;
-    sum_row_2_first = sum_row_3_first;
-    sum_row_2_second = sum_row_3_second;
-
-    // Add luma values to the modifier
-    sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
-    sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
-
-    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-    sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
-
-    // Add chroma values to the modifier
-    if (ss_y == 0 || h % 2 == 0) {
-      // Only calculate the new chroma distortion if we are at a pixel that
-      // corresponds to a new chroma row
-      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                    &v_first, &v_second);
-
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
-      u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
-      v_dist += DIST_STRIDE;
-    }
-
-    sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-    sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-    // Get modifier and store result
-    highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                     &sum_row_second, &mul_first, &mul_second, strength,
-                     rounding, weight);
-    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                  y_accum);
-
-    y_src += y_src_stride;
-    y_pre += y_pre_stride;
-    y_count += y_pre_stride;
-    y_accum += y_pre_stride;
-    y_dist += DIST_STRIDE;
-  }
-
-  // The last row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Shift the rows up
-  sum_row_1_first = sum_row_2_first;
-  sum_row_1_second = sum_row_2_second;
-  sum_row_2_first = sum_row_3_first;
-  sum_row_2_second = sum_row_3_second;
-
-  // Add luma values to the modifier
-  sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
-  sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
-
-  // Add chroma values to the modifier
-  if (ss_y == 0) {
-    // Only calculate the new chroma distortion if we are at a pixel that
-    // corresponds to a new chroma row
-    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                  &v_first, &v_second);
-  }
-
-  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-  // Get modifier and store result
-  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
-                   weight);
-  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                y_accum);
-}
-
-// Perform temporal filter for the luma component.
-static void highbd_apply_temporal_filter_luma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
-    const uint32_t *u_dist, const uint32_t *v_dist) {
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
-  const unsigned int mid_width = block_width >> 1,
-                     last_width = block_width - blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const uint32_t *const *neighbors_first;
-  const uint32_t *const *neighbors_second;
-
-  // Left
-  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
-  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
-      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
-      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-      neighbors_first, neighbors_second, top_weight, bottom_weight);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  for (; blk_col < mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; blk_col < last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
-  }
-
-  // Right
-  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
-  highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
-      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
-      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-      neighbors_first, neighbors_second, top_weight, bottom_weight);
-}
-
-// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
-// subsampling in x direction, then we have 16 lumas, else we have 8.
-static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
-    const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
-    __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
-  __m128i y_reg_fst, y_reg_snd;
-  if (!ss_x) {
-    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-      y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
-      y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
-    }
-  } else {
-    // Temporary
-    __m128i y_fst, y_snd;
-
-    // First 8
-    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-
-      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
-      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
-    }
-
-    y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
-
-    // Second 8
-    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-
-      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
-      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
-    }
-
-    y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
-  }
-
-  *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
-  *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
-  *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
-  *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
-}
-
-// Apply temporal filter to the chroma components. This performs temporal
-// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
-// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void highbd_apply_temporal_filter_chroma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
-    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
-    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
-    int top_weight, int bottom_weight, const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_fst, mul_snd;
-
-  __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
-  __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
-  __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
-  __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
-
-  __m128i u_sum_row_fst, v_sum_row_fst;
-  __m128i u_sum_row_snd, v_sum_row_snd;
-
-  // Loop variable
-  unsigned int h;
-
-  (void)uv_block_width;
-
-  // First row
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
-
-  // Add chroma values
-  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
-  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
-
-  u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
-  u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
-
-  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
-  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
-
-  v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
-  v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
-
-  // Add luma values
-  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                       &u_sum_row_snd, &v_sum_row_fst,
-                                       &v_sum_row_snd);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-  } else {
-    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-  }
-  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                u_accum);
-  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                v_accum);
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-  u_count += uv_pre_stride;
-  u_accum += uv_pre_stride;
-  v_count += uv_pre_stride;
-  v_accum += uv_pre_stride;
-
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
-  y_dist += DIST_STRIDE * (1 + ss_y);
-
-  // Then all the rows except the last one
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]);
-
-  for (h = 1; h < uv_block_height - 1; ++h) {
-    // Move the weight pointer to the bottom half of the blocks
-    if (h == uv_block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-
-    // Shift the rows up
-    u_sum_row_1_fst = u_sum_row_2_fst;
-    u_sum_row_2_fst = u_sum_row_3_fst;
-    u_sum_row_1_snd = u_sum_row_2_snd;
-    u_sum_row_2_snd = u_sum_row_3_snd;
-
-    v_sum_row_1_fst = v_sum_row_2_fst;
-    v_sum_row_2_fst = v_sum_row_3_fst;
-    v_sum_row_1_snd = v_sum_row_2_snd;
-    v_sum_row_2_snd = v_sum_row_3_snd;
-
-    // Add chroma values
-    u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
-    u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
-    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
-    u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
-    u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
-
-    v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
-    v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
-    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
-    v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
-    v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
-
-    // Add luma values
-    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                         &u_sum_row_snd, &v_sum_row_fst,
-                                         &v_sum_row_snd);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                       rounding, blk_fw[0]);
-      highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                       rounding, blk_fw[1]);
-
-      highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                       rounding, blk_fw[0]);
-      highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                       rounding, blk_fw[1]);
-
-    } else {
-      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                       &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                       weight);
-      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                       &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                       weight);
-    }
-
-    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                  u_accum);
-    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                  v_accum);
-
-    u_src += uv_src_stride;
-    u_pre += uv_pre_stride;
-    u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
-    v_pre += uv_pre_stride;
-    v_dist += DIST_STRIDE;
-    u_count += uv_pre_stride;
-    u_accum += uv_pre_stride;
-    v_count += uv_pre_stride;
-    v_accum += uv_pre_stride;
-
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
-    y_dist += DIST_STRIDE * (1 + ss_y);
-  }
-
-  // The last row
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
-
-  // Shift the rows up
-  u_sum_row_1_fst = u_sum_row_2_fst;
-  u_sum_row_2_fst = u_sum_row_3_fst;
-  u_sum_row_1_snd = u_sum_row_2_snd;
-  u_sum_row_2_snd = u_sum_row_3_snd;
-
-  v_sum_row_1_fst = v_sum_row_2_fst;
-  v_sum_row_2_fst = v_sum_row_3_fst;
-  v_sum_row_1_snd = v_sum_row_2_snd;
-  v_sum_row_2_snd = v_sum_row_3_snd;
-
-  // Add chroma values
-  u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
-  v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
-  u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
-  v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
-
-  // Add luma values
-  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                       &u_sum_row_snd, &v_sum_row_fst,
-                                       &v_sum_row_snd);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-  } else {
-    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-  }
-
-  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                u_accum);
-  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                v_accum);
-}
-
-// Perform temporal filter for the chroma components.
-static void highbd_apply_temporal_filter_chroma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
-  const unsigned int uv_width = block_width >> ss_x,
-                     uv_height = block_height >> ss_y;
-
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
-  const unsigned int uv_mid_width = uv_width >> 1,
-                     uv_last_width = uv_width - uv_blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const uint32_t *const *neighbors_fst;
-  const uint32_t *const *neighbors_snd;
-
-  if (uv_width == 8) {
-    // Special Case: We are subsampling in x direction on a 16x16 block. Since
-    // we are operating on a row of 8 chroma pixels, we can't use the usual
-    // left-middle-right pattern.
-    assert(ss_x);
-
-    if (ss_y) {
-      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-    } else {
-      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-    }
-
-    if (use_whole_blk) {
-      highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-    } else {
-      highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  if (ss_x && ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  if (ss_x && ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  for (; uv_blk_col < uv_mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; uv_blk_col < uv_last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-  }
-
-  // Right
-  if (ss_x && ss_y) {
-    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
-  }
-
-  highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
-}
-
-static void highbd_apply_temporal_filter_yuv(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int use_whole_blk = !use_subblock;
-  const int *blk_fw = subblock_filter_weights;
-
-  // Block information (Y-plane).
-  const unsigned int block_height = block_size_high[block_size];
-  const unsigned int block_width = block_size_wide[block_size];
-  const int mb_pels = block_height * block_width;
-  const int y_src_stride = ref_frame->y_stride;
-  const int y_pre_stride = block_width;
-  const int mb_y_src_offset =
-      mb_row * block_height * ref_frame->y_stride + mb_col * block_width;
-
-  // Block information (UV-plane).
-  const int ss_y = mbd->plane[1].subsampling_y;
-  const int ss_x = mbd->plane[1].subsampling_x;
-  const unsigned int uv_height = block_height >> ss_y;
-  const unsigned int uv_width = block_width >> ss_x;
-  const int uv_src_stride = ref_frame->uv_stride;
-  const int uv_pre_stride = block_width >> ss_x;
-  const int mb_uv_src_offset =
-      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
-
-  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
-  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
-  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
-  const uint8_t *y_pre = pred;
-  const uint8_t *u_pre = pred + mb_pels;
-  const uint8_t *v_pre = pred + mb_pels * 2;
-  uint32_t *y_accum = accum;
-  uint32_t *u_accum = accum + mb_pels;
-  uint32_t *v_accum = accum + mb_pels * 2;
-  uint16_t *y_count = count;
-  uint16_t *u_count = count + mb_pels;
-  uint16_t *v_count = count + mb_pels * 2;
-
-  const unsigned int chroma_height = block_height >> ss_y,
-                     chroma_width = block_width >> ss_x;
-
-  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
-
-  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
-           *v_dist_ptr = v_dist + 1;
-  const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
-                 *u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
-                 *v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
-  const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
-                 *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
-                 *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
-
-  // Loop variables
-  unsigned int row, blk_col;
-
-  assert(block_width <= BW && "block width too large");
-  assert(block_height <= BH && "block height too large");
-  assert(block_width % 16 == 0 && "block width must be multiple of 16");
-  assert(block_height % 2 == 0 && "block height must be even");
-  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
-         "invalid chroma subsampling");
-  assert(strength >= 0 && strength <= 14 &&
-         "invalid adjusted temporal filter strength");
-  assert(blk_fw[0] >= 0 && "filter weight must be positive");
-  assert(
-      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
-      "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
-  assert(
-      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
-      "subblock filter weight must be less than 2");
-
-  // Precompute the difference squared
-  for (row = 0; row < block_height; row++) {
-    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
-      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
-                          y_dist_ptr + blk_col);
-    }
-    y_src_ptr += y_src_stride;
-    y_pre_ptr += y_pre_stride;
-    y_dist_ptr += DIST_STRIDE;
-  }
-
-  for (row = 0; row < chroma_height; row++) {
-    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
-      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
-                          u_dist_ptr + blk_col);
-      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
-                          v_dist_ptr + blk_col);
-    }
-
-    u_src_ptr += uv_src_stride;
-    u_pre_ptr += uv_pre_stride;
-    u_dist_ptr += DIST_STRIDE;
-    v_src_ptr += uv_src_stride;
-    v_pre_ptr += uv_pre_stride;
-    v_dist_ptr += DIST_STRIDE;
-  }
-
-  y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
-  u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
-  v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
-  y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
-  u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
-  v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
-
-  y_dist_ptr = y_dist + 1;
-  u_dist_ptr = u_dist + 1;
-  v_dist_ptr = v_dist + 1;
-
-  highbd_apply_temporal_filter_luma(
-      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
-      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
-      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum,
-      y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
-
-  highbd_apply_temporal_filter_chroma(
-      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
-      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
-      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum,
-      u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
-}
-
-/////////////////////////
-// High bit-depth Ends //
-/////////////////////////
-
-void av1_apply_temporal_filter_yuv_sse4_1(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
-  // TODO(any): Need to support when `num_planes != 3`, like C implementation.
-  assert(num_planes == 3);
-  (void)num_planes;
-  if (is_high_bitdepth) {
-    highbd_apply_temporal_filter_yuv(
-        ref_frame, mbd, block_size, mb_row, mb_col, strength, use_subblock,
-        subblock_filter_weights, pred, accum, count);
-  } else {
-    apply_temporal_filter_yuv(ref_frame, mbd, block_size, mb_row, mb_col,
-                              strength, use_subblock, subblock_filter_weights,
-                              pred, accum, count);
-  }
-}
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index f9e70eb..7e21cd5 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -84,8 +84,7 @@
 set_aom_config_var(CONFIG_WEBM_IO 1 "Enables WebM support.")
 
 # Debugging flags.
-set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 "Bitstream debugging flag.")
-set_aom_config_var(CONFIG_DEBUG 0 "Debug build flag.")
+set_aom_config_var(CONFIG_DEBUG 0 "Enable debug-only code.")
 set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 "Mismatch debugging flag.")
 
 # AV1 feature flags.
@@ -108,6 +107,7 @@
 set_aom_config_var(DECODE_HEIGHT_LIMIT 0 "Set limit for decode height.")
 set_aom_config_var(DECODE_WIDTH_LIMIT 0 "Set limit for decode width.")
 set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.")
+set_aom_config_var(CONFIG_USE_VMAF_RC 0 "Use libvmaf_rc tune for VMAF_NEG.")
 
 # AV1 experiment flags.
 set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment flag.")
@@ -115,6 +115,8 @@
 set_aom_config_var(CONFIG_DIST_8X8 0 "AV1 experiment flag.")
 set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment flag.")
 set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0
+                   "AV1 experiment flag for bitstream debugging.")
 set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment flag.")
 set_aom_config_var(CONFIG_SHARP_SETTINGS 0 "AV1 experiment flag.")
 set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
@@ -125,16 +127,15 @@
                    "Collect encoding component timing information.")
 set_aom_config_var(CONFIG_LPF_MASK 0
                    "Enable the use loop filter bitmasks for optimizations.")
-set_aom_config_var(CONFIG_HTB_TRELLIS 0
-                   "Enable the use of hash table for trellis optimizations.")
 set_aom_config_var(CONFIG_REALTIME_ONLY 0
                    "Build for RTC-only to reduce binary size.")
 set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1
                    "Build with high bitdepth support.")
+set_aom_config_var(CONFIG_AV1_TEMPORAL_DENOISING 0
+                   "Build with temporal denoising support.")
 set_aom_config_var(CONFIG_NN_V2 0 "Fully-connected neural nets ver.2.")
-set_aom_config_var(CONFIG_SUPERRES_IN_RECODE 1
-                   "Enable encoding both full-res and superres in recode loop"
-                   "when SUPERRES_AUTO mode is used.")
+set_aom_config_var(CONFIG_OPTICAL_FLOW_API 0
+                   "AV1 experiment flag for optical flow API.")
 #
 # Variables in this section control optional features of the build system.
 #
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index b870a94..43d60ae 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -44,7 +44,7 @@
 list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS})
 foreach(cache_var ${aom_build_vars})
   get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING)
-  if("${cache_var_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+  if(cache_var_helpstring STREQUAL cmake_cmdline_helpstring)
     set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}")
   endif()
 endforeach()
@@ -53,11 +53,10 @@
 # Detect target CPU.
 if(NOT AOM_TARGET_CPU)
   string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
-  if("${cpu_lowercase}" STREQUAL "amd64"
-     OR "${cpu_lowercase}" STREQUAL "x86_64")
-    if(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
+  if(cpu_lowercase STREQUAL "amd64" OR cpu_lowercase STREQUAL "x86_64")
+    if(CMAKE_SIZEOF_VOID_P EQUAL 4)
       set(AOM_TARGET_CPU "x86")
-    elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8)
+    elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
       set(AOM_TARGET_CPU "x86_64")
     else()
       message(
@@ -66,15 +65,13 @@
                     "      CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n"
                     "      CMAKE_GENERATOR=${CMAKE_GENERATOR}\n")
     endif()
-  elseif("${cpu_lowercase}" STREQUAL "i386"
-         OR "${cpu_lowercase}" STREQUAL "x86")
+  elseif(cpu_lowercase STREQUAL "i386" OR cpu_lowercase STREQUAL "x86")
     set(AOM_TARGET_CPU "x86")
-  elseif("${cpu_lowercase}" MATCHES "^arm"
-         OR "${cpu_lowercase}" MATCHES "^mips")
+  elseif(cpu_lowercase MATCHES "^arm" OR cpu_lowercase MATCHES "^mips")
     set(AOM_TARGET_CPU "${cpu_lowercase}")
-  elseif("${cpu_lowercase}" MATCHES "aarch64")
+  elseif(cpu_lowercase MATCHES "aarch64")
     set(AOM_TARGET_CPU "arm64")
-  elseif("${cpu_lowercase}" MATCHES "^ppc")
+  elseif(cpu_lowercase MATCHES "^ppc")
     set(AOM_TARGET_CPU "ppc")
   else()
     message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
@@ -105,7 +102,8 @@
 message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}")
 set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME})
 
-if("${CMAKE_BUILD_TYPE}" MATCHES "Deb")
+string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+if(build_type_lowercase STREQUAL "debug")
   set(CONFIG_DEBUG 1)
 endif()
 
@@ -120,8 +118,8 @@
     # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to
     # work.
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-    if("${AOM_TARGET_SYSTEM}" STREQUAL "Linux"
-       AND "${AOM_TARGET_CPU}" MATCHES "^armv[78]")
+    if(AOM_TARGET_SYSTEM STREQUAL "Linux"
+       AND AOM_TARGET_CPU MATCHES "^armv[78]")
       set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1)
     else()
       set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC)
@@ -129,7 +127,7 @@
   endif()
 endif()
 
-if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+if(AOM_TARGET_CPU STREQUAL "x86" OR AOM_TARGET_CPU STREQUAL "x86_64")
   find_program(AS_EXECUTABLE yasm $ENV{YASM_PATH})
   if(NOT AS_EXECUTABLE OR ENABLE_NASM)
     unset(AS_EXECUTABLE CACHE)
@@ -149,11 +147,11 @@
   get_asm_obj_format("objformat")
   set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS})
   string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
-elseif("${AOM_TARGET_CPU}" MATCHES "arm")
-  if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+elseif(AOM_TARGET_CPU MATCHES "arm")
+  if(AOM_TARGET_SYSTEM STREQUAL "Darwin")
     set(AS_EXECUTABLE as)
     set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT})
-  elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+  elseif(AOM_TARGET_SYSTEM STREQUAL "Windows")
     if(NOT AS_EXECUTABLE)
       set(AS_EXECUTABLE ${CMAKE_C_COMPILER} -c -mimplicit-it=always)
     endif()
@@ -197,13 +195,13 @@
   require_compiler_flag("-pg" YES)
 endif()
 
-if("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows\|Android")
+if(AOM_TARGET_SYSTEM MATCHES "Darwin\|Linux\|Windows\|Android")
   set(CONFIG_OS_SUPPORT 1)
 endif()
 
 # The default _WIN32_WINNT value in MinGW is 0x0502 (Windows XP with SP2). Set
 # it to 0x0601 (Windows 7).
-if("${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+if(AOM_TARGET_SYSTEM STREQUAL "Windows")
   add_compiler_flag_if_supported("-D_WIN32_WINNT=0x0601")
 endif()
 
@@ -277,7 +275,7 @@
   add_compiler_flag_if_supported("-Wdisabled-optimization")
   add_compiler_flag_if_supported("-Wextra")
   add_compiler_flag_if_supported("-Wfloat-conversion")
-  add_compiler_flag_if_supported("-Wimplicit-function-declaration")
+  add_c_flag_if_supported("-Wimplicit-function-declaration")
   add_compiler_flag_if_supported("-Wlogical-op")
   add_compiler_flag_if_supported("-Wpointer-arith")
   add_compiler_flag_if_supported("-Wshorten-64-to-32")
@@ -288,21 +286,25 @@
   add_compiler_flag_if_supported("-Wunused")
   add_compiler_flag_if_supported("-Wvla")
 
-  if(CMAKE_C_COMPILER_ID MATCHES "GNU"
-     AND "${SANITIZE}" MATCHES "address|undefined")
+  if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address|undefined")
 
     # This combination has more stack overhead, so we account for it by
     # providing higher stack limit than usual.
     add_c_flag_if_supported("-Wstack-usage=170000")
     add_cxx_flag_if_supported("-Wstack-usage=270000")
   elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected.
-    add_c_flag_if_supported("-Wstack-usage=117000")
+    add_c_flag_if_supported("-Wstack-usage=135000")
     add_cxx_flag_if_supported("-Wstack-usage=240000")
   else()
     add_c_flag_if_supported("-Wstack-usage=100000")
     add_cxx_flag_if_supported("-Wstack-usage=240000")
   endif()
 
+  if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address")
+    # Disable no optimization warning when compiling with sanitizers
+    add_compiler_flag_if_supported("-Wno-disabled-optimization")
+  endif()
+
   # Add -Wshadow only for C files to avoid massive gtest warning spam.
   add_c_flag_if_supported("-Wshadow")
 
@@ -311,7 +313,7 @@
 
   # Quiet gcc 6 vs 7 abi warnings:
   # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
-  if("${AOM_TARGET_CPU}" MATCHES "arm")
+  if(AOM_TARGET_CPU MATCHES "arm")
     add_cxx_flag_if_supported("-Wno-psabi")
   endif()
 
@@ -319,7 +321,7 @@
     add_compiler_flag_if_supported("-Werror")
   endif()
 
-  if("${CMAKE_BUILD_TYPE}" MATCHES "Rel")
+  if(build_type_lowercase MATCHES "rel")
     add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0")
   endif()
   add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE")
diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl
index dafccdc..e9f75dd 100755
--- a/build/cmake/rtcd.pl
+++ b/build/cmake/rtcd.pl
@@ -91,7 +91,9 @@
 
 sub add_proto {
   my $fn = splice(@_, -2, 1);
-  $ALL_FUNCS{$fn} = \@_;
+  my @proto = @_;
+  foreach (@proto) { tr/\t/ / }
+  $ALL_FUNCS{$fn} = \@proto;
   specialize $fn, "c";
 }
 
diff --git a/common/args.c b/common/args.c
index ec2a863..d7d7669 100644
--- a/common/args.c
+++ b/common/args.c
@@ -11,6 +11,7 @@
 
 #include "common/args.h"
 
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
@@ -18,12 +19,7 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/msvc.h"
 #include "aom/aom_codec.h"
-
-#if defined(__GNUC__) && __GNUC__
-extern void die(const char *fmt, ...) __attribute__((noreturn));
-#else
-extern void die(const char *fmt, ...);
-#endif
+#include "common/tools_common.h"
 
 struct arg arg_init(char **argv) {
   struct arg a;
@@ -147,19 +143,20 @@
 int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) {
   struct arg arg;
 
+  assert(def->has_val == 0 || def->has_val == 1 || def->has_val == -1);
+
   if (!argv[0] || argv[0][0] != '-') return 0;
 
   arg = arg_init(argv);
 
-  if (def->short_name && strlen(arg.argv[0]) == strlen(def->short_name) + 1 &&
-      !strcmp(arg.argv[0] + 1, def->short_name)) {
+  if (def->short_name && !strcmp(arg.argv[0] + 1, def->short_name)) {
     arg.name = arg.argv[0] + 1;
     arg.val = def->has_val ? arg.argv[1] : NULL;
     arg.argv_step = def->has_val ? 2 : 1;
   } else if (def->long_name) {
     const size_t name_len = strlen(def->long_name);
 
-    if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' &&
+    if (arg.argv[0][1] == '-' &&
         !strncmp(arg.argv[0] + 2, def->long_name, name_len) &&
         (arg.argv[0][name_len + 2] == '=' ||
          arg.argv[0][name_len + 2] == '\0')) {
@@ -169,13 +166,19 @@
     }
   }
 
-  if (arg.name && !arg.val && def->has_val)
-    die("Error: option %s requires argument.\n", arg.name);
+  if (arg.name) {
+    if (def->has_val == -1) {
+      arg.def = def;
+      *arg_ = arg;
+      return 1;
+    }
 
-  if (arg.name && arg.val && !def->has_val)
-    die("Error: option %s requires no argument.\n", arg.name);
+    if (!arg.val && def->has_val)
+      die("Error: option %s requires argument.\n", arg.name);
 
-  if (arg.name && (arg.val || !def->has_val)) {
+    if (arg.val && !def->has_val)
+      die("Error: option %s requires no argument.\n", arg.name);
+
     arg.def = def;
     *arg_ = arg;
     return 1;
@@ -199,24 +202,31 @@
 }
 
 void arg_show_usage(FILE *fp, const struct arg_def *const *defs) {
-  char option_text[40] = { 0 };
-
   for (; *defs; defs++) {
     const struct arg_def *def = *defs;
     char *short_val = def->has_val ? " <arg>" : "";
     char *long_val = def->has_val ? "=<arg>" : "";
+    int n = 0;
 
+    // Short options are indented with two spaces. Long options are indented
+    // with 12 spaces.
     if (def->short_name && def->long_name) {
       char *comma = def->has_val ? "," : ",      ";
 
-      snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val,
-               comma, def->long_name, long_val);
+      n = fprintf(fp, "  -%s%s%s --%s%s", def->short_name, short_val, comma,
+                  def->long_name, long_val);
     } else if (def->short_name)
-      snprintf(option_text, 37, "-%s%s", def->short_name, short_val);
+      n = fprintf(fp, "  -%s%s", def->short_name, short_val);
     else if (def->long_name)
-      snprintf(option_text, 37, "          --%s%s", def->long_name, long_val);
+      n = fprintf(fp, "            --%s%s", def->long_name, long_val);
 
-    fprintf(fp, "  %-37s\t%s\n", option_text, def->desc);
+    // Descriptions are indented with 40 spaces. If an option is 40 characters
+    // or longer, its description starts on the next line.
+    if (n < 40)
+      for (int i = 0; i < 40 - n; i++) fputc(' ', fp);
+    else
+      fputs("\n                                        ", fp);
+    fprintf(fp, "%s\n", def->desc);
 
     if (def->enums) {
       const struct arg_enum_list *listptr;
diff --git a/common/args.h b/common/args.h
index 286f7dd..c886762 100644
--- a/common/args.h
+++ b/common/args.h
@@ -38,7 +38,9 @@
 typedef struct arg_def {
   const char *short_name;
   const char *long_name;
-  int has_val;
+  int has_val;  //  0: The argument must not have a value.
+                //  1: The argument must have a value.
+                // -1: The argument may or may not have a value.
   const char *desc;
   const struct arg_enum_list *enums;
 } arg_def_t;
diff --git a/common/tools_common.c b/common/tools_common.c
index 51c1c52..2b199a5 100644
--- a/common/tools_common.c
+++ b/common/tools_common.c
@@ -9,14 +9,15 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "common/tools_common.h"
-
+#include <assert.h>
 #include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "common/tools_common.h"
+
 #if CONFIG_AV1_ENCODER
 #include "aom/aomcx.h"
 #endif
@@ -128,66 +129,107 @@
   return shortread;
 }
 
+struct CodecInfo {
+  // Pointer to a function of zero arguments that returns an aom_codec_iface_t.
+  aom_codec_iface_t *(*const interface)();
+  char *short_name;
+  uint32_t fourcc;
+};
+
 #if CONFIG_AV1_ENCODER
-static const AvxInterface aom_encoders[] = {
-  { "av1", AV1_FOURCC, &aom_codec_av1_cx },
+static const struct CodecInfo aom_encoders[] = {
+  { &aom_codec_av1_cx, "av1", AV1_FOURCC },
 };
 
 int get_aom_encoder_count(void) {
   return sizeof(aom_encoders) / sizeof(aom_encoders[0]);
 }
 
-const AvxInterface *get_aom_encoder_by_index(int i) { return &aom_encoders[i]; }
+aom_codec_iface_t *get_aom_encoder_by_index(int i) {
+  assert(i >= 0 && i < get_aom_encoder_count());
+  return aom_encoders[i].interface();
+}
 
-const AvxInterface *get_aom_encoder_by_name(const char *name) {
-  int i;
-
-  for (i = 0; i < get_aom_encoder_count(); ++i) {
-    const AvxInterface *encoder = get_aom_encoder_by_index(i);
-    if (strcmp(encoder->name, name) == 0) return encoder;
+aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name) {
+  for (int i = 0; i < get_aom_encoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_encoders[i];
+    if (strcmp(info->short_name, name) == 0) return info->interface();
   }
-
   return NULL;
 }
 
-// large scale tile encoding
-static const AvxInterface aom_lst_encoder = { "av1", LST_FOURCC,
-                                              &aom_codec_av1_cx };
-const AvxInterface *get_aom_lst_encoder(void) { return &aom_lst_encoder; }
+uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface) {
+  for (int i = 0; i < get_aom_encoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_encoders[i];
+    if (info->interface() == iface) {
+      return info->fourcc;
+    }
+  }
+  return 0;
+}
+
+const char *get_short_name_by_aom_encoder(aom_codec_iface_t *iface) {
+  for (int i = 0; i < get_aom_encoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_encoders[i];
+    if (info->interface() == iface) {
+      return info->short_name;
+    }
+  }
+  return NULL;
+}
+
 #endif  // CONFIG_AV1_ENCODER
 
 #if CONFIG_AV1_DECODER
-static const AvxInterface aom_decoders[] = {
-  { "av1", AV1_FOURCC, &aom_codec_av1_dx },
+static const struct CodecInfo aom_decoders[] = {
+  { &aom_codec_av1_dx, "av1", AV1_FOURCC },
 };
 
 int get_aom_decoder_count(void) {
   return sizeof(aom_decoders) / sizeof(aom_decoders[0]);
 }
 
-const AvxInterface *get_aom_decoder_by_index(int i) { return &aom_decoders[i]; }
+aom_codec_iface_t *get_aom_decoder_by_index(int i) {
+  assert(i >= 0 && i < get_aom_decoder_count());
+  return aom_decoders[i].interface();
+}
 
-const AvxInterface *get_aom_decoder_by_name(const char *name) {
-  int i;
-
-  for (i = 0; i < get_aom_decoder_count(); ++i) {
-    const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-    if (strcmp(decoder->name, name) == 0) return decoder;
+aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_decoders[i];
+    if (strcmp(info->short_name, name) == 0) return info->interface();
   }
-
   return NULL;
 }
 
-const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc) {
-  int i;
-
-  for (i = 0; i < get_aom_decoder_count(); ++i) {
-    const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-    if (decoder->fourcc == fourcc) return decoder;
+aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_decoders[i];
+    if (info->fourcc == fourcc) return info->interface();
   }
-
   return NULL;
 }
+
+const char *get_short_name_by_aom_decoder(aom_codec_iface_t *iface) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_decoders[i];
+    if (info->interface() == iface) {
+      return info->short_name;
+    }
+  }
+  return NULL;
+}
+
+uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_decoders[i];
+    if (info->interface() == iface) {
+      return info->fourcc;
+    }
+  }
+  return 0;
+}
+
 #endif  // CONFIG_AV1_DECODER
 
 void aom_img_write(const aom_image_t *img, FILE *file) {
diff --git a/common/tools_common.h b/common/tools_common.h
index 1ed0045..873a720 100644
--- a/common/tools_common.h
+++ b/common/tools_common.h
@@ -142,107 +142,27 @@
 
 #undef AOM_NO_RETURN
 
+// The AOM library can support different encoders / decoders. These
+// functions provide different ways to lookup / iterate through them.
+// The return result may be NULL to indicate no codec was found.
+int get_aom_encoder_count();
+aom_codec_iface_t *get_aom_encoder_by_index(int i);
+aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name);
+// If the interface is unknown, returns NULL.
+const char *get_short_name_by_aom_encoder(aom_codec_iface_t *encoder);
+// If the interface is unknown, returns 0.
+uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface);
+
+int get_aom_decoder_count();
+aom_codec_iface_t *get_aom_decoder_by_index(int i);
+aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name);
+aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc);
+const char *get_short_name_by_aom_decoder(aom_codec_iface_t *decoder);
+// If the interface is unknown, returns 0.
+uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface);
+
 int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame);
 
-///////////////////////////////////////////////////////////////////////////////
-// A description of the interfaces used to access the AOM codecs
-///////////////////////////////////////////////////////////////////////////////
-//
-// There are three levels of interfaces used to access the AOM codec: the
-// AVXInterface, the aom_codec_iface, and the aom_codec_ctx. Each of these
-// is described in detail here.
-//
-//
-// 1. AVXInterface
-//    (Related files: common/tools_common.c,  common/tools_common.h)
-//
-// The high-level interface to the AVx encoders / decoders. Each AvxInterface
-// contains the name of the codec (e.g., "av1"), the four character code
-// associated with it, and a function pointer to the actual interface (see the
-// documentation on aom_codec_iface_t for more info). This API
-// is meant for lookup / iteration over all known codecs.
-//
-// For the encoder, call get_aom_encoder_by_name(...) if you know the name
-// (e.g., "av1"); to iterate over all known encoders, use
-// get_aom_encoder_count() and get_aom_encoder_by_index(i). To get the
-// encoder specifically for large scale tile encoding, use
-// get_aom_lst_encoder().
-//
-// For the decoder, similar functions are available. There is also a
-// get_aom_decoder_by_fourcc(fourcc) to get the decoder based on the four
-// character codes.
-//
-// The main purpose of the AVXInterface is to get a reference to the
-// aom_codec_interface_t, pointed to by its codec_interface variable.
-//
-//
-// 2. aom_codec_iface_t
-//    (Related files: aom/aom_codec.h, aom/src/aom_codec.c,
-//    aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c,
-//    av1/av1_dx_iface.c)
-//
-// Used to initialize the codec context, which contains the configuration for
-// for modifying the encoder/decoder during run-time. See the documentation of
-// aom/aom_codec.h for more details. For the most part, users will call the
-// helper functions listed there, such as aom_codec_iface_name,
-// aom_codec_get_caps, etc., to interact with it.
-//
-// The main purpose of the aom_codec_iface_t is to provide a way to generate
-// a default codec config, find out what capabilities the implementation has,
-// and create an aom_codec_ctx_t (which is actually used to interact with the
-// codec).
-//
-// Note that the implementations of the aom_codec_iface_t are located in
-// av1/av1_cx_iface.c and av1/av1_dx_iface.c
-//
-//
-// 3. aom_codec_ctx_t
-//  (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c,
-//   aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c)
-//
-// The actual interface between user code and the codec. It stores the name
-// of the codec, a pointer back to the aom_codec_iface_t that initialized it,
-// initialization flags, a config for either encoder or the decoder, and a
-// pointer to internal data.
-//
-// The codec is configured / queried through calls to aom_codec_control,
-// which takes a control code (listed in aomcx.h and aomdx.h) and a parameter.
-// In the case of "getter" control codes, the parameter is modified to have
-// the requested value; in the case of "setter" control codes, the codec's
-// configuration is changed based on the parameter. Note that a aom_codec_err_t
-// is returned, which indicates if the operation was successful or not.
-//
-// Note that for the encoder, the aom_codec_alg_priv_t points to the
-// the aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder,
-// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored
-// here and also used in the core algorithm.
-//
-// At the end, aom_codec_destroy should be called for each initialized
-// aom_codec_ctx_t.
-
-typedef struct AvxInterface {
-  const char *const name;
-  const uint32_t fourcc;
-  // Pointer to a function of zero arguments that returns an aom_codec_iface_t
-  // pointer. E.g.:
-  //   aom_codec_iface_t *codec = interface->codec_interface();
-  aom_codec_iface_t *(*const codec_interface)();
-} AvxInterface;
-
-int get_aom_encoder_count(void);
-// Lookup the interface by index -- it must be the case that
-// i < get_aom_encoder_count()
-const AvxInterface *get_aom_encoder_by_index(int i);
-// Lookup the interface by name -- returns NULL if no match.
-const AvxInterface *get_aom_encoder_by_name(const char *name);
-const AvxInterface *get_aom_lst_encoder(void);
-
-int get_aom_decoder_count(void);
-const AvxInterface *get_aom_decoder_by_index(int i);
-const AvxInterface *get_aom_decoder_by_name(const char *name);
-// Lookup the interface by the fourcc -- returns NULL if no match.
-const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc);
-
 void aom_img_write(const aom_image_t *img, FILE *file);
 int aom_img_read(aom_image_t *img, FILE *file);
 
diff --git a/common/webmenc.cc b/common/webmenc.cc
index 6ae7df6..bb754e8 100644
--- a/common/webmenc.cc
+++ b/common/webmenc.cc
@@ -12,7 +12,10 @@
 #include "common/webmenc.h"
 
 #include <stdio.h>
+#include <string.h>
 
+#include <memory>
+#include <new>
 #include <string>
 
 #include "common/av1_config.h"
@@ -23,21 +26,73 @@
 namespace {
 const uint64_t kDebugTrackUid = 0xDEADBEEF;
 const int kVideoTrackNumber = 1;
+
+// Simplistic mechanism to detect if an argv parameter refers to
+// an input or output file. Returns the total number of arguments that
+// should be skipped.
+int skip_input_output_arg(const char *arg, const char *input_fname) {
+  if (strcmp(arg, input_fname) == 0) {
+    return 1;
+  }
+  if (strcmp(arg, "-o") == 0 || strcmp(arg, "--output") == 0) {
+    return 2;
+  }
+  if (strncmp(arg, "--output=", strlen("--output=")) == 0) {
+    return 1;
+  }
+  return 0;
+}
+
 }  // namespace
 
+char *extract_encoder_settings(const char *version, const char **argv, int argc,
+                               const char *input_fname) {
+  // + 9 for "version:" prefix and for null terminator.
+  size_t total_size = strlen(version) + 9;
+  int i = 1;
+  while (i < argc) {
+    int num_skip = skip_input_output_arg(argv[i], input_fname);
+    i += num_skip;
+    if (num_skip == 0) {
+      total_size += strlen(argv[i]) + 1;  // + 1 is for space separator.
+      ++i;
+    }
+  }
+  char *result = static_cast<char *>(malloc(total_size));
+  if (result == nullptr) {
+    return nullptr;
+  }
+  char *cur = result;
+  cur += snprintf(cur, total_size, "version:%s", version);
+  i = 1;
+  while (i < argc) {
+    int num_skip = skip_input_output_arg(argv[i], input_fname);
+    i += num_skip;
+    if (num_skip == 0) {
+      cur += snprintf(cur, total_size, " %s", argv[i]);
+      ++i;
+    }
+  }
+  *cur = '\0';
+  return result;
+}
+
 int write_webm_file_header(struct WebmOutputContext *webm_ctx,
                            aom_codec_ctx_t *encoder_ctx,
                            const aom_codec_enc_cfg_t *cfg,
                            stereo_format_t stereo_fmt, unsigned int fourcc,
-                           const struct AvxRational *par) {
-  mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
-  mkvmuxer::Segment *const segment = new mkvmuxer::Segment();
-  if (!writer || !segment) {
+                           const struct AvxRational *par,
+                           const char *encoder_settings) {
+  std::unique_ptr<mkvmuxer::MkvWriter> writer(
+      new (std::nothrow) mkvmuxer::MkvWriter(webm_ctx->stream));
+  std::unique_ptr<mkvmuxer::Segment> segment(new (std::nothrow)
+                                                 mkvmuxer::Segment());
+  if (writer == nullptr || segment == nullptr) {
     fprintf(stderr, "webmenc> mkvmuxer objects alloc failed, out of memory?\n");
     return -1;
   }
 
-  bool ok = segment->Init(writer);
+  bool ok = segment->Init(writer.get());
   if (!ok) {
     fprintf(stderr, "webmenc> mkvmuxer Init failed.\n");
     return -1;
@@ -116,13 +171,27 @@
     video_track->set_display_height(cfg->g_h);
   }
 
+  if (encoder_settings != nullptr) {
+    mkvmuxer::Tag *tag = segment->AddTag();
+    if (tag == nullptr) {
+      fprintf(stderr,
+              "webmenc> Unable to allocate memory for encoder settings tag.\n");
+      return -1;
+    }
+    ok = tag->add_simple_tag("ENCODER_SETTINGS", encoder_settings);
+    if (!ok) {
+      fprintf(stderr,
+              "webmenc> Unable to allocate memory for encoder settings tag.\n");
+      return -1;
+    }
+  }
+
   if (webm_ctx->debug) {
     video_track->set_uid(kDebugTrackUid);
   }
 
-  webm_ctx->writer = writer;
-  webm_ctx->segment = segment;
-
+  webm_ctx->writer = writer.release();
+  webm_ctx->segment = segment.release();
   return 0;
 }
 
diff --git a/common/webmenc.h b/common/webmenc.h
index a4aa992..c912208 100644
--- a/common/webmenc.h
+++ b/common/webmenc.h
@@ -38,6 +38,16 @@
   STEREO_FORMAT_RIGHT_LEFT = 11
 } UENUM1BYTE(stereo_format_t);
 
+// Simplistic mechanism to extract encoder settings, without having
+// to re-invoke the entire flag-parsing logic. It lists the codec version
+// and then copies the arguments as-is from argv, but skips the binary name,
+// any arguments that match the input filename, and the output flags "-o"
+// and "--output" (and the following argument for those flags). The caller
+// is responsible for free-ing the returned string. If there is insufficient
+// memory, it returns nullptr.
+char *extract_encoder_settings(const char *version, const char **argv, int argc,
+                               const char *input_fname);
+
 // The following functions wrap libwebm's mkvmuxer. All functions return 0 upon
 // success, or -1 upon failure.
 
@@ -45,7 +55,8 @@
                            aom_codec_ctx_t *encoder_ctx,
                            const aom_codec_enc_cfg_t *cfg,
                            stereo_format_t stereo_fmt, unsigned int fourcc,
-                           const struct AvxRational *par);
+                           const struct AvxRational *par,
+                           const char *encoder_settings);
 
 int write_webm_block(struct WebmOutputContext *webm_ctx,
                      const aom_codec_enc_cfg_t *cfg,
diff --git a/common/y4menc.c b/common/y4menc.c
index e3f5d5b..d228eaa 100644
--- a/common/y4menc.c
+++ b/common/y4menc.c
@@ -86,7 +86,7 @@
                           unsigned int bit_depth) {
   const char *color = monochrome ? monochrome_colorspace(bit_depth)
                                  : colorspace(bit_depth, csp, fmt);
-  return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s\n", width, height,
+  return snprintf(buf, len, "YUV4MPEG2 W%d H%d F%d:%d I%c %s\n", width, height,
                   framerate->numerator, framerate->denominator, 'p', color);
 }
 
diff --git a/doc/AlgorithmDescription.md b/doc/AlgorithmDescription.md
new file mode 100644
index 0000000..bfd64da
--- /dev/null
+++ b/doc/AlgorithmDescription.md
@@ -0,0 +1,799 @@
+<div style="font-size:3em; text-align:center;"> Algorithm Description </div>
+
+# Abstract
+This document describes technical aspects of coding tools included in
+the associated codec. This document is not a specification of the associated
+codec. Instead, it summarizes the highlighted features of coding tools for new
+developers. This document should be updated when significant new normative
+changes have been integrated into the associated codec.
+
+# Table of Contents
+
+[Abbreviations](#Abbreviations)
+
+[Algorithm description](#Algorithm-Description)
+
+- [Block Partitioning](#Block-Partitioning)
+  - [Coding block partition](#Coding-block-partition)
+  - [Transform block partition](#Transform-block-partition)
+- [Intra Prediction](#Intra-Prediction)
+  - [Directional intra prediction modes](#Directional-intra-prediction-modes)
+  - [Non-directional intra prediction modes](#Non-directional-intra-prediction-modes)
+  - [Recursive filtering modes](#Recursive-filtering-modes)
+  - [Chroma from Luma mode](#Chroma-from-Luma-mode)
+- [Inter Prediction](#Inter-Prediction)
+  - [Motion vector prediction](#Motion-vector-prediction)
+  - [Motion vector coding](#Motion-vector-coding)
+  - [Interpolation filter for motion compensation](#Interpolation-filter-for-motion-compensation)
+  - [Warped motion compensation](#Warped-motion-compensation)
+  - [Overlapped block motion compensation](#Overlapped-block-motion-compensation)
+  - [Reference frames](#Reference-frames)
+  - [Compound Prediction](#Compound-Prediction)
+- [Transform](#Transform)
+- [Quantization](#Quantization)
+- [Entropy Coding](#Entropy-Coding)
+- [Loop filtering and post-processing](#Loop-filtering-and-post-processing)
+  - [Deblocking](#Deblocking)
+  - [Constrained directional enhancement](#Constrained-directional-enhancement)
+  - [Loop Restoration filter](#Loop-Restoration-filter)
+  - [Frame super-resolution](#Frame-super-resolution)
+  - [Film grain synthesis](#Film-grain-synthesis)
+- [Screen content coding](#Screen-content-coding)
+  - [Intra block copy](#Intra-block-copy)
+  - [Palette mode](#Palette-mode)
+
+[References](#References)
+
+# Abbreviations
+
+CfL: Chroma from Luma\
+IntraBC: Intra block copy\
+LCU: Largest coding unit\
+OBMC: Overlapped Block Motion Compensation\
+CDEF: Constrained Directional Enhancement Filter
+
+# Algorithm Description
+
+## Block Partitioning
+
+### Coding block partition
+
+The largest coding block unit (LCU) applied in this codec is 128×128. In
+addition to no split mode `PARTITION_NONE`, the partition tree supports 9
+different partitioning patterns, as shown in below figure.
+
+<figure class="image"> <center><img src="img\partition_codingblock.svg"
+alt="Partition" width="360" /> <figcaption>Figure 1: Supported coding block
+partitions</figcaption> </figure>
+
+According to the number of sub-partitions, the 9 partition modes are summarized
+as follows: 1. Four partitions: `PARTITION_SPLIT`, `PARTITION_VERT_4`,
+`PARTITION_HORZ_4` 2. Three partitions (T-Shape): `PARTITION_HORZ_A`,
+`PARTITION_HORZ_B`, `PARTITION_VERT_A`, `PARTITION_HORZ_B` 3. Two partitions:
+`PARTITION_HORZ`, `PARTITION_VERT`
+
+Among all the 9 partitioning patterns, only `PARTITION_SPLIT` mode supports
+recursive partitioning, i.e., sub-partitions can be further split, other
+partitioning modes cannot further split. Particularly, for 8x8 and 128x128,
+`PARTITION_VERT_4`, `PARTITION_HORZ_4` are not used, and for 8x8, T-Shape
+partitions are not used either.
+
+### Transform block partition
+
+For both intra and inter coded blocks, the coding block can be further
+partitioned into multiple transform units with the partitioning depth up to 2
+levels. The mapping from the transform size of the current depth to the
+transform size of the next depth is shown in the following Table 1.
+
+<figure class="image"> <center><figcaption>Table 1: Transform partition size
+setting</figcaption> <img src="img\tx_partition.svg" alt="Partition" width="220"
+/> </figure>
+
+Furthermore, for intra coded blocks, the transform partition is done in a way
+that all the transform blocks have the same size, and the transform blocks are
+coded in a raster scan order. An example of the transform block partitioning for
+intra coded block is shown in the Figure 2.
+
+<figure class="image"> <center><img src="img\intra_tx_partition.svg"
+alt="Partition" width="600" /> <figcaption>Figure 2: Example of transform
+partitioning for intra coded block</figcaption> </figure>
+
+For inter coded blocks, the transform unit partitioning can be done in a
+recursive manner with the partitioning depth up to 2 levels. The transform
+partitioning supports 1:1 (square), 1:2/2:1, and 1:4/4:1 transform unit sizes
+ranging from 4×4 to 64×64. If the coding block is smaller than or equal to
+64x64, the transform block partitioning can only apply to luma component, for
+chroma blocks, the transform block size is identical to the coding block size.
+Otherwise, if the coding block width or height is greater than 64, then both the
+luma and chroma coding blocks will implicitly split into multiples of min(W,
+64)x min(H, 64) and min(W, 32)x min(H, 32) transform blocks, respectively.
+
+<figure class="image"> <center><img src="img\inter_tx_partition.svg"
+alt="Partition" width="400" /> <figcaption>Figure 3: Example of transform
+partitioning for inter coded block</figcaption> </figure>
+
+## Intra Prediction
+
+### Directional intra prediction modes
+
+Directional intra prediction modes are applied in intra prediction, which models
+local textures using a given direction pattern. Directional intra prediction
+modes are represented by nominal modes and angle delta. The nominal modes are
+similar set of intra prediction angles used in VP9, which includes 8 angles. The
+index value of angle delta is ranging from -3 ~ +3, and zero delta angle
+indicates a nominal mode. The prediction angle is represented by a nominal intra
+angle plus an angle delta. In total, there are 56 directional intra prediction
+modes, as shown in the following figure. In the below figure, solid arrows
+indicate directional intra prediction modes and dotted arrows represent non-zero
+angle delta.
+
+<figure class="image"> <center><img src="img\intra_directional.svg"
+alt="Directional intra" width="300" /> <figcaption>Figure 4: Directional intra
+prediction modes</figcaption> </figure>
+
+The nominal mode index and angle delta index is signalled separately, and
+nominal mode index is signalled before the associated angle delta index. It is
+noted that for small block sizes, where the coding gain from extending intra
+prediction angles may saturate, only the nominal modes are used and angle delta
+index is not signalled.
+
+### Non-directional intra prediction modes
+
+In addition to directional intra prediction modes, four non-directional intra
+modes which simulate smooth textures are also included. The four non-directional
+intra modes include `SMOOTH_V`, `SMOOTH_H`, `SMOOTH` and `PAETH predictor`.
+
+In `SMOOTH V`, `SMOOTH H` and `SMOOTH modes`, the prediction values are
+generated using quadratic interpolation along vertical, horizontal directions,
+or the average thereof. The samples used in the quadratic interpolation include
+reconstructed samples from the top and left neighboring blocks and samples from
+the right and bottom boundaries which are approximated by top reconstructed
+samples and the left reconstructed samples.
+
+In `PAETH predictor` mode, the prediction for each sample is assigned as one
+from the top (T), left (L) and top-left (TL) reference samples, which has the
+value closest to the Paeth predictor value, i.e., T + L -TL. The samples used in
+`PAETH predictor` are illustrated in below figure.
+
+<figure class="image"> <center><img src="img\intra_paeth.svg" alt="Directional
+intra" width="300" /> <figcaption>Figure 5: Paeth predictor</figcaption>
+</figure>
+
+### Recursive filtering modes
+
+Five filtering intra modes are defined, and each mode specify a set of eight
+7-tap filters. Given the selected filtering mode index (0~4), the current block
+is divided into 4x2 sub-blocks. For one 4×2 sub-block, each sample is predicted
+by 7-tap interpolation using the 7 top and left neighboring samples as inputs.
+Different filters are applied for samples located at different coordinates
+within a 4×2 sub-block. The prediction process can be done recursively in unit
+4x2 sub-block, which means that prediction samples generated for one 4x2
+prediction block can be used to predict another 4x2 sub-block.
+
+<figure class="image"> <center><img src="img\intra_recursive.svg"
+alt="Directional intra" width="300" /> <figcaption>Figure 6: Recursive filtering
+modes</figcaption> </figure>
+
+### Chroma from Luma mode
+
+Chroma from Luma (CfL) is a chroma intra prediction mode, which models chroma
+samples as a linear function of co-located reconstructed luma samples. To align
+the resolution between luma and chroma samples for different chroma sampling
+format, e.g., 4:2:0 and 4:2:2, reconstructed luma pixels may need to be
+sub-sampled before being used in CfL mode. In addition, the DC component is
+removed to form the AC contribution. In CfL mode, the model parameters which
+specify the linear function between two color components are optimized by
+encoder signalled in the bitstream.
+
+<figure class="image"> <center><img src="img\intra_cfl.svg" alt="Directional
+intra" width="700" /> <figcaption>Figure 7: CfL prediction</figcaption>
+</figure>
+
+## Inter Prediction
+
+### Motion vector prediction
+
+Motion vectors are predicted by neighboring blocks which can be either spatial
+neighboring blocks, or temporal neighboring blocks located in a reference frame.
+A set of MV predictors will be identified by checking all these blocks and
+utilized to encode the motion vector information.
+
+**Spatial motion vector prediction**
+
+There are two sets of spatial neighboring blocks that can be utilized for
+finding spatial MV predictors, including the adjacent spatial neighbors which
+are direct top and left neighbors of the current block, and second outer spatial
+neighbors which are close but not directly adjacent to the current block. The
+two sets of spatial neighboring blocks are illustrated in an example shown in
+Figure 8.
+
+<figure class="image"> <center><img src="img\inter_spatial_mvp.svg"
+alt="Directional intra" width="350" /><figcaption>Figure 8: Motion field
+estimation by linear projection</figcaption></figure>
+
+For each set of spatial neighbors, the top row will be checked from left to
+right and then the left column will be checked from top to down. For the
+adjacent spatial neighbors, an additional top-right block will be also checked
+after checking the left column neighboring blocks. For the non-adjacent spatial
+neighbors, the top-left block located at (-1, -1) position will be checked
+first, then the top row and left column in a similar manner as the adjacent
+neighbors. The adjacent neighbors will be checked first, then the temporal MV
+predictor that will be described in the next subsection will be checked second,
+after that, the non-adjacent spatial neighboring blocks will be checked.
+
+For compound prediction which utilizes a pair of reference frames, the
+non-adjacent spatial neighbors are not used for deriving the MV predictor.
+
+**Temporal motion vector prediction**
+
+In addition to spatial neighboring blocks, MV predictor can be also derived
+using co-located blocks of reference pictures, namely temporal MV predictor. To
+generate temporal MV predictor, the MVs of reference frames are first stored
+together with reference indices associated with the reference frame. Then for
+each 8x8 block of the current frame, the MVs of a reference frame which pass the
+8x8 block are identified and stored together with the reference frame index in a
+temporal MV buffer. In an example shown in Figure 5, the MV of reference frame 1
+(R1) pointing from R1 to a reference frame of R1 is identified, i.e., MVref,
+which passes a 8x8 block (shaded in blue dots) of current frame. Then this MVref
+is stored in the temporal MV buffer associated with this 8x8 block. <figure
+class="image"> <center><img src="img\inter_motion_field.svg" alt="Directional
+intra" width="800" /><figcaption>Figure 9: Motion field estimation by linear
+projection</figcaption></figure> Finally, given a couple of pre-defined block
+coordinates, the associated MVs stored in the temporal MV buffer are identified
+and projected accordingly to derive a temporal MV predictor which points from
+the current block to its reference frame, e.g., MV0 in Figure 5. In Figure 6,
+the pre-defined block positions for deriving temporal MV predictors of a 16x16
+block are shown and up to 7 blocks will be checked to find valid temporal MV
+predictors.<figure class="image"> <center><img
+src="img\inter_tmvp_positions.svg" alt="Directional intra" width="300"
+/><figcaption>Figure 10: Block positions for deriving temporal MV
+predictors</figcaption></figure> The temporal MV predictors are checked after
+the nearest spatial MV predictors but before the non-adjacent spatial MV
+predictors.
+
+All the spatial and temporal MV candidates will be put together in a pool, with
+each predictor associated with a weighting determined during the scanning of the
+spatial and temporal neighboring blocks. Based on the associated weightings, the
+candidates are sorted and ranked, and up to four candidates will be used as a
+list MV predictor list.
+
+### Motion vector coding
+
+### Interpolation filter for motion compensation
+
+<mark>[Ed.: to be added]</mark>
+
+### Warped motion compensation
+
+**Global warped motion**
+
+The global motion information is signalled at each inter frame, wherein the
+global motion type and motion parameters are included. The global motion types
+and the number of the associated parameters are listed in the following table.
+
+
+| Global motion type   | Number of parameters   |
+|:------------------:|:--------------------:|
+| Identity (zero motion)| 0 |
+| Translation | 2 |
+| Rotzoom  | 4 |
+| General affine | 6 |
+
+For an inter coded block, after the reference frame index is
+transmitted, if the motion of current block is indicated as global motion, the
+global motion type and the associated parameters of the given reference will be
+used for current block.
+
+**Local warped motion**
+
+For an inter coded block, local warped motion is allowed when the following
+conditions are all satisfied:
+
+* Current block is single prediction
+* Width or height is greater than or equal to 8 samples
+* At least one of the immediate neighbors uses same reference frame with current block
+
+If the local warped motion is used for current block, instead of signalling the
+affine parameters, they are estimated by using mean square minimization of the
+distance between the reference projection and modeled projection based on the
+motion vectors of current block and its immediate neighbors. To estimate the
+parameters of local warped motion, the projection sample pair of the center
+pixel in neighboring block and its corresponding pixel in the reference frame
+are collected if the neighboring block uses the same reference frame with
+current block. After that, 3 extra samples are created by shifting the center
+position by a quarter sample in one or two dimensions, and these samples are
+also considered as projection sample pairs to ensure the stability of the model
+parameter estimation process.
+
+
+### Overlapped block motion compensation
+
+For an inter-coded block, overlapped block motion compensation (OBMC) is allowed
+when the following conditions are all satisfied.
+
+* Current block is single prediction
+* Width or height is greater than or equal to 8 samples
+* At least one of the neighboring blocks are inter-coded blocks
+
+When OBMC is applied to current block, firstly, the initial inter prediction
+samples is generated by using the assigned motion vector of current block, then
+the inter predicted samples for the current block and inter predicted samples
+based on motion vectors from the above and left blocks are blended to generate
+the final prediction samples.The maximum number of neighboring motion vectors is
+limited based on the size of current block, and up to 4 motion vectors from each
+of upper and left blocks can be involved in the OBMC process of current block.
+
+One example of the processing order of neighboring blocks is shown in the
+following picture, wherein the values marked in each block indicate the
+processing order of the motion vectors of current block and neighboring blocks.
+To be specific, the motion vector of current block is firstly applied to
+generate inter prediction samples P0(x,y). Then motion vector of block 1 is
+applied to generate the prediction samples p1(x,y). After that, the prediction
+samples in the overlapping area between block 0 and block 1 is an weighted
+average of p0(x,y) and p1(x,y). The overlapping area of block 1 and block 0 is
+marked in grey in the following picture. The motion vectors of block 2, 3, 4 are
+further applied and blended in the same way.
+
+<figure class="image"> <center><img src="img\inter_obmc.svg" alt="Directional
+intra" width="300" /><figcaption>Figure 11: neighboring blocks for OBMC
+process</figcaption></figure>
+
+### Reference frames
+
+<mark>[Ed.: to be added]</mark>
+
+### Compound Prediction
+
+<mark>[Ed.: to be added]</mark>
+
+**Compound wedge prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Difference-modulated masked prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Frame distance-based compound prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Compound inter-intra prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+## Transform
+
+The separable 2D transform process is applied on prediction residuals. For the
+forward transform, a 1-D vertical transform is performed first on each column of
+the input residual block, then a horizontal transform is performed on each row
+of the vertical transform output. For the backward transform, a 1-D horizontal
+transform is performed first on each row of the input de-quantized coefficient
+block, then a vertical transform is performed on each column of the horizontal
+transform output. The primary 1-D transforms include four different types of
+transform: a) 4-point, 8-point, 16-point, 32-point, 64-point DCT-2; b) 4-point,
+8-point, 16-point asymmetric DST’s (DST-4, DST-7) and c) their flipped
+versions; d) 4-point, 8-point, 16-point, 32-point identity transforms. When
+transform size is 4-point, ADST refers to DST-7, otherwise, when transform size
+is greater than 4-point, ADST refers to DST-4.
+
+<figure class="image"> <center><figcaption>Table 2: Transform basis functions
+(DCT-2, DST-4 and DST-7 for N-point input.</figcaption> <img src=
+"img\tx_basis.svg" alt="Partition" width="450" /> </figure>
+
+For luma component, each transform block can select one pair of horizontal and
+vertical transform combination given a pre-defined set of transform type
+candidates, and the selection is explicitly signalled into the bitstream.
+However, the selection is not signalled when Max(width,height) is 64. When
+the maximum of transform block width and height is greater than or equal to 32,
+the set of transform type candidates depend on the prediction mode, as described
+in Table 3. Otherwise, when the maximum of transform block width and height is
+smaller than 32, the set of transform type candidates depend on the prediction
+mode, as described in Table 4.
+
+<figure class="image"> <center><figcaption>Table 3: Transform type candidates
+for luma component when max(width, height) is greater than or equal to 32.
+</figcaption> <img src="img\tx_cands_large.svg" alt="Partition" width="370" />
+</figure>
+
+<figure class="image"> <center><figcaption>Table 4: Transform type candidates
+for luma component when max(width, height) is smaller than 32. </figcaption>
+<img src="img\tx_cands_small.svg" alt="Partition" width="440" /> </figure>
+
+The set of transform type candidates (namely transform set) is defined in Table
+5.
+
+<figure class="image"> <center><figcaption>Table 5: Definition of transform set.
+</figcaption> <img src="img\tx_set.svg" alt="Partition" width="450" /> </figure>
+
+For chroma component, the transform type selection is done in an implicit way.
+For intra prediction residuals, the transform type is selected according to the
+intra prediction mode, as specified in Table 4. For inter prediction residuals,
+the transform type is selected according to the transform type selection of the
+co-located luma block. Therefore, for chroma component, there is no transform
+type signalling in the bitstream.
+
+<figure class="image"> <center><figcaption>Table 6: Transform type selection for
+chroma component intra prediction residuals.</figcaption> <img src=
+"img\tx_chroma.svg" alt="Partition" width="500" /> </figure>
+
+The computational cost of large size (e.g., 64-point) transforms is further
+reduced by zeroing out all the coefficients except the following two cases:
+
+1. The top-left 32×32 quadrant for 64×64/64×32/32×64 DCT_DCT hybrid transforms
+2. The left 32×16 area for 64×16 and top 16×32 for16×64 DCT_DCT hybrid transforms.
+
+Both the DCT-2 and ADST (DST-4, DST-7) are implemented using butterfly structure
+[1], which included multiple stages of butterfly operations. Each butterfly
+operations can be calculated in parallel and different stages are cascaded in a
+sequential order.
+
+## Quantization
+Quantization of transform coefficients may apply different quantization step
+size for DC and AC transform coefficients, and different quantization step size
+for luma and chroma transform coefficients. To specify the quantization step
+size, in the frame header, a _**base_q_idx**_ syntax element is first signalled,
+which is a 8-bit fixed length code specifying the quantization step size for
+luma AC coefficients. The valid range of _**base_q_idx**_ is [0, 255].
+
+After that, the delta value relative to base_q_idx for Luma DC coefficients,
+indicated as DeltaQYDc is further signalled. Furthermore, if there are more than
+one color plane, then a flag _**diff_uv_delta**_ is signaled to indicate whether
+Cb and Cr color components apply different quantization index values. If
+_**diff_uv_delta**_ is signalled as 0, then only the delta values relative to
+base_q_idx for chroma DC coefficients (indicated as DeltaQUDc) and AC
+coefficients (indicated as DeltaQUAc) are signalled. Otherwise, the delta values
+relative to base_q_idx for both the Cb and Cr DC coefficients (indicated as
+DeltaQUDc and DeltaQVDc) and AC coefficients (indicated as DeltaQUAc and
+DeltaQVAc) are signalled.
+
+The above decoded DeltaQYDc, DeltaQUAc, DeltaQUDc, DeltaQVAc and DeltaQVDc are
+added to _base_q_idx_ to derive the quantization indices. Then these
+quantization indices are further mapped to quantization step size according to
+two tables. For DC coefficients, the mapping from quantization index to
+quantization step size for 8-bit, 10-bit and 12-bit internal bit depth is
+specified by a lookup table Dc_Qlookup[3][256], and the mapping from
+quantization index to quantization step size for 8-bit, 10-bit and 12-bit is
+specified by a lookup table Ac_Qlookup[3][256].
+
+<figure class="image"> <center><img src="img\quant_dc.svg" alt="quant_dc"
+width="800" /><figcaption>Figure 11: Quantization step size of DC coefficients
+for different internal bit-depth</figcaption></figure>
+
+<figure class="image"> <center><img src="img\quant_ac.svg" alt="quant_ac"
+width="800" /><figcaption>Figure 12: Quantization step size of AC coefficients
+for different internal bit-depth</figcaption></figure>
+
+Given the quantization step size, indicated as _Q<sub>step_, the input quantized
+coefficients is further de-quantized using the following formula:
+
+_F_ = sign * ( (_f_ * _Q<sub>step_) % 0xFFFFFF ) / _deNorm_
+
+, where _f_ is the input quantized coefficient, _F_ is the output dequantized
+coefficient, _deNorm_ is a constant value derived from the transform block area
+size, as indicated by the following table:
+
+| _deNorm_ | Tx block area size |
+|----------|:--------------------------|
+| 1| Less than 512 samples |
+| 2 | 512 or 1024 samples |
+| 4 | Greater than 1024 samples |
+
+When the quantization index is 0, the quantization is performed using a
+quantization step size equal to 1, which is lossless coding mode.
+
+## Entropy Coding
+
+**Entropy coding engine**
+
+<mark>[Ed.: to be added]</mark>
+
+**Coefficient coding**
+
+For each transform unit, the coefficient coding starts with coding a skip sign,
+which is followed by the signaling of primary transform kernel type and the
+end-of-block (EOB) position in case the transform coding is not skipped. After
+that, the coefficient values are coded in a multiple level map manner plus sign
+values. The level maps are coded as three level planes, namely lower-level,
+middle-level and higher-level planes, and the sign is coded as another separate
+plane. The lower-level, middle-level and higher-level planes correspond to
+correspond to different ranges of coefficient magnitudes. The lower level plane
+corresponds to the range of 0–2, the middle level plane takes care of the
+range of 3–14, and the higher-level plane covers the range of 15 and above.
+
+The three level planes are coded as follows. After the EOB position is coded,
+the lower-level and middle-level planes are coded together in backward scan
+order, and the scan order refers to zig-zag scan applied on the entire transform
+unit basis. Then the sign plane and higher-level plane are coded together in
+forward scan order. After that, the remainder (coefficient level minus 14) is
+entropy coded using Exp-Golomb code.
+
+The context model applied to the lower level plane depends on the primary
+transform directions, including: bi-directional, horizontal, and vertical, as
+well as transform size, and up to five neighbor (in frequency domain)
+coefficients are used to derive the context. The middle level plane uses a
+similar context model, but the number of context neighbor coefficients is
+reduced from 5 to 2. The higher-level plane is coded by Exp-Golomb code without
+using context model. For the sign plane, except the DC sign that is coded using
+the DC signs from its neighboring transform units, sign values of other
+coefficients are coded directly without using context model.
+
+## Loop filtering and post-processing
+
+### Deblocking
+
+There are four methods when picking deblocking filter level, which are listed
+below:
+
+* LPF_PICK_FROM_FULL_IMAGE: search the full image with different values
+* LPF_PICK_FROM_Q: estimate the filter level based on quantizer and frame type
+* LPF_PICK_FROM_SUBIMAGE: estimate the level from a portion of image
+* LPF_PICK_MINIMAL_LPF: set the filter level to 0 and disable the deblocking
+
+When estimating the filter level from the full image or sub-image, the searching
+starts from the previous frame filter level, ends when the filter step is less
+or equal to zero. In addition to filter level, there are some other parameters
+which control the deblocking filter such as sharpness level, mode deltas, and
+reference deltas.
+
+Deblocking is performed at 128x128 super block level, and the vertical and
+horizontal edges are filtered respectively. For a 128x128 super block, the
+vertical/horizontal edges aligned with each 8x8 block is firstly filtered. If
+the 4x4 transform is used, the internal edge aligned with a 4x4 block will be
+further filtered. The filter length is switchable from 4-tap, 6-tap, 8-tap,
+14-tap, and 0-tap (no filtering). The location of filter taps are identified
+based on the number of filter taps in order to compute the filter mask. When
+finally performing the filtering, outer taps are added if there is high edge
+variance.
+
+### Constrained directional enhancement filter
+
+**Edge Direction Estimation**\
+In CDEF, edge direction search is performed at 8x8 block-level. There are
+eight edge directions in total, as illustrated in Figure 13.
+<figure class="image"> <center><img src="img\edge_direction.svg"
+alt="Edge direction" width="700" /> <figcaption>Figure 13: Line number
+k for pixels following direction d=0:7 in an 8x8 block.</figcaption> </figure>
+
+The optimal edge direction d_opt is found by maximizing the following
+term [3]:
+
+<figure class="image"> <center><img src="img\equ_edge_direction.svg"
+alt="Equation edge direction" width="250" /> </figure>
+<!-- $$d_{opt}=\max_{d} s_d$$
+$$s_d = \sum_{k}\frac{1}{N_{d,k}}(\sum_{p\in P_{d,k}}x_p)^2,$$ -->
+
+where x_p is the value of pixel p, P_{d,k} is the set of pixels in
+line k following direction d, N_{d,k} is the cardinality of P_{d,k}.
+
+**Directional filter**\
+CDEF consists two filter taps: the primary tap and the secondary tap.
+The primary tap works along the edge direction (as shown in Figure 14),
+while the secondary tap forms an oriented 45 degree off the edge direction
+ (as shown in Figure 15).
+
+<figure class="image"> <center><img src="img\primary_tap.svg"
+alt="Primary tap" width="700" /> <figcaption>Figure 14: Primary filter
+taps following edge direction. For even strengths a = 2 and b = 4, for
+odd strengths a = 3 and b = 3. The filtered pixel is shown in the
+highlighted center.</figcaption> </figure>
+
+<figure class="image"> <center><img src="img\secondary_tap.svg"
+alt="Edge direction" width="700" /> <figcaption>Figure 15: Secondary
+filter taps. The filtered pixel is shown in the highlighted center.
+</figcaption> </figure>
+
+CDEF can be described by the following equation:
+
+<figure class="image"> <center><img src="img\equ_dir_search.svg"
+alt="Equation direction search" width="720" /> </figure>
+
+<!-- $$y(i,j)=x(i,j)+round(\sum_{m,n}w^{(p)}_{d,m,n}f(x(m,x)-x(i,j),S^{(p)},
+D)+\sum_{m,n}w^{(s)}_{d,m,n}f(x(m,x)-x(i,j),S^{(s)},D)),$$ -->
+
+where x(i,j) and y(i,j) are the input and output reconstructed values
+of CDEF. p denotes primary tap, and s denotes secondary tap, w is
+the weight between primary and secondary tap. f(d,S,D) is a non-linear
+filtering function, S denotes filter strength, D is a damping parameter.
+For 8-bit content, S^p ranges from 0 to 15, and S^s can be
+0, 1, 2, or 4. D ranges from 3 to 6 for luma, and 2 to 4 for chroma.
+
+**Non linear filter**\
+CDEF uses a non-linear filtering function to prevent excessive blurring
+when applied across an edge. It is achieved by ignoring pixels that are
+too different from the current pixels to be filtered. When the difference
+between current pixel and it's neighboring pixel d is within a threshold,
+f(d,S,D) = d, otherwise f(d,S,D) = 0. Specifically, the strength S
+determines the maximum difference allowed and damping D determines the
+point to ignore the filter tap.
+
+### Loop Restoration filter
+
+**Separable symmetric wiener filter**
+
+Let F be a w x w 2D filter taps around the pixel to be filtered, denoted as
+a w^2 x 1 column vector. When compared with traditional Wiener Filter,
+Separable Symmetric Wiener Filter has the following three constraints in order
+to save signaling bits and reduce complexity [4]:
+
+1) The w x w filter window of is separated into horizontal and vertical w-tap
+convolutions.
+
+2) The horizontal and vertical filters are constrained to be symmetric.
+
+3) It is assumed that the summation of horizontal/vertical filter coefficients
+is 1.
+
+As a result, F can be written as F = column_vectorize[ab^T], subject to a(i)
+= a(w - 1 - i), b(i) = b(w - 1 - i), for i = [0, r - 1], and sum(a(i)) =
+sum(b(i)) = 1, where a is the vertical filters and b is the horizontal filters.
+The derivation of the filters a and b starts from an initial guess of
+horizontal and vertical filters, optimizing one of the two while holding the
+other fixed. In the implementation w = 7, thus, 3 taps need to be sent for
+filters a and b, respectively. When signaling the filter coefficients, 4, 5 and
+6 bits are used for the first three filter taps, and the remaining ones are
+obtained from the normalization and symmetry constraints. 30 bits in total are
+transmitted for both vertical and horizontal filters.
+
+
+**Dual self-guided filter**
+
+Dual self-guided filter is designed to firstly obtain two coarse restorations
+X1 and X2 of the degraded frame X, and the final restoration Xr is obtained as
+a combination of the degraded samples, and the difference between the degraded
+samples and the coarse restorations [4]:
+
+<figure class="image"> <center><img src="img\equ_dual_self_guided.svg"
+alt="Equation dual self guided filter" width="300" /> </figure>
+<!-- $$X_r = X + \alpha (X_1 - X) + \beta (X_2 - X)$$ -->
+
+At encoder side, alpha and beta are computed using:
+
+<figure class="image"> <center><img src="img\equ_dual_self_para.svg"
+alt="Equation dual self guided filter parameter" width="220" /> </figure>
+<!-- $${\alpha, \beta}^T = (A^T A) ^{-1} A^T b,$$ -->
+
+where A = {X1 - X, X2 - X}, b = Y - X, and Y is the original source.
+
+X1 and X2 are obtained using guided filtering, and the filtering is controlled
+by a radius r and a noise parameter e, where a higher r implies a higher
+spatial variance and a higher e implies a higher range variance [4]. X1 and X2
+can be described by {r1, e1} and {r2, e2}, respectively.
+
+The encoder sends a 6-tuple {r1, e1, r2, e2, alpha, beta} to the decoder. In
+the implementation, {r1, e1, r2, e2} uses a 3-bit codebook, and {alpha, beta}
+uses 7-bit each due to much higher precision, resulting in a total of 17 bits.
+r is always less or equal to 3 [4].
+
+Guided filtering can be described by a local linear model:
+
+<figure class="image"> <center><img src="img\equ_guided_filter.svg"
+alt="Equation guided filter" width="155" /> </figure>
+<!-- $$y=Fx+G,$$ -->
+
+where x and y are the input and output samples, F and G are determined by the
+statistics in the neighboring of the pixel to be filtered. It is called
+self-guided filtering when the guidance image is the same as the degraded
+image[4].
+
+Following are three steps when deriving F and G of the self-guided filtering:
+
+1) Compute mean u and variance d of pixels in a (2r + 1) x (2r + 1) window
+around the pixel to be filtered.
+
+2) For each pixel, compute f = d / (d + e); g = (1 - f)u.
+
+3) Compute F and G for each pixel as averages of f and g values in a 3 x 3
+window around the pixel for use in step 2.
+
+### Frame super-resolution
+
+In order to improve the perceptual quality of decoded pictures, a
+super-resolution process is applied at low bit-rates [5]. First, at encoder
+side, the source video is downscaled as a non-normative procedure. Second,
+the downscaled video is encoded, followed by deblocking and CDEF process.
+Third, a linear upscaling process is applied as a normative procedure to bring
+the encoded video back to it's original spatial resolution. Lastly, the loop
+restoration is applied to resolve part of the high frequency lost. The last
+two steps together are called super-resolving process [5]. Similarly, decoding,
+deblocking and CDEF processes are applied at lower spatial resolution at
+decoder side. Then, the frames go through the super-resolving process.
+In order to reduce overheads in line-buffers with respect to hardware
+implementation, the upscaling and downscaling process are applied to
+horizontal dimension only.
+
+### Film grain synthesis
+
+At encoder side, film grain is removed from the input video as a denoising
+process. Then, the structure and intensity of the input video are analyzed
+by canny edge detector, and smooth areas are used to estimate the strength
+of film grain. Once the strength is estimated, the denoised video and film
+grain parameters are sent to decoder side. Those parameters are used to
+synthesis the grain and add it back to the decoded video, producing the final
+output video.
+
+In order to reconstruct the film grain, the following parameters are sent to
+decoder side: lag value, autoregressive coefficients, values for precomputed
+look-up table index of chroma components, and a set of points for a piece-wise
+linear scaling function [6]. Those parameters are signaled as quantized
+integers including 64 bytes for scaling function and 74 bytes for
+autoregressive coefficients. Once the parameters are received, an
+autoregressive process is applied in a raster scan order to generate one 64x64
+luma and two 32x32 chroma film grain templates [6]. Those templates are used
+to generate the grain for the remaining part of a picture.
+
+## Screen content coding
+
+To improve the coding performance of screen content coding, the associated video
+codec incorporates several coding tools,for example, intra block copy
+(IntraBC) is employed to handle the repeated patterns in a screen picture, and
+palette mode is used to handle the screen blocks with a limited number of
+different colors.
+
+### Intra block copy
+
+Intra Block Copy (IntraBC) [2] is a coding tool similar to inter-picture
+prediction. The main difference is that in IntraBC, a predictor block is
+formed from the reconstructed samples (before application of in-loop filtering)
+of the current picture. Therefore, IntraBC can be considered as "motion
+compensation" within current picture.
+
+A block vector (BV) was coded to specify the location of the predictor block.
+The BV precision is integer. The BV will be signalled in the bitstream since the
+decoder needs it to locate the predictor. For current block, the flag use
+IntraBC indicating whether current block is IntraBC mode is first transmitted in
+bit stream. Then, if the current block is IntraBC mode, the BV difference diff
+is obtained by subtracting the reference BV from the current BV, and then diff
+is classified into four types according to the diff values of horizontal and
+vertical component. Type information needs to be transmitted into the bitstream,
+after that, diff values of two components may be signalled based on the type
+info.
+
+IntraBC is very effective for screen content coding, but it also brings a lot of
+difficulties to hardware design. To facilitate the hardware design, the
+following modifications are adopted.
+
+1) when IntraBC is allowed, the loop filters are disabled, which are de-blocking
+filter, the CDEF (Constrained Directional Enhancement Filter), and the Loop
+Restoration. By doing this, picture buffer of reconstructed samples can be
+shared between IntraBC and inter prediction.
+
+2) To facilitate parallel decoding, the prediction cannot exceed the restricted
+areas. For one super block, if the coordinate of its top-left position is (x0,
+y0), the prediction at position (x, y) can be accessed by IntraBC, if y < y0 and
+x < x0 + 2 * (y0 - y)
+
+3) To allow hardware writing back delay, immediate reconstructed areas cannot be
+accessed by IntraBC prediction. The restricted immediate reconstructed area can
+be 1 ∼ n super blocks. So on top of modification 2, if the coordinate of one
+super block's top-left position is (x0, y0), the prediction at position (x, y)
+can be accessed by IntraBC, if y < y0 and x < x0 + 2 * (y0 - y) - D, where D
+denotes the restricted immediate reconstructed area. When D is one super block,
+the prediction area is shown in below figure.
+
+<figure class="image"> <center><img src="img\SCC_IntraBC.svg" alt="Intra block
+copy" width="600" /> <figcaption>Figure 13: the prediction area for IntraBC mode
+in one super block prediction</figcaption> </figure>
+
+### Palette mode
+
+# References
+
+[1] J. Han, Y. Xu and D. Mukherjee, "A butterfly structured design of the hybrid
+transform coding scheme," 2013 Picture Coding Symposium (PCS), San Jose, CA,
+2013, pp. 17-20.\
+[2] J. Li, H. Su, A. Converse, B. Li, R. Zhou, B. Lin, J. Xu, Y. Lu, and R.
+Xiong, "Intra Block Copy for Screen Content in the Emerging AV1 Video Codec,"
+2018 Data Compression Conference, Snowbird, Utah, USA.\
+[3] S. Midtskogen and J.M. Valin. "The AV1 constrained directional enhancement
+ filter (CDEF)." In 2018 IEEE International Conference on Acoustics, Speech
+  and Signal Processing (ICASSP), pp. 1193-1197. IEEE, 2018.\
+[4] D. Mukherjee, S. Li, Y. Chen, A. Anis, S. Parker, and
+J. Bankoski. "A switchable loop-restoration with side-information framework
+for the emerging AV1 video codec." In 2017 IEEE International Conference on
+Image Processing (ICIP), pp. 265-269. IEEE, 2017.\
+[5] Y. Chen, D. Murherjee, J. Han, A. Grange, Y. Xu, Z. Liu,... & C.H.Chiang,
+(2018, June). "An overview of core coding tools in the AV1 video codec.""
+In 2018 Picture Coding Symposium (PCS) (pp. 41-45). IEEE.\
+[6] A. Norkin, & N. Birkbeck, (2018, March). "Film grain synthesis for AV1
+video codec." In 2018 Data Compression Conference (pp. 3-12). IEEE.
diff --git a/doc/dev_guide/av1_decoder.dox b/doc/dev_guide/av1_decoder.dox
new file mode 100644
index 0000000..f65ddb5
--- /dev/null
+++ b/doc/dev_guide/av1_decoder.dox
@@ -0,0 +1,11 @@
+/*!\page decoder_guide AV1 DECODER GUIDE
+
+  Describe AV1 decoding techniques here.
+
+  \cond
+  \if av1_md_support
+  [AV1 Algorithm Description](\ref LALGORITHMDESCRIPTION)
+  \endif
+  \endcond
+
+*/
diff --git a/doc/dev_guide/av1_encoder.dox b/doc/dev_guide/av1_encoder.dox
new file mode 100644
index 0000000..7241b95
--- /dev/null
+++ b/doc/dev_guide/av1_encoder.dox
@@ -0,0 +1,1608 @@
+/*!\page encoder_guide AV1 ENCODER GUIDE
+
+\tableofcontents
+
+\section architecture_introduction Introduction
+
+This document provides an architectural overview of the libaom AV1 encoder.
+
+It is intended as a high level starting point for anyone wishing to contribute
+to the project, that will help them to more quickly understand the structure
+of the encoder and find their way around the codebase.
+
+It stands above and will where necessary link to more detailed function
+level documents.
+
+\subsection  architecture_gencodecs Generic Block Transform Based Codecs
+
+Most modern video encoders including VP8, H.264, VP9, HEVC and AV1
+(in increasing order of complexity) share a common basic paradigm. This
+comprises separating a stream of raw video frames into a series of discrete
+blocks (of one or more sizes), then computing a prediction signal and a
+quantized, transform coded, residual error signal. The prediction and residual
+error signal, along with any side information needed by the decoder, are then
+entropy coded and packed to form the encoded bitstream. See Figure 1: below,
+where the blue blocks are, to all intents and purposes, the lossless parts of
+the encoder and the red block is the lossy part.
+
+This is of course a gross oversimplification, even in regard to the simplest
+of the above codecs.  For example, all of them allow for block based
+prediction at multiple different scales (i.e. different block sizes) and may
+use previously coded pixels in the current frame for prediction or pixels from
+one or more previously encoded frames. Further, they may support multiple
+different transforms and transform sizes and quality optimization tools like
+loop filtering.
+
+\image html genericcodecflow.png "" width=70%
+
+\subsection architecture_av1_structure AV1 Structure and Complexity
+
+As previously stated, AV1 adopts the same underlying paradigm as other block
+transform based codecs. However, it is much more complicated than previous
+generation codecs and supports many more block partitioning, prediction and
+transform options.
+
+AV1 supports block partitions of various sizes from 128x128 pixels down to 4x4
+pixels using a multi-layer recursive tree structure as illustrated in figure 2
+below.
+
+\image html av1partitions.png "" width=70%
+
+AV1 also provides 71 basic intra prediction modes, 56 single frame inter prediction
+modes (7 reference frames x 4 modes x 2 for OBMC (overlapped block motion
+compensation)), 12768 compound inter prediction modes (that combine inter
+predictors from two reference frames) and 36708 compound inter / intra
+prediction modes. Furthermore, in addition to simple inter motion estimation,
+AV1 also supports warped motion prediction using affine transforms.
+
+In terms of transform coding, it has 16 separable 2-D transform kernels
+\f$(DCT, ADST, fADST, IDTX)^2\f$ that can be applied at up to 19 different
+scales from 64x64 down to 4x4 pixels.
+
+When combined together, this means that for any one 8x8 pixel block in a
+source frame, there are approximately 45,000,000 different ways that it can
+be encoded.
+
+Consequently, AV1 requires complex control processes. While not necessarily
+a normative part of the bitstream, these are the algorithms that turn a set
+of compression tools and a bitstream format specification, into a coherent
+and useful codec implementation. These may include but are not limited to
+things like :-
+
+- Rate distortion optimization (The process of trying to choose the most
+  efficient combination of block size, prediction mode, transform type
+  etc.)
+- Rate control (regulation of the output bitrate)
+- Encoder speed vs quality trade offs.
+- Features such as two pass encoding or optimization for low delay
+  encoding.
+
+For a more detailed overview of AV1's encoding tools and a discussion of some
+of the design considerations and hardware constraints that had to be
+accommodated, please refer to <a href="https://arxiv.org/abs/2008.06091">
+A Technical Overview of AV1</a>.
+
+Figure 3 provides a slightly expanded but still simplistic view of the
+AV1 encoder architecture with blocks that relate to some of the subsequent
+sections of this document. In this diagram, the raw uncompressed frame buffers
+are shown in dark green and the reconstructed frame buffers used for
+prediction in light green. Red indicates those parts of the codec that are
+(or may be) lossy, where fidelity can be traded off against compression
+efficiency, whilst light blue shows algorithms or coding tools that are
+lossless. The yellow blocks represent non-bitstream normative configuration
+and control algorithms.
+
+\image html av1encoderflow.png "" width=70%
+
+\section architecture_command_line The Libaom Command Line Interface
+
+ Add details or links here: TODO ? elliotk@
+
+\section architecture_enc_data_structures Main Encoder Data Structures
+
+The following are the main high level data structures used by the libaom AV1
+encoder and referenced elsewhere in this overview document:
+
+- \ref AV1_COMP
+    - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+    - \ref AV1_COMP.alt_ref_buffer (\ref yv12_buffer_config)
+    - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+    - \ref AV1_COMP.twopass (\ref TWO_PASS)
+    - \ref AV1_COMP.gf_group (\ref GF_GROUP)
+    - \ref AV1_COMP.speed
+    - \ref AV1_COMP.sf (\ref SPEED_FEATURES)
+    - \ref AV1_COMP.lap_enabled
+
+- \ref AV1EncoderConfig (Encoder configuration parameters)
+    - \ref AV1EncoderConfig.pass
+    - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg)
+    - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg)
+    - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg)
+
+- \ref AlgoCfg (Algorithm related configuration parameters)
+    - \ref AlgoCfg.arnr_max_frames
+    - \ref AlgoCfg.arnr_strength
+
+- \ref KeyFrameCfg (Keyframe coding configuration parameters)
+    - \ref KeyFrameCfg.enable_keyframe_filtering
+
+- \ref RateControlCfg (Rate control configuration)
+    - \ref RateControlCfg.mode
+    - \ref RateControlCfg.target_bandwidth
+    - \ref RateControlCfg.best_allowed_q
+    - \ref RateControlCfg.worst_allowed_q
+    - \ref RateControlCfg.cq_level
+    - \ref RateControlCfg.under_shoot_pct
+    - \ref RateControlCfg.over_shoot_pct
+    - \ref RateControlCfg.maximum_buffer_size_ms
+    - \ref RateControlCfg.starting_buffer_level_ms
+    - \ref RateControlCfg.optimal_buffer_level_ms
+    - \ref RateControlCfg.vbrbias
+    - \ref RateControlCfg.vbrmin_section
+    - \ref RateControlCfg.vbrmax_section
+
+- \ref RATE_CONTROL (Rate control status)
+    - \ref RATE_CONTROL.intervals_till_gf_calculate_due
+    - \ref RATE_CONTROL.gf_intervals[]
+    - \ref RATE_CONTROL.cur_gf_index
+    - \ref RATE_CONTROL.frames_till_gf_update_due
+    - \ref RATE_CONTROL.frames_to_key
+
+- \ref TWO_PASS (Two pass status and control data)
+
+- \ref GF_GROUP (Data related to the current GF/ARF group)
+
+- \ref FIRSTPASS_STATS (Defines entries in the first pass stats buffer)
+    - \ref FIRSTPASS_STATS.coded_error
+
+- \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters)
+    - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES)
+
+- \ref HIGH_LEVEL_SPEED_FEATURES
+    - \ref HIGH_LEVEL_SPEED_FEATURES.recode_loop
+    - \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance
+
+- \ref TplParams
+
+\section architecture_enc_use_cases Encoder Use Cases
+
+The libaom AV1 encoder is configurable to support a number of different use
+cases and rate control strategies.
+
+The principle use cases for which it is optimised are as follows:
+
+ - <b>Video on Demand / Streaming</b>
+ - <b>Low Delay or Live Streaming</b>
+ - <b>Video Conferencing / Real Time Coding (RTC)</b>
+ - <b>Fixed Quality / Testing</b>
+
+Other examples of use cases for which the encoder could be configured but for
+which there is less by way of specific optimizations include:
+
+ - <b>Download and Play</b>
+ - <b>Disk Playback</b>>
+ - <b>Storage</b>
+ - <b>Editing</b>
+ - <b>Broadcast video</b>
+
+Specific use cases may have particular requirements or constraints. For
+example:
+
+<b>Video Conferencing:</b>  In a video conference we need to encode the video
+in real time and to avoid any coding tools that could increase latency, such
+as frame look ahead.
+
+<b>Live Streams:</b> In cases such as live streaming of games or events, it
+may be possible to allow some limited buffering of the video and use of
+lookahead coding tools to improve encoding quality. However,  whilst a lag of
+a second or two may be fine given the one way nature of this type of video,
+it is clearly not possible to use tools such as two pass coding.
+
+<b>Broadcast:</b> Broadcast video (e.g. digital TV over satellite) may have
+specific requirements such as frequent and regular key frames (e.g. once per
+second or more) as these are important as entry points to users when switching
+channels. There may also be  strict upper limits on bandwidth over a short
+window of time.
+
+<b>Download and Play:</b> Download and play applications may have less strict
+requirements in terms of local frame by frame rate control but there may be a
+requirement to accurately hit a file size target for the video clip as a
+whole. Similar considerations may apply to playback from mass storage devices
+such as DVD or disk drives.
+
+<b>Editing:</b> In certain special use cases such as offline editing, it may
+be desirable to have very high quality and data rate but also very frequent
+key frames or indeed to encode the video exclusively as key frames. Lossless
+video encoding may also be required in this use case.
+
+<b>VOD / Streaming:</b> One of the most important and common use cases for AV1
+is video on demand or streaming, for services such as YouTube and Netflix. In
+this use case it is possible to do two or even multi-pass encoding to improve
+compression efficiency. Streaming services will often store many encoded
+copies of a video at different resolutions and data rates to support users
+with different types of playback device and bandwidth limitations.
+Furthermore, these services support dynamic switching between multiple
+streams, so that they can respond to changing network conditions.
+
+Exact rate control when encoding for a specific format (e.g 360P or 1080P on
+YouTube) may not be critical, provided that the video bandwidth remains within
+allowed limits. Whilst a format may have a nominal target data rate, this can
+be considered more as the desired average egress rate over the video corpus
+rather than a strict requirement for any individual clip. Indeed, in order
+to maintain optimal quality of experience for the end user, it may be
+desirable to encode some easier videos or sections of video at a lower data
+rate and harder videos or sections at a higher rate.
+
+VOD / streaming does not usually require very frequent key frames (as in the
+broadcast case) but key frames are important in trick play (scanning back and
+forth to different points in a video) and for adaptive stream switching. As
+such, in a use case like YouTube, there is normally an upper limit on the
+maximum time between key frames of a few seconds, but within certain limits
+the encoder can try to align key frames with real scene cuts.
+
+Whilst encoder speed may not seem to be as critical in this use case, for
+services such as YouTube, where millions of new videos have to be encoded
+every day, encoder speed is still important, so libaom allows command line
+control of the encode speed vs quality trade off.
+
+<b>Fixed Quality / Testing Mode:</b> Libaom also has a fixed quality encoder
+pathway designed for testing under highly constrained conditions.
+
+\section architecture_enc_speed_quality Speed vs Quality Trade Off
+
+In any modern video encoder there are trade offs that can be made in regard to
+the amount of time spent encoding a video or video frame vs the quality of the
+final encode.
+
+These trade offs typically limit the scope of the search for an optimal
+prediction / transform combination with faster encode modes doing fewer
+partition, reference frame, prediction mode and transform searches at the cost
+of some reduction in coding efficiency.
+
+The pruning of the size of the search tree is typically based on assumptions
+about the likelihood of different search modes being selected based on what
+has gone before and features such as the dimensions of the video frames and
+the Q value selected for encoding the frame. For example certain intra modes
+are less likely to be chosen at high Q but may be more likely if similar
+modes were used for the previously coded blocks above and to the left of the
+current block.
+
+The speed settings depend both on the use case (e.g. Real Time encoding) and
+an explicit speed control passed in on the command line as <b>--cpu-used</b>
+and stored in the \ref AV1_COMP.speed field of the main compressor instance
+data structure (<b>cpi</b>).
+
+The control flags for the speed trade off are stored the \ref AV1_COMP.sf
+field of the compressor instancve and are set in the following functions:-
+
+- \ref av1_set_speed_features_framesize_independent()
+- \ref av1_set_speed_features_framesize_dependent()
+- \ref av1_set_speed_features_qindex_dependent()
+
+A second factor impacting the speed of encode is rate distortion optimisation
+(<b>rd vs non-rd</b> encoding).
+
+When rate distortion optimization is enabled each candidate combination of
+a prediction mode and transform coding strategy is fully encoded and the
+resulting error (or distortion) as compared to the original source and the
+number of bits used, are passed to a rate distortion function. This function
+converts the distortion and cost in bits to a single <b>RD</b> value (where
+lower is better). This <b>RD</b> value is used to decide between different
+encoding strategies for the current block where, for example, a one may
+result in a lower distortion but a larger number of bits.
+
+The calculation of this <b>RD</b> value is broadly speaking as follows:
+
+\f[
+  RD = (&lambda; * Rate) + Distortion
+\f]
+
+This assumes a linear relationship between the number of bits used and
+distortion (represented by the rate multiplier value <b>&lambda;</b>) which is
+not actually valid across a broad range of rate and distortion values.
+Typically, where distortion is high, expending a small number of extra bits
+will result in a large change in distortion. However, at lower values of
+distortion the cost in bits of each incremental improvement is large.
+
+To deal with this we scale the value of <b>&lambda;</b> based on the quantizer
+value chosen for the frame. This is assumed to be a proxy for our approximate
+position on the true rate distortion curve and it is further assumed that over
+a limited range of distortion values, a linear relationship between distortion
+and rate is a valid approximation.
+
+Doing a rate distortion test on each candidate prediction / transform
+combination is expensive in terms of cpu cycles. Hence, for cases where encode
+speed is critical, libaom implements a non-rd pathway where the <b>RD</b>
+value is estimated based on the prediction error and quantizer setting.
+
+\section architecture_enc_src_proc Source Frame Processing
+
+\subsection architecture_enc_frame_proc_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+- \ref AV1_COMP cpi (the main compressor instance data structure)
+    - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+    - \ref AV1_COMP.alt_ref_buffer (\ref yv12_buffer_config)
+
+- \ref AV1EncoderConfig (Encoder configuration parameters)
+    - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg)
+    - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg)
+
+- \ref AlgoCfg (Algorithm related configuration parameters)
+    - \ref AlgoCfg.arnr_max_frames
+    - \ref AlgoCfg.arnr_strength
+
+- \ref KeyFrameCfg (Keyframe coding configuration parameters)
+    - \ref KeyFrameCfg.enable_keyframe_filtering
+
+\subsection architecture_enc_frame_proc_ingest Frame Ingest / Coding Pipeline
+
+ To encode a frame, first call \ref av1_receive_raw_frame() to obtain the raw
+ frame data. Then call \ref av1_get_compressed_data() to encode raw frame data
+ into compressed frame data. The main body of \ref av1_get_compressed_data()
+ is \ref av1_encode_strategy(), which determines high-level encode strategy
+ (frame type, frame placement, etc.) and then encodes the frame by calling
+ \ref av1_encode(). In \ref av1_encode(), \ref av1_first_pass() will execute
+ the first_pass of two-pass encoding, while \ref encode_frame_to_data_rate()
+ will perform the final pass for either one-pass or two-pass encoding.
+
+ The main body of \ref encode_frame_to_data_rate() is
+ \ref encode_with_recode_loop_and_filter(), which handles encoding before
+ in-loop filters (with recode loops \ref encode_with_recode_loop(), or
+ without any recode loop \ref encode_without_recode()), followed by in-loop
+ filters (deblocking filters \ref loopfilter_frame(), CDEF filters and
+ restoration filters \ref cdef_restoration_frame()).
+
+ Except for rate/quality control, both \ref encode_with_recode_loop() and
+ \ref encode_without_recode() call \ref av1_encode_frame() to manage the
+ reference frame buffers and \ref encode_frame_internal() to perform the
+ rest of encoding that does not require access to external frames.
+ \ref encode_frame_internal() is the starting point for the partition search
+ (see \ref architecture_enc_partitions).
+
+\subsection architecture_enc_frame_proc_tf Temporal Filtering
+
+\subsubsection architecture_enc_frame_proc_tf_overview Overview
+
+Video codecs exploit the spatial and temporal correlations in video signals to
+achieve compression efficiency. The noise factor in the source signal
+attenuates such correlation and impedes the codec performance. Denoising the
+video signal is potentially a promising solution.
+
+One strategy for denoising a source is motion compensated temporal filtering.
+Unlike image denoising, where only the spatial information is available,
+video denoising can leverage a combination of the spatial and temporal
+information. Specifically, in the temporal domain, similar pixels can often be
+tracked along the motion trajectory of moving objects. Motion estimation is
+applied to neighboring frames to find similar patches or blocks of pixels that
+can be combined to create a temporally filtered output.
+
+AV1, in common with VP8 and VP9, uses an in-loop motion compensated temporal
+filter to generate what are referred to as alternate reference frames (or ARF
+frames). These can be encoded in the bitstream and stored as frame buffers for
+use in the prediction of subsequent frames, but are not usually directly
+displayed (hence they are sometimes referred to as non-display frames).
+
+The following command line parameters set the strength of the filter, the
+number of frames used and determine whether filtering is allowed for key
+frames.
+
+- <b>--arnr-strength</b> (\ref AlgoCfg.arnr_strength)
+- <b>--arnr-maxframes</b> (\ref AlgoCfg.arnr_max_frames)
+- <b>--enable-keyframe-filtering</b>
+  (\ref KeyFrameCfg.enable_keyframe_filtering)
+
+Note that in AV1, the temporal filtering scheme is designed around the
+hierarchical ARF based pyramid coding structure. We typically apply denoising
+only on key frame and ARF frames at the highest (and sometimes the second
+highest) layer in the hierarchical coding structure.
+
+\subsubsection architecture_enc_frame_proc_tf_algo Temporal Filtering Algorithm
+
+Our method divides the current frame into "MxM" blocks. For each block, a
+motion search is applied on frames before and after the current frame. Only
+the best matching patch with the smallest mean square error (MSE) is kept as a
+candidate patch for a neighbour frame. The current block is also a candidate
+patch. A total of N candidate patches are combined to generate the filtered
+output.
+
+Let f(i) represent the filtered sample value and \f$p_{j}(i)\f$ the sample
+value of the j-th patch. The filtering process is:
+
+\f[
+  f(i) = \frac{p_{0}(i) + \sum_{j=1}^{N} &omega;_{j}(i).p_{j}(i)}
+              {1 + \sum_{j=1}^{N} &omega;_{j}(i)}
+\f]
+
+where \f$ &omega;_{j}(i) \f$ is the weight of the j-th patch from a total of
+N patches. The weight is determined by the patch difference as:
+
+\f[
+  &omega;_{j}(i) = exp(-\frac{D_{j}(i)}{h^2})
+\f]
+
+where \f$ D_{j}(i) \f$ is the sum of squared difference between the current
+block and the j-th candidate patch:
+
+\f[
+  D_{j}(i) = \sum_{k\in&Omega;_{i}}||p_{0}(k) - p_{j}(k)||_{2}
+\f]
+
+where:
+- \f$p_{0}\f$ refers to the current frame.
+- \f$&Omega;_{i}\f$ is the patch window, an "LxL" pixel square.
+- h is a critical parameter that controls the decay of the weights measured by
+  the Euclidean distance. It is derived from an estimate of noise amplitude in
+  the source. This allows the filter coefficients to adapt for videos with
+  different noise characteristics.
+- Usually, M = 32, N = 7, and L = 5, but they can be adjusted.
+
+It is recommended that the reader refers to the code for more details.
+
+\subsubsection architecture_enc_frame_proc_tf_funcs Temporal Filter Functions
+
+The main entry point for temporal filtering is \ref av1_temporal_filter().
+This function returns 1 if temporal filtering is successful, otherwise 0.
+When temporal filtering is applied, the filtered frame will be held in
+the frame buffer \ref AV1_COMP.alt_ref_buffer, which is the frame to be
+encoded in the following encoding process.
+
+Almost all temporal filter related code is in av1/encoder/temporal_filter.c
+and av1/encoder/temporal_filter.h.
+
+Inside \ref av1_temporal_filter(), the reader's attention is directed to
+\ref tf_setup_filtering_buffer() and \ref tf_do_filtering().
+
+- \ref tf_setup_filtering_buffer(): sets up the frame buffer for
+  temporal filtering, determines the number of frames to be used, and
+  calculates the noise level of each frame.
+
+- \ref tf_do_filtering(): the main function for the temporal
+  filtering algorithm. It breaks each frame into "MxM" blocks. For each
+  block a motion search \ref tf_motion_search() is applied to find
+  the motion vector from one neighboring frame. tf_build_predictor() is then
+  called to build the matching patch and \ref av1_apply_temporal_filter_c() (see
+  also optimised SIMD versions) to apply temporal filtering. The weighted
+  average over each pixel is accumulated and finally normalized in
+  \ref tf_normalize_filtered_frame() to generate the final filtered frame.
+
+- \ref av1_apply_temporal_filter_c(): the core function of our temporal
+  filtering algorithm (see also optimised SIMD versions).
+
+\subsection architecture_enc_frame_proc_film Film Grain Modelling
+
+ Add details here.
+
+\section architecture_enc_rate_ctrl Rate Control
+
+\subsection architecture_enc_rate_ctrl_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+ - \ref AV1_COMP cpi (the main compressor instance data structure)
+    - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+    - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+    - \ref AV1_COMP.twopass (\ref TWO_PASS)
+    - \ref AV1_COMP.sf (\ref SPEED_FEATURES)
+
+ - \ref AV1EncoderConfig (Encoder configuration parameters)
+    - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg)
+
+ - \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first
+   pass stats)
+
+ - \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters)
+    - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES)
+
+\subsection architecture_enc_rate_ctrl_options Supported Rate Control Options
+
+Different use cases (\ref architecture_enc_use_cases) may have different
+requirements in terms of data rate control.
+
+The broad rate control strategy is selected using the <b>--end-usage</b>
+parameter on the command line, which maps onto the field
+\ref aom_codec_enc_cfg_t.rc_end_usage in \ref aom_encoder.h.
+
+The four supported options are:-
+
+- <b>VBR</b> (Variable Bitrate)
+- <b>CBR</b> (Constant Bitrate)
+- <b>CQ</b> (Constrained Quality mode ; A constrained variant of VBR)
+- <b>Fixed Q</b> (Constant quality of Q mode)
+
+The value of \ref aom_codec_enc_cfg_t.rc_end_usage is in turn copied over
+into the encoder rate control configuration data structure as
+\ref RateControlCfg.mode.
+
+In regards to the most important use cases above, Video on demand uses either
+VBR or CQ mode. CBR is the preferred rate control model for RTC and Live
+streaming and Fixed Q is only used in testing.
+
+The behaviour of each of these modes is regulated by a series of secondary
+command line rate control options but also depends somewhat on the selected
+use case, whether 2-pass coding is enabled and the selected encode speed vs
+quality trade offs (\ref AV1_COMP.speed and \ref AV1_COMP.sf).
+
+The list below gives the names of the main rate control command line
+options together with the names of the corresponding fields in the rate
+control configuration data structures.
+
+- <b>--target-bitrate</b> (\ref RateControlCfg.target_bandwidth)
+- <b>--min-q</b> (\ref RateControlCfg.best_allowed_q)
+- <b>--max-q</b> (\ref RateControlCfg.worst_allowed_q)
+- <b>--cq-level</b> (\ref RateControlCfg.cq_level)
+- <b>--undershoot-pct</b> (\ref RateControlCfg.under_shoot_pct)
+- <b>--overshoot-pct</b> (\ref RateControlCfg.over_shoot_pct)
+
+The following control aspects of vbr encoding
+
+- <b>--bias-pct</b> (\ref RateControlCfg.vbrbias)
+- <b>--minsection-pct</b> ((\ref RateControlCfg.vbrmin_section)
+- <b>--maxsection-pct</b> ((\ref RateControlCfg.vbrmax_section)
+
+The following relate to buffer and delay management in one pass low delay and
+real time coding
+
+- <b>--buf-sz</b> (\ref RateControlCfg.maximum_buffer_size_ms)
+- <b>--buf-initial-sz</b> (\ref RateControlCfg.starting_buffer_level_ms)
+- <b>--buf-optimal-sz</b> (\ref RateControlCfg.optimal_buffer_level_ms)
+
+\subsection architecture_enc_vbr Variable Bitrate (VBR) Encoding
+
+For streamed VOD content the most common rate control strategy is Variable
+Bitrate (VBR) encoding. The CQ mode mentioned above is a variant of this
+where additional quantizer and quality constraints are applied.  VBR
+encoding may in theory be used in conjunction with either 1-pass or 2-pass
+encoding.
+
+VBR encoding varies the number of bits given to each frame or group of frames
+according to the difficulty of that frame or group of frames, such that easier
+frames are allocated fewer bits and harder frames are allocated more bits. The
+intent here is to even out the quality between frames. This contrasts with
+Constant Bitrate (CBR) encoding where each frame is allocated the same number
+of bits.
+
+Whilst for any given frame or group of frames the data rate may vary, the VBR
+algorithm attempts to deliver a given average bitrate over a wider time
+interval. In standard VBR encoding, the time interval over which the data rate
+is averaged is usually the duration of the video clip.  An alternative
+approach is to target an average VBR bitrate over the entire video corpus for
+a particular video format (corpus VBR).
+
+\subsubsection architecture_enc_1pass_vbr 1 Pass VBR Encoding
+
+The command line for libaom does allow 1 Pass VBR, but this has not been
+properly optimised and behaves much like 1 pass CBR in most regards, with bits
+allocated to frames by the following functions:
+
+- \ref av1_calc_iframe_target_size_one_pass_vbr()
+- \ref av1_calc_pframe_target_size_one_pass_vbr()
+
+\subsubsection architecture_enc_2pass_vbr 2 Pass VBR Encoding
+
+The main focus here will be on 2-pass VBR encoding (and the related CQ mode)
+as these are the modes most commonly used for VOD content.
+
+2-pass encoding is selected on the command line by setting --passes=2
+(or -p 2).
+
+Generally speaking, in 2-pass encoding, an encoder will first encode a video
+using a default set of parameters and assumptions. Depending on the outcome
+of that first encode, the baseline assumptions and parameters will be adjusted
+to optimize the output during the second pass.  In essence the first pass is a
+fact finding mission to establish the complexity and variability of the video,
+in order to allow a better allocation of bits in the second pass.
+
+The libaom 2-pass algorithm is unusual in that the first pass is not a full
+encode of the video. Rather it uses a limited set of prediction and transform
+options and a fixed quantizer,  to generate statistics about each frame. No
+output bitstream is created and the per frame first pass statistics are stored
+entirely in volatile memory. This has some disadvantages when compared to a
+full first pass encode, but avoids the need for file I/O and improves speed.
+
+For two pass encoding, the function \ref av1_encode() will first be called
+for each frame in the video with the value \ref AV1EncoderConfig.pass = 1.
+This will result in calls to \ref av1_first_pass().
+
+Statistics for each frame are stored in \ref FIRSTPASS_STATS frame_stats_buf.
+
+After completion of the first pass, \ref av1_encode() will be called again for
+each frame with \ref AV1EncoderConfig.pass = 2.  The frames are then encoded in
+accordance with the statistics gathered during the first pass by calls to
+\ref encode_frame_to_data_rate() which in turn calls
+ \ref av1_get_second_pass_params().
+
+In summary the second pass code :-
+
+- Searches for scene cuts (if auto key frame detection is enabled).
+- Defines the length of and hierarchical structure to be used in each
+  ARF/GF group.
+- Allocates bits based on the relative complexity of each frame, the quality
+  of frame to frame prediction and the type of frame (e.g. key frame, ARF
+  frame, golden frame or normal leaf frame).
+- Suggests a maximum Q (quantizer value) for each ARF/GF group, based on
+  estimated complexity and recent rate control compliance
+  (\ref RATE_CONTROL.active_worst_quality)
+- Tracks adherence to the overall rate control objectives and adjusts
+  heuristics.
+
+The main two pass functions in regard to the above include:-
+
+- \ref find_next_key_frame()
+- \ref define_gf_group()
+- \ref calculate_total_gf_group_bits()
+- \ref get_twopass_worst_quality()
+- \ref av1_gop_setup_structure()
+- \ref av1_gop_bit_allocation()
+- \ref av1_twopass_postencode_update()
+
+For each frame, the two pass algorithm defines a target number of bits
+\ref RATE_CONTROL.base_frame_target,  which is then adjusted if necessary to
+reflect any undershoot or overshoot on previous frames to give
+\ref RATE_CONTROL.this_frame_target.
+
+As well as \ref RATE_CONTROL.active_worst_quality, the two pass code also
+maintains a record of the actual Q value used to encode previous frames
+at each level in the current pyramid hierarchy
+(\ref RATE_CONTROL.active_best_quality). The function
+\ref rc_pick_q_and_bounds(), uses these values to set a permitted Q range
+for each frame.
+
+\subsubsection architecture_enc_1pass_lagged 1 Pass Lagged VBR Encoding
+
+1 pass lagged encode falls between simple 1 pass encoding and full two pass
+encoding and is used for cases where it is not possible to do a full first
+pass through the entire video clip, but where some delay is permissible. For
+example near live streaming where there is a delay of up to a few seconds. In
+this case the first pass and second pass are in effect combined such that the
+first pass starts encoding the clip and the second pass lags behind it by a
+few frames.  When using this method, full sequence level statistics are not
+available, but it is possible to collect and use frame or group of frame level
+data to help in the allocation of bits and in defining ARF/GF coding
+hierarchies.  The reader is referred to the \ref AV1_COMP.lap_enabled field
+in the main compressor instance (where <b>lap</b> stands for
+<b>look ahead processing</b>). This encoding mode for the most part uses the
+same rate control pathways as two pass VBR encoding.
+
+\subsection architecture_enc_rc_loop The Main Rate Control Loop
+
+Having established a target rate for a given frame and an allowed range of Q
+values, the encoder then tries to encode the frame at a rate that is as close
+as possible to the target value, given the Q range constraints.
+
+There are two main mechanisms by which this is achieved.
+
+The first selects a frame level Q, using an adaptive estimate of the number of
+bits that will be generated when the frame is encoded at any given Q.
+Fundamentally this mechanism is common to VBR, CBR and to use cases such as
+RTC with small adjustments.
+
+As the Q value mainly adjusts the precision of the residual signal, it is not
+actually a reliable basis for accurately predicting the number of bits that
+will be generated across all clips. A well predicted clip, for example, may
+have a much smaller error residual after prediction.  The algorithm copes with
+this by adapting its predictions on the fly using a feedback loop based on how
+well it did the previous time around.
+
+The main functions responsible for the prediction of Q and the adaptation over
+time, for the two pass encoding pipeline are:
+
+- \ref rc_pick_q_and_bounds()
+    - \ref get_q()
+        - \ref av1_rc_regulate_q()
+        - \ref get_rate_correction_factor()
+        - \ref set_rate_correction_factor()
+        - \ref find_closest_qindex_by_rate()
+- \ref av1_twopass_postencode_update()
+    - \ref av1_rc_update_rate_correction_factors()
+
+A second mechanism for control comes into play if there is a large rate miss
+for the current frame (much too big or too small). This is a recode mechanism
+which allows the current frame to be re-encoded one or more times with a
+revised Q value. This obviously has significant implications for encode speed
+and in the case of RTC latency (hence it is not used for the RTC pathway).
+
+Whether or not a recode is allowed for a given frame depends on the selected
+encode speed vs quality trade off. This is set on the command line using the
+--cpu-used parameter which maps onto the \ref AV1_COMP.speed field in the main
+compressor instance data structure.
+
+The value of \ref AV1_COMP.speed, combined with the use case, is used to
+populate the speed features data structure AV1_COMP.sf. In particular
+\ref HIGH_LEVEL_SPEED_FEATURES.recode_loop determines the types of frames that
+may be recoded and \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance is a rate
+error trigger threshold.
+
+For more information the reader is directed to the following functions:
+
+- \ref encode_with_recode_loop()
+- \ref encode_without_recode()
+- \ref recode_loop_update_q()
+- \ref recode_loop_test()
+- \ref av1_set_speed_features_framesize_independent()
+- \ref av1_set_speed_features_framesize_dependent()
+
+\subsection architecture_enc_fixed_q Fixed Q Mode
+
+There are two main fixed Q cases:
+-# Fixed Q with adaptive qp offsets: same qp offset for each pyramid level
+   in a given video, but these offsets are adaptive based on video content.
+-# Fixed Q with fixed qp offsets: content-independent fixed qp offsets for
+   each pyramid level. (see \ref get_q_using_fixed_offsets()).
+
+The reader is also refered to the following functions:
+- \ref av1_rc_pick_q_and_bounds()
+- \ref rc_pick_q_and_bounds_no_stats_cbr()
+- \ref rc_pick_q_and_bounds_no_stats()
+- \ref rc_pick_q_and_bounds()
+
+\section architecture_enc_frame_groups GF/ ARF Frame Groups & Hierarchical Coding
+
+\subsection architecture_enc_frame_groups_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+- \ref AV1_COMP cpi (the main compressor instance data structure)
+    - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+
+- \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first pass
+stats)
+
+\subsection architecture_enc_frame_groups_groups Frame Groups
+
+To process a sequence/stream of video frames, the encoder divides the frames
+into groups and encodes them sequentially (possibly dependent on previous
+groups). In AV1 such a group is usually referred to as a golden frame group
+(GF group) or sometimes an Alt-Ref (ARF) group or a group of pictures (GOP).
+A GF group determines and stores the coding structure of the frames (for
+example, frame type, usage of the hierarchical structure, usage of overlay
+frames, etc.) and can be considered as the base unit to process the frames,
+therefore playing an important role in the encoder.
+
+The length of a specific GF group is arguably the most important aspect when
+determining a GF group. This is because most GF group level decisions are
+based on the frame characteristics, if not on the length itself directly.
+Note that the GF group is always a group of consecutive frames, which means
+the start and end of the group (so again, the length of it) determines which
+frames are included in it and hence determines the characteristics of the GF
+group. Therefore, in this document we will first discuss the GF group length
+decision in Libaom, followed by frame structure decisions when defining a GF
+group with a certain length.
+
+\subsection architecture_enc_gf_length GF / ARF Group Length Determination
+
+The basic intuition of determining the GF group length is that it is usually
+desirable to group together frames that are similar. Hence, we may choose
+longer groups when consecutive frames are very alike and shorter ones when
+they are very different.
+
+The determination of the GF group length is done in function \ref
+calculate_gf_length(). The following encoder use cases are supported:
+
+<ul>
+  <li><b>Single pass with look-ahead disabled(\ref has_no_stats_stage()):
+  </b> in this case there is no information available on the following stream
+  of frames, therefore the function will set the GF group length for the
+  current and the following GF groups (a total number of MAX_NUM_GF_INTERVALS
+  groups) to be the maximum value allowed.</li>
+
+  <li><b>Single pass with look-ahead enabled (\ref AV1_COMP.lap_enabled):</b>
+  look-ahead processing is enabled for single pass, therefore there is a
+  limited amount of information available regarding future frames. In this
+  case the function will determine the length based on \ref FIRSTPASS_STATS
+  (which is generated when processing the look-ahead buffer) for only the
+  current GF group.</li>
+
+  <li><b>Two pass:</b> the first pass in two-pass encoding collects the stats
+  and will not call the function. In the second pass, the function tries to
+  determine the GF group length of the current and the following GF groups (a
+  total number of MAX_NUM_GF_INTERVALS groups) based on the first-pass
+  statistics. Note that as we will be discussing later, such decisions may not
+  be accurate and can be changed later.</li>
+</ul>
+
+Except for the first trivial case where there is no prior knowledge of the
+following frames, the function \ref calculate_gf_length() tries to determine the
+GF group length based on the first pass statistics. The determination is divided
+into two parts:
+
+<ol>
+   <li>Baseline decision based on accumulated statistics: this part of the function
+   iterates through the firstpass statistics of the following frames and
+   accumulates the statistics with function accumulate_next_frame_stats.
+   The accumulated statistics are then used to determine whether the
+   correlation in the GF group has dropped too much in function detect_gf_cut.
+   If detect_gf_cut returns non-zero, or if we've reached the end of
+   first-pass statistics, the baseline decision is set at the current point.</li>
+
+   <li>If we are not at the end of the first-pass statistics, the next part will
+   try to refine the baseline decision. This algorithm is based on the analysis
+   of firstpass stats. It tries to cut the groups in stable regions or
+   relatively stable points. Also it tries to avoid cutting in a blending
+   region.</li>
+</ol>
+
+As mentioned, for two-pass encoding, the function \ref
+calculate_gf_length() tries to determine the length of as many as
+MAX_NUM_GF_INTERVALS groups. The decisions are stored in
+\ref RATE_CONTROL.gf_intervals[]. The variables
+\ref RATE_CONTROL.intervals_till_gf_calculate_due and
+\ref RATE_CONTROL.cur_gf_index help with managing and updating the stored
+decisions. In the function \ref define_gf_group(), the corresponding
+stored length decision will be used to define the current GF group.
+
+When the maximum GF group length is larger or equal to 32, the encoder will
+enforce an extra layer to determine whether to use maximum GF length of 32
+or 16 for every GF group. In such a case, \ref calculate_gf_length() is
+first called with the original maximum length (>=32). Afterwards,
+\ref av1_tpl_setup_stats() is called to analyze the determined GF group
+and compare the reference to the last frame and the middle frame. If it is
+decided that we should use a maximum GF length of 16, the function
+\ref calculate_gf_length() is called again with the updated maximum
+length, and it only sets the length for a single GF group
+(\ref RATE_CONTROL.intervals_till_gf_calculate_due is set to 1). This process
+is shown below.
+
+\image html tplgfgroupdiagram.png "" width=40%
+
+Before encoding each frame, the encoder checks
+\ref RATE_CONTROL.frames_till_gf_update_due. If it is zero, indicating
+processing of the current GF group is done, the encoder will check whether
+\ref RATE_CONTROL.intervals_till_gf_calculate_due is zero. If it is, as
+discussed above, \ref calculate_gf_length() is called with original
+maximum length. If it is not zero, then the GF group length value stored
+in \ref RATE_CONTROL.gf_intervals[\ref RATE_CONTROL.cur_gf_index] is used
+(subject to change as discussed above).
+
+\subsection architecture_enc_gf_structure Defining a GF Group's Structure
+
+The function \ref define_gf_group() defines the frame structure as well
+as other GF group level parameters (e.g. bit allocation) once the length of
+the current GF group is determined.
+
+The function first iterates through the first pass statistics in the GF group to
+accumulate various stats, using accumulate_this_frame_stats() and
+accumulate_next_frame_stats(). The accumulated statistics are then used to
+determine the use of the use of ALTREF frame along with other properties of the
+GF group. The values of \ref RATE_CONTROL.cur_gf_index, \ref
+RATE_CONTROL.intervals_till_gf_calculate_due and \ref
+RATE_CONTROL.frames_till_gf_update_due are also updated accordingly.
+
+The function \ref av1_gop_setup_structure() is called at the end to determine
+the frame layers and reference maps in the GF group, where the
+construct_multi_layer_gf_structure() function sets the frame update types for
+each frame and the group structure.
+
+- If ALTREF frames are allowed for the GF group: the first frame is set to
+  KF_UPDATE, GF_UPDATE or ARF_UPDATE. The last frames of the GF group is set to
+  OVERLAY_UPDATE.  Then in set_multi_layer_params(), frame update
+  types are determined recursively in a binary tree fashion, and assigned to
+  give the final IBBB structure for the group.  - If the current branch has more
+  than 2 frames and we have not reached maximum layer depth, then the middle
+  frame is set as INTNL_ARF_UPDATE, and the left and right branches are
+  processed recursively.  - If the current branch has less than 3 frames, or we
+  have reached maximum layer depth, then every frame in the branch is set to
+  LF_UPDATE.
+
+- If ALTREF frame is not allowed for the GF group: the frames are set
+  as LF_UPDATE. This basically forms an IPPP GF group structure.
+
+As mentioned, the encoder may use Temporal dependancy modelling (TPL - see \ref
+architecture_enc_tpl) to determine whether we should use a maximum length of 32
+or 16 for the current GF group. This requires calls to \ref define_gf_group()
+but should not change other settings (since it is in essence a trial). This
+special case is indicated by the setting parameter <b>is_final_pass</b> for to
+zero.
+
+For single pass encodes where look-ahead processing is disabled
+(\ref AV1_COMP.lap_enabled = 0), \ref define_gf_group_pass0() is used
+instead of \ref define_gf_group().
+
+\subsection architecture_enc_kf_groups Key Frame Groups
+
+A special constraint for GF group length is the location of the next keyframe
+(KF). The frames between two KFs are referred to as a KF group. Each KF group
+can be encoded and decoded independently. Because of this, a GF group cannot
+span beyond a KF and the location of the next KF is set as a hard boundary
+for GF group length.
+
+<ul>
+   <li>For two-pass encoding \ref RATE_CONTROL.frames_to_key controls when to
+   encode a key frame. When it is zero, the current frame is a keyframe and
+   the function \ref find_next_key_frame() is called. This in turn calls
+   \ref define_kf_interval() to work out where the next key frame should
+   be placed.</li>
+
+   <li>For single-pass with look-ahead enabled, \ref define_kf_interval()
+   is called whenever a GF group update is needed (when
+   \ref RATE_CONTROL.frames_till_gf_update_due is zero). This is because
+   generally KFs are more widely spaced and the look-ahead buffer is usually
+   not long enough.</li>
+
+   <li>For single-pass with look-ahead disabled, the KFs are placed according
+   to the command line parameter <b>--kf-max-dist</b> (The above two cases are
+   also subject to this constraint).</li>
+</ul>
+
+The function \ref define_kf_interval() tries to detect a scenecut.
+If a scenecut within kf-max-dist is detected, then it is set as the next
+keyframe. Otherwise the given maximum value is used.
+
+\section architecture_enc_tpl Temporal Dependency Modelling
+
+The temporal dependency model runs at the beginning of each GOP. It builds the
+motion trajectory within the GOP in units of 16x16 blocks. The temporal
+dependency of a 16x16 block is evaluated as the predictive coding gains it
+contributes to its trailing motion trajectory. This temporal dependency model
+reflects how important a coding block is for the coding efficiency of the
+overall GOP. It is hence used to scale the Lagrangian multiplier used in the
+rate-distortion optimization framework.
+
+\subsection architecture_enc_tpl_config Configurations
+
+The temporal dependency model and its applications are by default turned on in
+libaom encoder for the VoD use case. To disable it, use --tpl-model=0 in the
+aomenc configuration.
+
+\subsection architecture_enc_tpl_algoritms Algorithms
+
+The scheme works in the reverse frame processing order over the source frames,
+propagating information from future frames back to the current frame. For each
+frame, a propagation step is run for each MB. it operates as follows:
+
+<ul>
+   <li> Estimate the intra prediction cost in terms of sum of absolute Hadamard
+   transform difference (SATD) noted as intra_cost. It also loads the motion
+   information available from the first-pass encode and estimates the inter
+   prediction cost as inter_cost. Due to the use of hybrid inter/intra
+   prediction mode, the inter_cost value is further upper bounded by
+   intra_cost. A propagation cost variable is used to collect all the
+   information flowed back from future processing frames. It is initialized as
+   0 for all the blocks in the last processing frame in a group of pictures
+   (GOP).</li>
+
+   <li> The fraction of information from a current block to be propagated towards
+   its reference block is estimated as:
+\f[
+   propagation\_fraction = (1 - inter\_cost/intra\_cost)
+\f]
+   It reflects how much the motion compensated reference would reduce the
+   prediction error in percentage.</li>
+
+   <li> The total amount of information the current block contributes to the GOP
+   is estimated as intra_cost + propagation_cost. The information that it
+   propagates towards its reference block is captured by:
+
+\f[
+   propagation\_amount =
+   (intra\_cost + propagation\_cost) * propagation\_fraction
+\f]</li>
+
+   <li> Note that the reference block may not necessarily sit on the grid of
+   16x16 blocks. The propagation amount is hence dispensed to all the blocks
+   that overlap with the reference block. The corresponding block in the
+   reference frame accumulates its own propagation cost as it receives back
+   propagation.
+
+\f[
+   propagation\_cost = propagation\_cost +
+                       (\frac{overlap\_area}{(16*16)} * propagation\_amount)
+\f]</li>
+
+   <li> In the final encoding stage, the distortion propagation factor of a block
+   is evaluated as \f$(1 + \frac{propagation\_cost}{intra\_cost})\f$, where the second term
+   captures its impact on later frames in a GOP.</li>
+
+   <li> The Lagrangian multiplier is adapted at the 64x64 block level. For every
+   64x64 block in a frame, we have a distortion propagation factor:
+
+\f[
+  dist\_prop[i] = 1 + \frac{propagation\_cost[i]}{intra\_cost[i]}
+\f]
+
+   where i denotes the block index in the frame. We also have the frame level
+   distortion propagation factor:
+
+\f[
+  dist\_prop = 1 +
+  \frac{\sum_{i}propagation\_cost[i]}{\sum_{i}intra\_cost[i]}
+\f]
+
+   which is used to normalize the propagation factor at the 64x64 block level. The
+   Lagrangian multiplier is hence adapted as:
+
+\f[
+  &lambda;[i] = &lambda;[0] * \frac{dist\_prop}{dist\_prop[i]}
+\f]
+
+   where &lambda;0 is the multiplier associated with the frame level QP. The
+   64x64 block level QP is scaled according to the Lagrangian multiplier.
+</ul>
+
+\subsection architecture_enc_tpl_keyfun Key Functions and data structures
+
+The reader is also refered to the following functions and data structures:
+
+- \ref TplParams
+- \ref av1_tpl_setup_stats() builds the TPL model.
+- \ref setup_delta_q() Assign different quantization parameters to each super
+  block based on its TPL weight.
+
+\section architecture_enc_partitions Block Partition Search
+
+ A frame is first split into tiles in \ref encode_tiles(), with each tile
+ compressed by av1_encode_tile(). Then a tile is processed in superblock rows
+ via \ref av1_encode_sb_row() and then \ref encode_sb_row().
+
+ The partition search processes superblocks sequentially in \ref
+ encode_sb_row(). Two search modes are supported, depending upon the encoding
+ configuration, \ref encode_nonrd_sb() is for 1-pass and real-time modes,
+ while \ref encode_rd_sb() performs more exhaustive rate distortion based
+ searches.
+
+ Partition search over the recursive quad-tree space is implemented by
+ recursive calls to \ref av1_nonrd_use_partition(),
+ \ref av1_rd_use_partition(), or av1_rd_pick_partition() and returning best
+ options for sub-trees to their parent partitions.
+
+ In libaom, the partition search lays on top of the mode search (predictor,
+ transform, etc.), instead of being a separate module. The interface of mode
+ search is \ref pick_sb_modes(), which connects the partition_search with
+ \ref architecture_enc_inter_modes and \ref architecture_enc_intra_modes. To
+ make good decisions, reconstruction is also required in order to build
+ references and contexts. This is implemented by \ref encode_sb() at the
+ sub-tree level and \ref encode_b() at coding block level.
+
+ See also \ref partition_search
+
+\section architecture_enc_intra_modes Intra Mode Search
+
+AV1 also provides 71 different intra prediction modes, i.e. modes that predict
+only based upon information in the current frame with no dependency on
+previous or future frames. For key frames, where this independence from any
+other frame is a defining requirement and for other cases where intra only
+frames are required, the encoder need only considers these modes in the rate
+distortion loop.
+
+Even so, in most use cases, searching all possible intra prediction modes for
+every block and partition size is not practical and some pruning of the search
+tree is necessary.
+
+For the Rate distortion optimized case, the main top level function
+responsible for selecting the intra prediction mode for a given block is
+\ref av1_rd_pick_intra_mode_sb(). The readers attention is also drawn to the
+functions \ref hybrid_intra_mode_search() and \ref av1_nonrd_pick_intra_mode()
+which may be used where encode speed is critical. The choice between the
+rd path and the non rd or hybrid paths depends on the encoder use case and the
+\ref AV1_COMP.speed parameter. Further fine control of the speed vs quality
+trade off is provided by means of fields in \ref AV1_COMP.sf (which has type
+\ref SPEED_FEATURES).
+
+Note that some intra modes are only considered for specific use cases or
+types of video. For example the palette based prediction modes are often
+valueable for graphics or screen share content but not for natural video.
+(See \ref av1_search_palette_mode())
+
+See also \ref intra_mode_search for more details.
+
+\section architecture_enc_inter_modes Inter Prediction Mode Search
+
+For inter frames, where we also allow prediction using one or more previously
+coded frames (which may chronologically speaking be past or future frames or
+non-display reference buffers such as ARF frames), the size of the search tree
+that needs to be traversed, to select a prediction mode, is considerably more
+massive.
+
+In addition to the 71 possible intra modes we also need to consider 56 single
+frame inter prediction modes (7 reference frames x 4 modes x 2 for OBMC
+(overlapped block motion compensation)), 12768 compound inter prediction modes
+(these are modes that combine inter predictors from two reference frames) and
+36708 compound inter / intra prediction modes.
+
+As with the intra mode search, libaom supports an RD based pathway and a non
+rd pathway for speed critical use cases.  The entry points for these two cases
+are \ref av1_rd_pick_inter_mode() and \ref av1_nonrd_pick_inter_mode_sb()
+respectively.
+
+Various heuristics and predictive strategies are used to prune the search tree
+with fine control provided through the speed features parameter in the main
+compressor instance data structure \ref AV1_COMP.sf.
+
+It is worth noting, that some prediction modes incurr a much larger rate cost
+than others (ignoring for now the cost of coding the error residual). For
+example, a compound mode that requires the encoder to specify two reference
+frames and two new motion vectors will almost inevitable have a higher rate
+cost than a simple inter prediction mode that uses a predicted or 0,0 motion
+vector. As such, if we have already found a mode for the current block that
+has a low RD cost, we can skip a large number of the possible modes on the
+basis that even if the error residual is 0 the inherent rate cost of the
+mode itself will garauntee that it is not chosen.
+
+See also \ref inter_mode_search for more details.
+
+\section architecture_enc_tx_search Transform Search
+
+AV1 implements the transform stage using 4 seperable 1-d transforms (DCT,
+ADST, FLIPADST and IDTX, where FLIPADST is the reversed version of ADST
+and IDTX is the identity transform) which can be combined to give 16 2-d
+combinations.
+
+These combinations can be applied at 19 different scales from 64x64 pixels
+down to 4x4 pixels.
+
+This gives rise to a large number of possible candidate transform options
+for coding the residual error after prediction. An exhaustive rate-distortion
+based evaluation of all candidates would not be practical from a speed
+perspective in a production encoder implementation. Hence libaom addopts a
+number of strategies to prune the selection of both the transform size and
+transform type.
+
+There are a number of strategies that have been tested and implememnted in
+libaom including:
+
+- A statistics based approach that looks at the frequency with which certain
+  combinations are used in a given context and prunes out very unlikely
+  candidates. It is worth noting here that some size candidates can be pruned
+  out immediately based on the size of the prediction partition. For example it
+  does not make sense to use a transform size that is larger than the
+  prediction partition size but also a very large prediction partition size is
+  unlikely to be optimally pared with small transforms.
+
+- A Machine learning based model
+
+- A method that initially tests candidates using a fast algorithm that skips
+  entropy encoding and uses an estimated cost model to choose a reduced subset
+  for full RD analysis. This subject is covered more fully in a paper authored
+  by Bohan Li, Jingning Han, and Yaowu Xu titled: <b>Fast Transform Type
+  Selection Using Conditional Laplace Distribution Based Rate Estimation</b>
+
+<b>TODO Add link to paper when available</b>
+
+See also \ref transform_search for more details.
+
+\section architecture_post_enc_filt Post Encode Loop Filtering
+
+AV1 supports three types of post encode <b>in loop</b> filtering to improve
+the quality of the reconstructed video.
+
+- <b>Deblocking Filter</b> The first of these is a farily traditional boundary
+  deblocking filter that attempts to smooth discontinuities that may occur at
+  the boundaries between blocks. See also \ref in_loop_filter.
+
+- <b>CDEF Filter</b> The constrained directional enhancement filter (CDEF)
+  allows the codec to apply a non-linear deringing filter along certain
+  (potentially oblique) directions. A primary filter is applied along the
+  selected direction, whilst a secondary filter is applied at 45 degrees to
+  the primary direction. (See also \ref in_loop_cdef and
+  <a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.
+
+- <b>Loop Restoration Filter</b> The loop restoration filter is applied after
+  any prior post filtering stages. It acts on units of either 64 x 64,
+  128 x 128, or 256 x 256 pixel blocks, refered to as loop restoration units.
+  Each unit can independently select either to bypass filtering, use a Wiener
+  filter, or use a self-guided filter. (See also \ref in_loop_restoration and
+  <a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.
+
+\section architecture_entropy Entropy Coding
+
+\subsection architecture_entropy_aritmetic Arithmetic Coder
+
+VP9, used a binary arithmetic coder to encode symbols, where the propability
+of a 1 or 0 at each descision node was based on a context model that took
+into account recently coded values (for example previously coded coefficients
+in the current block). A mechanism existed to update the context model each
+frame, either explicitly in the bitstream, or implicitly at both the encoder
+and decoder based on the observed frequency of different outcomes in the
+previous frame. VP9 also supported seperate context models for different types
+of frame (e.g. inter coded frames and key frames).
+
+In contrast, AV1 uses an M-ary symbol arithmetic coder to compress the syntax
+elements, where integer \f$M\in[2, 14]\f$. This approach is based upon the entropy
+coding strategy used in the Daala video codec and allows for some bit-level
+parallelism in its implementation. AV1 also has an extended context model and
+allows for updates to the probabilities on a per symbol basis as opposed to
+the per frame strategy in VP9.
+
+To improve the performance / throughput of the arithmetic encoder, especially
+in hardware implementations, the probability model is updated and maintained
+at 15-bit precision, but the arithmetic encoder only uses the most significant
+9 bits when encoding a symbol. A more detailed discussion of the algorithm
+and design constraints can be found in
+<a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.
+
+TODO add references to key functions / files.
+
+As with VP9, a mechanism exists in AV1 to encode some elements into the
+bitstream as uncrompresed bits or literal values, without using the arithmetic
+coder. For example, some frame and sequence header values, where it is
+beneficial to be able to read the values directly.
+
+TODO add references to key functions / files.
+
+\subsection architecture_entropy_coef Transform Coefficient Coding and Optimization
+\image html coeff_coding.png "" width=70%
+
+\subsubsection architecture_entropy_coef_what Transform coefficient coding
+Transform coefficient coding is where the encoder compresses a quantized version
+of prediction residue into the bitstream.
+
+\paragraph architecture_entropy_coef_prepare Preparation - transform and quantize
+Before the entropy coding stage, the encoder decouple the pixel-to-pixel
+correlation of the prediction residue by transforming the residue from the
+spatial domain to the frequency domain. Then the encoder quantizes the transform
+coefficients to make the coefficients ready for entropy coding.
+
+\paragraph architecture_entropy_coef_coding The coding process
+The encoder uses \ref av1_write_coeffs_txb() to write the coefficients of
+a transform block into the bitstream.
+The coding process has three stages.
+1. The encoder will code transform block skip flag (txb_skip). If the skip flag is
+off, then the encoder will code the end of block position (eob) which is the scan
+index of the last non-zero coefficient plus one.
+2. Second, the encoder will code lower magnitude levels of each coefficient in
+reverse scan order.
+3. Finally, the encoder will code the sign and higher magnitude levels for each
+coefficient if they are available.
+
+Related functions:
+- \ref av1_write_coeffs_txb()
+- write_inter_txb_coeff()
+- \ref av1_write_intra_coeffs_mb()
+
+\paragraph architecture_entropy_coef_context Context information
+To improve the compression efficiency, the encoder uses several context models
+tailored for transform coefficients to capture the correlations between coding
+symbols. Most of the context models are built to capture the correlations
+between the coefficients within the same transform block. However, transform
+block skip flag (txb_skip) and the sign of dc coefficient (dc_sign) require
+context info from neighboring transform blocks.
+
+Here is how context info spread between transform blocks. Before coding a
+transform block, the encoder will use get_txb_ctx() to collect the context
+information from neighboring transform blocks. Then the context information
+will be used for coding transform block skip flag (txb_skip) and the sign of
+dc coefficient (dc_sign). After the transform block is coded, the encoder will
+extract the context info from the current block using
+\ref av1_get_txb_entropy_context(). Then encoder will store the context info
+into a byte (uint8_t) using av1_set_entropy_contexts(). The encoder will use
+the context info to code other transform blocks.
+
+Related functions:
+- \ref av1_get_txb_entropy_context()
+- av1_set_entropy_contexts()
+- get_txb_ctx()
+- \ref av1_update_intra_mb_txb_context()
+
+\subsubsection architecture_entropy_coef_rd RD optimization
+Beside the actual entropy coding, the encoder uses several utility functions
+to make optimal RD decisions.
+
+\paragraph architecture_entropy_coef_cost Entropy cost
+The encoder uses \ref av1_cost_coeffs_txb() or \ref av1_cost_coeffs_txb_laplacian()
+to estimate the entropy cost of a transform block. Note that
+\ref av1_cost_coeffs_txb() is slower but accurate whereas
+\ref av1_cost_coeffs_txb_laplacian() is faster but less accurate.
+
+Related functions:
+- \ref av1_cost_coeffs_txb()
+- \ref av1_cost_coeffs_txb_laplacian()
+- \ref av1_cost_coeffs_txb_estimate()
+
+\paragraph architecture_entropy_coef_opt Quantized level optimization
+Beside computing entropy cost, the encoder also uses \ref av1_optimize_txb_new()
+to adjust the coefficient’s quantized levels to achieve optimal RD trade-off.
+In \ref av1_optimize_txb_new(), the encoder goes through each quantized
+coefficient and lowers the quantized coefficient level by one if the action
+yields a better RD score.
+
+Related functions:
+- \ref av1_optimize_txb_new()
+
+All the related functions are listed in \ref coefficient_coding.
+
+*/
+
+/*!\defgroup encoder_algo Encoder Algorithm
+ *
+ * The encoder algorithm describes how a sequence is encoded, including high
+ * level decision as well as algorithm used at every encoding stage.
+ */
+
+/*!\defgroup high_level_algo High-level Algorithm
+ * \ingroup encoder_algo
+ * This module describes sequence level/frame level algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+
+/*!\defgroup speed_features Speed vs Quality Trade Off
+ * \ingroup high_level_algo
+ * This module describes the encode speed vs quality tradeoff
+ * @{
+ */
+/*! @} - end defgroup speed_features */
+
+/*!\defgroup src_frame_proc Source Frame Processing
+ * \ingroup high_level_algo
+ * This module describes algorithms in AV1 assosciated with the
+ * pre-processing of source frames. See also \ref architecture_enc_src_proc
+ *
+ * @{
+ */
+/*! @} - end defgroup src_frame_proc */
+
+/*!\defgroup rate_control Rate Control
+ * \ingroup high_level_algo
+ * This module describes rate control algorithm in AV1.
+ *  See also \ref architecture_enc_rate_ctrl
+ * @{
+ */
+/*! @} - end defgroup rate_control */
+
+/*!\defgroup tpl_modelling Temporal Dependency Modelling
+ * \ingroup high_level_algo
+ * This module includes algorithms to implement temporal dependency modelling.
+ *  See also \ref architecture_enc_tpl
+ * @{
+ */
+/*! @} - end defgroup tpl_modelling */
+
+/*!\defgroup two_pass_algo Two Pass Mode
+   \ingroup high_level_algo
+
+ In two pass mode, the input file is passed into the encoder for a quick
+ first pass, where statistics are gathered. These statistics and the input
+ file are then passed back into the encoder for a second pass. The statistics
+ help the encoder reach the desired bitrate without as much overshooting or
+ undershooting.
+
+ During the first pass, the codec will return "stats" packets that contain
+ information useful for the second pass. The caller should concatenate these
+ packets as they are received. In the second pass, the concatenated packets
+ are passed in, along with the frames to encode. During the second pass,
+ "frame" packets are returned that represent the compressed video.
+
+ A complete example can be found in `examples/twopass_encoder.c`. Pseudocode
+ is provided below to illustrate the core parts.
+
+ During the first pass, the uncompressed frames are passed in and stats
+ information is appended to a byte array.
+
+~~~~~~~~~~~~~~~{.c}
+// For simplicity, assume that there is enough memory in the stats buffer.
+// Actual code will want to use a resizable array. stats_len represents
+// the length of data already present in the buffer.
+void get_stats_data(aom_codec_ctx_t *encoder, char *stats,
+                    size_t *stats_len, bool *got_data) {
+  const aom_codec_cx_pkt_t *pkt;
+  aom_codec_iter_t iter = NULL;
+  while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
+    *got_data = true;
+    if (pkt->kind != AOM_CODEC_STATS_PKT) continue;
+    memcpy(stats + *stats_len, pkt->data.twopass_stats.buf,
+           pkt->data.twopass_stats.sz);
+    *stats_len += pkt->data.twopass_stats.sz;
+  }
+}
+
+void first_pass(char *stats, size_t *stats_len) {
+  struct aom_codec_enc_cfg first_pass_cfg;
+  ... // Initialize the config as needed.
+  first_pass_cfg.g_pass = AOM_RC_FIRST_PASS;
+  aom_codec_ctx_t first_pass_encoder;
+  ... // Initialize the encoder.
+
+  while (frame_available) {
+    // Read in the uncompressed frame, update frame_available
+    aom_image_t *frame_to_encode = ...;
+    aom_codec_encode(&first_pass_encoder, img, pts, duration, flags);
+    get_stats_data(&first_pass_encoder, stats, stats_len);
+  }
+  // After all frames have been processed, call aom_codec_encode with
+  // a NULL ptr repeatedly, until no more data is returned. The NULL
+  // ptr tells the encoder that no more frames are available.
+  bool got_data;
+  do {
+    got_data = false;
+    aom_codec_encode(&first_pass_encoder, NULL, pts, duration, flags);
+    get_stats_data(&first_pass_encoder, stats, stats_len, &got_data);
+  } while (got_data);
+
+  aom_codec_destroy(&first_pass_encoder);
+}
+~~~~~~~~~~~~~~~
+
+ During the second pass, the uncompressed frames and the stats are
+ passed into the encoder.
+
+~~~~~~~~~~~~~~~{.c}
+// Write out each encoded frame to the file.
+void get_cx_data(aom_codec_ctx_t *encoder, FILE *file,
+                 bool *got_data) {
+  const aom_codec_cx_pkt_t *pkt;
+  aom_codec_iter_t iter = NULL;
+  while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
+   *got_data = true;
+   if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+   fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, file);
+  }
+}
+
+void second_pass(char *stats, size_t stats_len) {
+  struct aom_codec_enc_cfg second_pass_cfg;
+  ... // Initialize the config file as needed.
+  second_pass_cfg.g_pass = AOM_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in.buf = stats;
+  cfg.rc_twopass_stats_in.sz = stats_len;
+  aom_codec_ctx_t second_pass_encoder;
+  ... // Initialize the encoder from the config.
+
+  FILE *output = fopen("output.obu", "wb");
+  while (frame_available) {
+    // Read in the uncompressed frame, update frame_available
+    aom_image_t *frame_to_encode = ...;
+    aom_codec_encode(&second_pass_encoder, img, pts, duration, flags);
+    get_cx_data(&second_pass_encoder, output);
+  }
+  // Pass in NULL to flush the encoder.
+  bool got_data;
+  do {
+    got_data = false;
+    aom_codec_encode(&second_pass_encoder, NULL, pts, duration, flags);
+    get_cx_data(&second_pass_encoder, output, &got_data);
+  } while (got_data);
+
+  aom_codec_destroy(&second_pass_encoder);
+}
+~~~~~~~~~~~~~~~
+ */
+
+ /*!\defgroup look_ahead_buffer The Look-Ahead Buffer
+    \ingroup high_level_algo
+
+ A program should call \ref aom_codec_encode() for each frame that needs
+ processing. These frames are internally copied and stored in a fixed-size
+ circular buffer, known as the look-ahead buffer. Other parts of the code
+ will use future frame information to inform current frame decisions;
+ examples include the first-pass algorithm, TPL model, and temporal filter.
+ Note that this buffer also keeps a reference to the last source frame.
+
+ The look-ahead buffer is defined in \ref av1/encoder/lookahead.h. It acts as an
+ opaque structure, with an interface to create and free memory associated with
+ it. It supports pushing and popping frames onto the structure in a FIFO
+ fashion. It also allows look-ahead when using the \ref av1_lookahead_peek()
+ function with a non-negative number, and look-behind when -1 is passed in (for
+ the last source frame; e.g., firstpass will use this for motion estimation).
+ The \ref av1_lookahead_depth() function returns the current number of frames
+ stored in it. Note that \ref av1_lookahead_pop() is a bit of a misnomer - it
+ only pops if either the "flush" variable is set, or the buffer is at maximum
+ capacity.
+
+ The buffer is stored in the \ref AV1_COMP::lookahead field.
+ It is initialized in the first call to \ref aom_codec_encode(), in the
+ \ref av1_receive_raw_frame() sub-routine. The buffer size is defined by
+ the g_lag_in_frames parameter set in the
+ \ref aom_codec_enc_cfg_t::g_lag_in_frames struct.
+ This can be modified manually but should only be set once. On the command
+ line, the flag "--lag-in-frames" controls it. The default size is 19 for
+ non-realtime usage and 1 for realtime. Note that a maximum value of 35 is
+ enforced.
+
+ A frame will stay in the buffer as long as possible. As mentioned above,
+ the \ref av1_lookahead_pop() only removes a frame when either flush is set,
+ or the buffer is full. Note that each call to \ref aom_codec_encode() inserts
+ another frame into the buffer, and pop is called by the sub-function
+ \ref av1_encode_strategy(). The buffer is told to flush when
+ \ref aom_codec_encode() is passed a NULL image pointer. Note that the caller
+ must repeatedly call \ref aom_codec_encode() with a NULL image pointer, until
+ no more packets are available, in order to fully flush the buffer.
+
+ */
+
+/*! @} - end defgroup high_level_algo */
+
+/*!\defgroup partition_search Partition Search
+ * \ingroup encoder_algo
+ * For and overview of the partition search see \ref architecture_enc_partitions
+ * @{
+ */
+
+/*! @} - end defgroup partition_search */
+
+/*!\defgroup intra_mode_search Intra Mode Search
+ * \ingroup encoder_algo
+ * This module describes intra mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup intra_mode_search */
+
+/*!\defgroup inter_mode_search Inter Mode Search
+ * \ingroup encoder_algo
+ * This module describes inter mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup inter_mode_search */
+
+/*!\defgroup palette_mode_search Palette Mode Search
+ * \ingroup intra_mode_search
+ * This module describes palette mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup palette_mode_search */
+
+/*!\defgroup transform_search Transform Search
+ * \ingroup encoder_algo
+ * This module describes transform search algorithm in AV1.
+ * @{
+ */
+/*! @} - end defgroup transform_search */
+
+/*!\defgroup coefficient_coding Transform Coefficient Coding and Optimization
+ * \ingroup encoder_algo
+ * This module describes the algorithms of transform coefficient coding and optimization in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup coefficient_coding */
+
+/*!\defgroup in_loop_filter In-loop Filter
+ * \ingroup encoder_algo
+ * This module describes in-loop filter algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_filter */
+
+/*!\defgroup in_loop_cdef CDEF
+ * \ingroup encoder_algo
+ * This module describes the CDEF parameter search algorithm
+ * in AV1. More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_restoration */
+
+/*!\defgroup in_loop_restoration Loop Restoration
+ * \ingroup encoder_algo
+ * This module describes the loop restoration search
+ * and estimation algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_restoration */
+
+/*!\defgroup cyclic_refresh Cyclic Refresh
+ * \ingroup encoder_algo
+ * This module describes the cyclic refresh (aq-mode=3) in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup cyclic_refresh */
+
+/*!\defgroup SVC Scalable Video Coding
+ * \ingroup encoder_algo
+ * This module describes scalable video coding algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup SVC */
+/*!\defgroup variance_partition Variance Partition
+ * \ingroup encoder_algo
+ * This module describes variance partition algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup variance_partition */
+/*!\defgroup nonrd_mode_search NonRD Optimized Mode Search
+ * \ingroup encoder_algo
+ * This module describes NonRD Optimized Mode Search used in Real-Time mode.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup nonrd_mode_search */
diff --git a/doc/dev_guide/av1encoderflow.png b/doc/dev_guide/av1encoderflow.png
new file mode 100644
index 0000000..5e69fce
--- /dev/null
+++ b/doc/dev_guide/av1encoderflow.png
Binary files differ
diff --git a/doc/dev_guide/av1partitions.png b/doc/dev_guide/av1partitions.png
new file mode 100644
index 0000000..125439f
--- /dev/null
+++ b/doc/dev_guide/av1partitions.png
Binary files differ
diff --git a/doc/dev_guide/coeff_coding.png b/doc/dev_guide/coeff_coding.png
new file mode 100644
index 0000000..cba97dd
--- /dev/null
+++ b/doc/dev_guide/coeff_coding.png
Binary files differ
diff --git a/doc/dev_guide/filter_flow.png b/doc/dev_guide/filter_flow.png
new file mode 100644
index 0000000..82849a0
--- /dev/null
+++ b/doc/dev_guide/filter_flow.png
Binary files differ
diff --git a/doc/dev_guide/filter_thr.png b/doc/dev_guide/filter_thr.png
new file mode 100644
index 0000000..b833e94
--- /dev/null
+++ b/doc/dev_guide/filter_thr.png
Binary files differ
diff --git a/doc/dev_guide/genericcodecflow.png b/doc/dev_guide/genericcodecflow.png
new file mode 100644
index 0000000..65a6b2f
--- /dev/null
+++ b/doc/dev_guide/genericcodecflow.png
Binary files differ
diff --git a/doc/dev_guide/gf_group.png b/doc/dev_guide/gf_group.png
new file mode 100644
index 0000000..1cd47d2
--- /dev/null
+++ b/doc/dev_guide/gf_group.png
Binary files differ
diff --git a/doc/dev_guide/partition.png b/doc/dev_guide/partition.png
new file mode 100644
index 0000000..914d6c2
--- /dev/null
+++ b/doc/dev_guide/partition.png
Binary files differ
diff --git a/doc/dev_guide/tplgfgroupdiagram.png b/doc/dev_guide/tplgfgroupdiagram.png
new file mode 100644
index 0000000..582049a
--- /dev/null
+++ b/doc/dev_guide/tplgfgroupdiagram.png
Binary files differ
diff --git a/doc/img/edge_direction.svg b/doc/img/edge_direction.svg
new file mode 100644
index 0000000..343a2b9
--- /dev/null
+++ b/doc/img/edge_direction.svg
@@ -0,0 +1,6319 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export edge_direction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="9.25333in" height="8.04538in"
+		viewBox="0 0 666.24 579.267" xml:space="preserve" color-interpolation-filters="sRGB" class="st8">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:#000000;font-family:Calibri;font-size:0.75em}
+		.st3 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st4 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+		.st5 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st6 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+		.st7 {font-size:1em;font-style:normal}
+		.st8 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<v:layer v:name="Connector" v:index="0"/>
+		<g id="shape111-1" v:mID="111" v:groupContext="shape" transform="translate(18.12,-468.375)">
+			<title>Square</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape113-4" v:mID="113" v:groupContext="shape" transform="translate(36.12,-468.375)">
+			<title>Square.113</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape114-7" v:mID="114" v:groupContext="shape" transform="translate(54.12,-468.375)">
+			<title>Square.114</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape115-10" v:mID="115" v:groupContext="shape" transform="translate(72.12,-468.375)">
+			<title>Square.115</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape116-13" v:mID="116" v:groupContext="shape" transform="translate(18.12,-450.375)">
+			<title>Square.116</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape117-16" v:mID="117" v:groupContext="shape" transform="translate(36.12,-450.375)">
+			<title>Square.117</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape118-19" v:mID="118" v:groupContext="shape" transform="translate(54.12,-450.375)">
+			<title>Square.118</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape119-22" v:mID="119" v:groupContext="shape" transform="translate(72.12,-450.375)">
+			<title>Square.119</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape124-25" v:mID="124" v:groupContext="shape" transform="translate(18.12,-432.375)">
+			<title>Square.124</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape125-28" v:mID="125" v:groupContext="shape" transform="translate(36.12,-432.375)">
+			<title>Square.125</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape126-31" v:mID="126" v:groupContext="shape" transform="translate(54.12,-432.375)">
+			<title>Square.126</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape127-34" v:mID="127" v:groupContext="shape" transform="translate(72.12,-432.375)">
+			<title>Square.127</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape128-37" v:mID="128" v:groupContext="shape" transform="translate(18.12,-414.375)">
+			<title>Square.128</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape129-40" v:mID="129" v:groupContext="shape" transform="translate(36.12,-414.375)">
+			<title>Square.129</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape130-43" v:mID="130" v:groupContext="shape" transform="translate(54.12,-414.375)">
+			<title>Square.130</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape131-46" v:mID="131" v:groupContext="shape" transform="translate(72.12,-414.375)">
+			<title>Square.131</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape132-49" v:mID="132" v:groupContext="shape" transform="translate(18.12,-396.375)">
+			<title>Square.132</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape133-52" v:mID="133" v:groupContext="shape" transform="translate(36.12,-396.375)">
+			<title>Square.133</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape134-55" v:mID="134" v:groupContext="shape" transform="translate(54.12,-396.375)">
+			<title>Square.134</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape135-58" v:mID="135" v:groupContext="shape" transform="translate(72.12,-396.375)">
+			<title>Square.135</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape136-61" v:mID="136" v:groupContext="shape" transform="translate(18.12,-378.375)">
+			<title>Square.136</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape137-64" v:mID="137" v:groupContext="shape" transform="translate(36.12,-378.375)">
+			<title>Square.137</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape138-67" v:mID="138" v:groupContext="shape" transform="translate(54.12,-378.375)">
+			<title>Square.138</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape139-70" v:mID="139" v:groupContext="shape" transform="translate(72.12,-378.375)">
+			<title>Square.139</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape140-73" v:mID="140" v:groupContext="shape" transform="translate(18.12,-360.375)">
+			<title>Square.140</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape141-76" v:mID="141" v:groupContext="shape" transform="translate(36.12,-360.375)">
+			<title>Square.141</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape142-79" v:mID="142" v:groupContext="shape" transform="translate(54.12,-360.375)">
+			<title>Square.142</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape143-82" v:mID="143" v:groupContext="shape" transform="translate(72.12,-360.375)">
+			<title>Square.143</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape144-85" v:mID="144" v:groupContext="shape" transform="translate(18.12,-342.375)">
+			<title>Square.144</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape145-88" v:mID="145" v:groupContext="shape" transform="translate(36.12,-342.375)">
+			<title>Square.145</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape146-91" v:mID="146" v:groupContext="shape" transform="translate(54.12,-342.375)">
+			<title>Square.146</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape147-94" v:mID="147" v:groupContext="shape" transform="translate(72.12,-342.375)">
+			<title>Square.147</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape148-97" v:mID="148" v:groupContext="shape" transform="translate(90.12,-468.375)">
+			<title>Square.148</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape149-100" v:mID="149" v:groupContext="shape" transform="translate(108.12,-468.375)">
+			<title>Square.149</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape150-103" v:mID="150" v:groupContext="shape" transform="translate(126.12,-468.375)">
+			<title>Square.150</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape151-106" v:mID="151" v:groupContext="shape" transform="translate(144.12,-468.375)">
+			<title>Square.151</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape152-109" v:mID="152" v:groupContext="shape" transform="translate(90.12,-450.375)">
+			<title>Square.152</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape153-112" v:mID="153" v:groupContext="shape" transform="translate(108.12,-450.375)">
+			<title>Square.153</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape154-115" v:mID="154" v:groupContext="shape" transform="translate(126.12,-450.375)">
+			<title>Square.154</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape155-118" v:mID="155" v:groupContext="shape" transform="translate(144.12,-450.375)">
+			<title>Square.155</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape156-121" v:mID="156" v:groupContext="shape" transform="translate(90.12,-432.375)">
+			<title>Square.156</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape157-124" v:mID="157" v:groupContext="shape" transform="translate(108.12,-432.375)">
+			<title>Square.157</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape158-127" v:mID="158" v:groupContext="shape" transform="translate(126.12,-432.375)">
+			<title>Square.158</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape159-130" v:mID="159" v:groupContext="shape" transform="translate(144.12,-432.375)">
+			<title>Square.159</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape160-133" v:mID="160" v:groupContext="shape" transform="translate(90.12,-414.375)">
+			<title>Square.160</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape161-136" v:mID="161" v:groupContext="shape" transform="translate(108.12,-414.375)">
+			<title>Square.161</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape162-139" v:mID="162" v:groupContext="shape" transform="translate(126.12,-414.375)">
+			<title>Square.162</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape163-142" v:mID="163" v:groupContext="shape" transform="translate(144.12,-414.375)">
+			<title>Square.163</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape164-145" v:mID="164" v:groupContext="shape" transform="translate(90.12,-396.375)">
+			<title>Square.164</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape165-148" v:mID="165" v:groupContext="shape" transform="translate(108.12,-396.375)">
+			<title>Square.165</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape166-151" v:mID="166" v:groupContext="shape" transform="translate(126.12,-396.375)">
+			<title>Square.166</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape167-154" v:mID="167" v:groupContext="shape" transform="translate(144.12,-396.375)">
+			<title>Square.167</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape168-157" v:mID="168" v:groupContext="shape" transform="translate(90.12,-378.375)">
+			<title>Square.168</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape169-160" v:mID="169" v:groupContext="shape" transform="translate(108.12,-378.375)">
+			<title>Square.169</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape170-163" v:mID="170" v:groupContext="shape" transform="translate(126.12,-378.375)">
+			<title>Square.170</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape171-166" v:mID="171" v:groupContext="shape" transform="translate(144.12,-378.375)">
+			<title>Square.171</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape172-169" v:mID="172" v:groupContext="shape" transform="translate(90.12,-360.375)">
+			<title>Square.172</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape173-172" v:mID="173" v:groupContext="shape" transform="translate(108.12,-360.375)">
+			<title>Square.173</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape174-175" v:mID="174" v:groupContext="shape" transform="translate(126.12,-360.375)">
+			<title>Square.174</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape175-178" v:mID="175" v:groupContext="shape" transform="translate(144.12,-360.375)">
+			<title>Square.175</title>
+			<desc>13</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text>		</g>
+		<g id="shape176-181" v:mID="176" v:groupContext="shape" transform="translate(90.12,-342.375)">
+			<title>Square.176</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape177-184" v:mID="177" v:groupContext="shape" transform="translate(108.12,-342.375)">
+			<title>Square.177</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape178-187" v:mID="178" v:groupContext="shape" transform="translate(126.12,-342.375)">
+			<title>Square.178</title>
+			<desc>13</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text>		</g>
+		<g id="shape179-190" v:mID="179" v:groupContext="shape" transform="translate(144.12,-342.375)">
+			<title>Square.179</title>
+			<desc>14</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>14</text>		</g>
+		<g id="shape180-193" v:mID="180" v:groupContext="shape" transform="translate(180.12,-468.375)">
+			<title>Square.180</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape181-196" v:mID="181" v:groupContext="shape" transform="translate(198.12,-468.375)">
+			<title>Square.181</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape182-199" v:mID="182" v:groupContext="shape" transform="translate(216.12,-468.375)">
+			<title>Square.182</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape183-202" v:mID="183" v:groupContext="shape" transform="translate(234.12,-468.375)">
+			<title>Square.183</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape184-205" v:mID="184" v:groupContext="shape" transform="translate(180.12,-450.375)">
+			<title>Square.184</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape185-208" v:mID="185" v:groupContext="shape" transform="translate(198.12,-450.375)">
+			<title>Square.185</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape186-211" v:mID="186" v:groupContext="shape" transform="translate(216.12,-450.375)">
+			<title>Square.186</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape187-214" v:mID="187" v:groupContext="shape" transform="translate(234.12,-450.375)">
+			<title>Square.187</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape188-217" v:mID="188" v:groupContext="shape" transform="translate(180.12,-432.375)">
+			<title>Square.188</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape189-220" v:mID="189" v:groupContext="shape" transform="translate(198.12,-432.375)">
+			<title>Square.189</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape190-223" v:mID="190" v:groupContext="shape" transform="translate(216.12,-432.375)">
+			<title>Square.190</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape191-226" v:mID="191" v:groupContext="shape" transform="translate(234.12,-432.375)">
+			<title>Square.191</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape192-229" v:mID="192" v:groupContext="shape" transform="translate(180.12,-414.375)">
+			<title>Square.192</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape193-232" v:mID="193" v:groupContext="shape" transform="translate(198.12,-414.375)">
+			<title>Square.193</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape194-235" v:mID="194" v:groupContext="shape" transform="translate(216.12,-414.375)">
+			<title>Square.194</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape195-238" v:mID="195" v:groupContext="shape" transform="translate(234.12,-414.375)">
+			<title>Square.195</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape196-241" v:mID="196" v:groupContext="shape" transform="translate(180.12,-396.375)">
+			<title>Square.196</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape197-244" v:mID="197" v:groupContext="shape" transform="translate(198.12,-396.375)">
+			<title>Square.197</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape198-247" v:mID="198" v:groupContext="shape" transform="translate(216.12,-396.375)">
+			<title>Square.198</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape199-250" v:mID="199" v:groupContext="shape" transform="translate(234.12,-396.375)">
+			<title>Square.199</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape200-253" v:mID="200" v:groupContext="shape" transform="translate(180.12,-378.375)">
+			<title>Square.200</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape201-256" v:mID="201" v:groupContext="shape" transform="translate(198.12,-378.375)">
+			<title>Square.201</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape202-259" v:mID="202" v:groupContext="shape" transform="translate(216.12,-378.375)">
+			<title>Square.202</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape203-262" v:mID="203" v:groupContext="shape" transform="translate(234.12,-378.375)">
+			<title>Square.203</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape204-265" v:mID="204" v:groupContext="shape" transform="translate(180.12,-360.375)">
+			<title>Square.204</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape205-268" v:mID="205" v:groupContext="shape" transform="translate(198.12,-360.375)">
+			<title>Square.205</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape206-271" v:mID="206" v:groupContext="shape" transform="translate(216.12,-360.375)">
+			<title>Square.206</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape207-274" v:mID="207" v:groupContext="shape" transform="translate(234.12,-360.375)">
+			<title>Square.207</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape208-277" v:mID="208" v:groupContext="shape" transform="translate(180.12,-342.375)">
+			<title>Square.208</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape209-280" v:mID="209" v:groupContext="shape" transform="translate(198.12,-342.375)">
+			<title>Square.209</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape210-283" v:mID="210" v:groupContext="shape" transform="translate(216.12,-342.375)">
+			<title>Square.210</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape211-286" v:mID="211" v:groupContext="shape" transform="translate(234.12,-342.375)">
+			<title>Square.211</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape212-289" v:mID="212" v:groupContext="shape" transform="translate(252.12,-468.375)">
+			<title>Square.212</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape213-292" v:mID="213" v:groupContext="shape" transform="translate(270.12,-468.375)">
+			<title>Square.213</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape214-295" v:mID="214" v:groupContext="shape" transform="translate(288.12,-468.375)">
+			<title>Square.214</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape215-298" v:mID="215" v:groupContext="shape" transform="translate(306.12,-468.375)">
+			<title>Square.215</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape216-301" v:mID="216" v:groupContext="shape" transform="translate(252.12,-450.375)">
+			<title>Square.216</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape217-304" v:mID="217" v:groupContext="shape" transform="translate(270.12,-450.375)">
+			<title>Square.217</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape218-307" v:mID="218" v:groupContext="shape" transform="translate(288.12,-450.375)">
+			<title>Square.218</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape219-310" v:mID="219" v:groupContext="shape" transform="translate(306.12,-450.375)">
+			<title>Square.219</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape220-313" v:mID="220" v:groupContext="shape" transform="translate(252.12,-432.375)">
+			<title>Square.220</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape221-316" v:mID="221" v:groupContext="shape" transform="translate(270.12,-432.375)">
+			<title>Square.221</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape222-319" v:mID="222" v:groupContext="shape" transform="translate(288.12,-432.375)">
+			<title>Square.222</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape223-322" v:mID="223" v:groupContext="shape" transform="translate(306.12,-432.375)">
+			<title>Square.223</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape224-325" v:mID="224" v:groupContext="shape" transform="translate(252.12,-414.375)">
+			<title>Square.224</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape225-328" v:mID="225" v:groupContext="shape" transform="translate(270.12,-414.375)">
+			<title>Square.225</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape226-331" v:mID="226" v:groupContext="shape" transform="translate(288.12,-414.375)">
+			<title>Square.226</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape227-334" v:mID="227" v:groupContext="shape" transform="translate(306.12,-414.375)">
+			<title>Square.227</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape228-337" v:mID="228" v:groupContext="shape" transform="translate(252.12,-396.375)">
+			<title>Square.228</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape229-340" v:mID="229" v:groupContext="shape" transform="translate(270.12,-396.375)">
+			<title>Square.229</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape230-343" v:mID="230" v:groupContext="shape" transform="translate(288.12,-396.375)">
+			<title>Square.230</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape231-346" v:mID="231" v:groupContext="shape" transform="translate(306.12,-396.375)">
+			<title>Square.231</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape232-349" v:mID="232" v:groupContext="shape" transform="translate(252.12,-378.375)">
+			<title>Square.232</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape233-352" v:mID="233" v:groupContext="shape" transform="translate(270.12,-378.375)">
+			<title>Square.233</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape234-355" v:mID="234" v:groupContext="shape" transform="translate(288.12,-378.375)">
+			<title>Square.234</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape235-358" v:mID="235" v:groupContext="shape" transform="translate(306.12,-378.375)">
+			<title>Square.235</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape236-361" v:mID="236" v:groupContext="shape" transform="translate(252.12,-360.375)">
+			<title>Square.236</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape237-364" v:mID="237" v:groupContext="shape" transform="translate(270.12,-360.375)">
+			<title>Square.237</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape238-367" v:mID="238" v:groupContext="shape" transform="translate(288.12,-360.375)">
+			<title>Square.238</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape239-370" v:mID="239" v:groupContext="shape" transform="translate(306.12,-360.375)">
+			<title>Square.239</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape240-373" v:mID="240" v:groupContext="shape" transform="translate(252.12,-342.375)">
+			<title>Square.240</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape241-376" v:mID="241" v:groupContext="shape" transform="translate(270.12,-342.375)">
+			<title>Square.241</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape242-379" v:mID="242" v:groupContext="shape" transform="translate(288.12,-342.375)">
+			<title>Square.242</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape243-382" v:mID="243" v:groupContext="shape" transform="translate(306.12,-342.375)">
+			<title>Square.243</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape244-385" v:mID="244" v:groupContext="shape" transform="translate(342.12,-468.375)">
+			<title>Square.244</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape245-388" v:mID="245" v:groupContext="shape" transform="translate(360.12,-468.375)">
+			<title>Square.245</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape246-391" v:mID="246" v:groupContext="shape" transform="translate(378.12,-468.375)">
+			<title>Square.246</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape247-394" v:mID="247" v:groupContext="shape" transform="translate(396.12,-468.375)">
+			<title>Square.247</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape248-397" v:mID="248" v:groupContext="shape" transform="translate(342.12,-450.375)">
+			<title>Square.248</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape249-400" v:mID="249" v:groupContext="shape" transform="translate(360.12,-450.375)">
+			<title>Square.249</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape250-403" v:mID="250" v:groupContext="shape" transform="translate(378.12,-450.375)">
+			<title>Square.250</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape251-406" v:mID="251" v:groupContext="shape" transform="translate(396.12,-450.375)">
+			<title>Square.251</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape252-409" v:mID="252" v:groupContext="shape" transform="translate(342.12,-432.375)">
+			<title>Square.252</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape253-412" v:mID="253" v:groupContext="shape" transform="translate(360.12,-432.375)">
+			<title>Square.253</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape254-415" v:mID="254" v:groupContext="shape" transform="translate(378.12,-432.375)">
+			<title>Square.254</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape255-418" v:mID="255" v:groupContext="shape" transform="translate(396.12,-432.375)">
+			<title>Square.255</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape256-421" v:mID="256" v:groupContext="shape" transform="translate(342.12,-414.375)">
+			<title>Square.256</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape257-424" v:mID="257" v:groupContext="shape" transform="translate(360.12,-414.375)">
+			<title>Square.257</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape258-427" v:mID="258" v:groupContext="shape" transform="translate(378.12,-414.375)">
+			<title>Square.258</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape259-430" v:mID="259" v:groupContext="shape" transform="translate(396.12,-414.375)">
+			<title>Square.259</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape260-433" v:mID="260" v:groupContext="shape" transform="translate(342.12,-396.375)">
+			<title>Square.260</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape261-436" v:mID="261" v:groupContext="shape" transform="translate(360.12,-396.375)">
+			<title>Square.261</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape262-439" v:mID="262" v:groupContext="shape" transform="translate(378.12,-396.375)">
+			<title>Square.262</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape263-442" v:mID="263" v:groupContext="shape" transform="translate(396.12,-396.375)">
+			<title>Square.263</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape264-445" v:mID="264" v:groupContext="shape" transform="translate(342.12,-378.375)">
+			<title>Square.264</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape265-448" v:mID="265" v:groupContext="shape" transform="translate(360.12,-378.375)">
+			<title>Square.265</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape266-451" v:mID="266" v:groupContext="shape" transform="translate(378.12,-378.375)">
+			<title>Square.266</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape267-454" v:mID="267" v:groupContext="shape" transform="translate(396.12,-378.375)">
+			<title>Square.267</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape268-457" v:mID="268" v:groupContext="shape" transform="translate(342.12,-360.375)">
+			<title>Square.268</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape269-460" v:mID="269" v:groupContext="shape" transform="translate(360.12,-360.375)">
+			<title>Square.269</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape270-463" v:mID="270" v:groupContext="shape" transform="translate(378.12,-360.375)">
+			<title>Square.270</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape271-466" v:mID="271" v:groupContext="shape" transform="translate(396.12,-360.375)">
+			<title>Square.271</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape272-469" v:mID="272" v:groupContext="shape" transform="translate(342.12,-342.375)">
+			<title>Square.272</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape273-472" v:mID="273" v:groupContext="shape" transform="translate(360.12,-342.375)">
+			<title>Square.273</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape274-475" v:mID="274" v:groupContext="shape" transform="translate(378.12,-342.375)">
+			<title>Square.274</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape275-478" v:mID="275" v:groupContext="shape" transform="translate(396.12,-342.375)">
+			<title>Square.275</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape276-481" v:mID="276" v:groupContext="shape" transform="translate(414.12,-468.375)">
+			<title>Square.276</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape277-484" v:mID="277" v:groupContext="shape" transform="translate(432.12,-468.375)">
+			<title>Square.277</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape278-487" v:mID="278" v:groupContext="shape" transform="translate(450.12,-468.375)">
+			<title>Square.278</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape279-490" v:mID="279" v:groupContext="shape" transform="translate(468.12,-468.375)">
+			<title>Square.279</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape280-493" v:mID="280" v:groupContext="shape" transform="translate(414.12,-450.375)">
+			<title>Square.280</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape281-496" v:mID="281" v:groupContext="shape" transform="translate(432.12,-450.375)">
+			<title>Square.281</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape282-499" v:mID="282" v:groupContext="shape" transform="translate(450.12,-450.375)">
+			<title>Square.282</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape283-502" v:mID="283" v:groupContext="shape" transform="translate(468.12,-450.375)">
+			<title>Square.283</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape284-505" v:mID="284" v:groupContext="shape" transform="translate(414.12,-432.375)">
+			<title>Square.284</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape285-508" v:mID="285" v:groupContext="shape" transform="translate(432.12,-432.375)">
+			<title>Square.285</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape286-511" v:mID="286" v:groupContext="shape" transform="translate(450.12,-432.375)">
+			<title>Square.286</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape287-514" v:mID="287" v:groupContext="shape" transform="translate(468.12,-432.375)">
+			<title>Square.287</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape288-517" v:mID="288" v:groupContext="shape" transform="translate(414.12,-414.375)">
+			<title>Square.288</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape289-520" v:mID="289" v:groupContext="shape" transform="translate(432.12,-414.375)">
+			<title>Square.289</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape290-523" v:mID="290" v:groupContext="shape" transform="translate(450.12,-414.375)">
+			<title>Square.290</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape291-526" v:mID="291" v:groupContext="shape" transform="translate(468.12,-414.375)">
+			<title>Square.291</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape292-529" v:mID="292" v:groupContext="shape" transform="translate(414.12,-396.375)">
+			<title>Square.292</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape293-532" v:mID="293" v:groupContext="shape" transform="translate(432.12,-396.375)">
+			<title>Square.293</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape294-535" v:mID="294" v:groupContext="shape" transform="translate(450.12,-396.375)">
+			<title>Square.294</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape295-538" v:mID="295" v:groupContext="shape" transform="translate(468.12,-396.375)">
+			<title>Square.295</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape296-541" v:mID="296" v:groupContext="shape" transform="translate(414.12,-378.375)">
+			<title>Square.296</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape297-544" v:mID="297" v:groupContext="shape" transform="translate(432.12,-378.375)">
+			<title>Square.297</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape298-547" v:mID="298" v:groupContext="shape" transform="translate(450.12,-378.375)">
+			<title>Square.298</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape299-550" v:mID="299" v:groupContext="shape" transform="translate(468.12,-378.375)">
+			<title>Square.299</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape300-553" v:mID="300" v:groupContext="shape" transform="translate(414.12,-360.375)">
+			<title>Square.300</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape301-556" v:mID="301" v:groupContext="shape" transform="translate(432.12,-360.375)">
+			<title>Square.301</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape302-559" v:mID="302" v:groupContext="shape" transform="translate(450.12,-360.375)">
+			<title>Square.302</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape303-562" v:mID="303" v:groupContext="shape" transform="translate(468.12,-360.375)">
+			<title>Square.303</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape304-565" v:mID="304" v:groupContext="shape" transform="translate(414.12,-342.375)">
+			<title>Square.304</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape305-568" v:mID="305" v:groupContext="shape" transform="translate(432.12,-342.375)">
+			<title>Square.305</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape306-571" v:mID="306" v:groupContext="shape" transform="translate(450.12,-342.375)">
+			<title>Square.306</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape307-574" v:mID="307" v:groupContext="shape" transform="translate(468.12,-342.375)">
+			<title>Square.307</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape308-577" v:mID="308" v:groupContext="shape" transform="translate(504.12,-468.375)">
+			<title>Square.308</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape309-580" v:mID="309" v:groupContext="shape" transform="translate(522.12,-468.375)">
+			<title>Square.309</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape310-583" v:mID="310" v:groupContext="shape" transform="translate(540.12,-468.375)">
+			<title>Square.310</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape311-586" v:mID="311" v:groupContext="shape" transform="translate(558.12,-468.375)">
+			<title>Square.311</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape312-589" v:mID="312" v:groupContext="shape" transform="translate(504.12,-450.375)">
+			<title>Square.312</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape313-592" v:mID="313" v:groupContext="shape" transform="translate(522.12,-450.375)">
+			<title>Square.313</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape314-595" v:mID="314" v:groupContext="shape" transform="translate(540.12,-450.375)">
+			<title>Square.314</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape315-598" v:mID="315" v:groupContext="shape" transform="translate(558.12,-450.375)">
+			<title>Square.315</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape316-601" v:mID="316" v:groupContext="shape" transform="translate(504.12,-432.375)">
+			<title>Square.316</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape317-604" v:mID="317" v:groupContext="shape" transform="translate(522.12,-432.375)">
+			<title>Square.317</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape318-607" v:mID="318" v:groupContext="shape" transform="translate(540.12,-432.375)">
+			<title>Square.318</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape319-610" v:mID="319" v:groupContext="shape" transform="translate(558.12,-432.375)">
+			<title>Square.319</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape320-613" v:mID="320" v:groupContext="shape" transform="translate(504.12,-414.375)">
+			<title>Square.320</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape321-616" v:mID="321" v:groupContext="shape" transform="translate(522.12,-414.375)">
+			<title>Square.321</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape322-619" v:mID="322" v:groupContext="shape" transform="translate(540.12,-414.375)">
+			<title>Square.322</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape323-622" v:mID="323" v:groupContext="shape" transform="translate(558.12,-414.375)">
+			<title>Square.323</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape324-625" v:mID="324" v:groupContext="shape" transform="translate(504.12,-396.375)">
+			<title>Square.324</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape325-628" v:mID="325" v:groupContext="shape" transform="translate(522.12,-396.375)">
+			<title>Square.325</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape326-631" v:mID="326" v:groupContext="shape" transform="translate(540.12,-396.375)">
+			<title>Square.326</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape327-634" v:mID="327" v:groupContext="shape" transform="translate(558.12,-396.375)">
+			<title>Square.327</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape328-637" v:mID="328" v:groupContext="shape" transform="translate(504.12,-378.375)">
+			<title>Square.328</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape329-640" v:mID="329" v:groupContext="shape" transform="translate(522.12,-378.375)">
+			<title>Square.329</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape330-643" v:mID="330" v:groupContext="shape" transform="translate(540.12,-378.375)">
+			<title>Square.330</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape331-646" v:mID="331" v:groupContext="shape" transform="translate(558.12,-378.375)">
+			<title>Square.331</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape332-649" v:mID="332" v:groupContext="shape" transform="translate(504.12,-360.375)">
+			<title>Square.332</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape333-652" v:mID="333" v:groupContext="shape" transform="translate(522.12,-360.375)">
+			<title>Square.333</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape334-655" v:mID="334" v:groupContext="shape" transform="translate(540.12,-360.375)">
+			<title>Square.334</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape335-658" v:mID="335" v:groupContext="shape" transform="translate(558.12,-360.375)">
+			<title>Square.335</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape336-661" v:mID="336" v:groupContext="shape" transform="translate(504.12,-342.375)">
+			<title>Square.336</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape337-664" v:mID="337" v:groupContext="shape" transform="translate(522.12,-342.375)">
+			<title>Square.337</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape338-667" v:mID="338" v:groupContext="shape" transform="translate(540.12,-342.375)">
+			<title>Square.338</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape339-670" v:mID="339" v:groupContext="shape" transform="translate(558.12,-342.375)">
+			<title>Square.339</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape340-673" v:mID="340" v:groupContext="shape" transform="translate(576.12,-468.375)">
+			<title>Square.340</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape341-676" v:mID="341" v:groupContext="shape" transform="translate(594.12,-468.375)">
+			<title>Square.341</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape342-679" v:mID="342" v:groupContext="shape" transform="translate(612.12,-468.375)">
+			<title>Square.342</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape343-682" v:mID="343" v:groupContext="shape" transform="translate(630.12,-468.375)">
+			<title>Square.343</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape344-685" v:mID="344" v:groupContext="shape" transform="translate(576.12,-450.375)">
+			<title>Square.344</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape345-688" v:mID="345" v:groupContext="shape" transform="translate(594.12,-450.375)">
+			<title>Square.345</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape346-691" v:mID="346" v:groupContext="shape" transform="translate(612.12,-450.375)">
+			<title>Square.346</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape347-694" v:mID="347" v:groupContext="shape" transform="translate(630.12,-450.375)">
+			<title>Square.347</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape348-697" v:mID="348" v:groupContext="shape" transform="translate(576.12,-432.375)">
+			<title>Square.348</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape349-700" v:mID="349" v:groupContext="shape" transform="translate(594.12,-432.375)">
+			<title>Square.349</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape350-703" v:mID="350" v:groupContext="shape" transform="translate(612.12,-432.375)">
+			<title>Square.350</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape351-706" v:mID="351" v:groupContext="shape" transform="translate(630.12,-432.375)">
+			<title>Square.351</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape352-709" v:mID="352" v:groupContext="shape" transform="translate(576.12,-414.375)">
+			<title>Square.352</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape353-712" v:mID="353" v:groupContext="shape" transform="translate(594.12,-414.375)">
+			<title>Square.353</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape354-715" v:mID="354" v:groupContext="shape" transform="translate(612.12,-414.375)">
+			<title>Square.354</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape355-718" v:mID="355" v:groupContext="shape" transform="translate(630.12,-414.375)">
+			<title>Square.355</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape356-721" v:mID="356" v:groupContext="shape" transform="translate(576.12,-396.375)">
+			<title>Square.356</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape357-724" v:mID="357" v:groupContext="shape" transform="translate(594.12,-396.375)">
+			<title>Square.357</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape358-727" v:mID="358" v:groupContext="shape" transform="translate(612.12,-396.375)">
+			<title>Square.358</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape359-730" v:mID="359" v:groupContext="shape" transform="translate(630.12,-396.375)">
+			<title>Square.359</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape360-733" v:mID="360" v:groupContext="shape" transform="translate(576.12,-378.375)">
+			<title>Square.360</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape361-736" v:mID="361" v:groupContext="shape" transform="translate(594.12,-378.375)">
+			<title>Square.361</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape362-739" v:mID="362" v:groupContext="shape" transform="translate(612.12,-378.375)">
+			<title>Square.362</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape363-742" v:mID="363" v:groupContext="shape" transform="translate(630.12,-378.375)">
+			<title>Square.363</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape364-745" v:mID="364" v:groupContext="shape" transform="translate(576.12,-360.375)">
+			<title>Square.364</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape365-748" v:mID="365" v:groupContext="shape" transform="translate(594.12,-360.375)">
+			<title>Square.365</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape366-751" v:mID="366" v:groupContext="shape" transform="translate(612.12,-360.375)">
+			<title>Square.366</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape367-754" v:mID="367" v:groupContext="shape" transform="translate(630.12,-360.375)">
+			<title>Square.367</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape368-757" v:mID="368" v:groupContext="shape" transform="translate(576.12,-342.375)">
+			<title>Square.368</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape369-760" v:mID="369" v:groupContext="shape" transform="translate(594.12,-342.375)">
+			<title>Square.369</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape370-763" v:mID="370" v:groupContext="shape" transform="translate(612.12,-342.375)">
+			<title>Square.370</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape371-766" v:mID="371" v:groupContext="shape" transform="translate(630.12,-342.375)">
+			<title>Square.371</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape372-769" v:mID="372" v:groupContext="shape" transform="translate(18.12,-180.375)">
+			<title>Square.372</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape373-772" v:mID="373" v:groupContext="shape" transform="translate(36.12,-180.375)">
+			<title>Square.373</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape374-775" v:mID="374" v:groupContext="shape" transform="translate(54.12,-180.375)">
+			<title>Square.374</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape375-778" v:mID="375" v:groupContext="shape" transform="translate(72.12,-180.375)">
+			<title>Square.375</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape376-781" v:mID="376" v:groupContext="shape" transform="translate(18.12,-162.375)">
+			<title>Square.376</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape377-784" v:mID="377" v:groupContext="shape" transform="translate(36.12,-162.375)">
+			<title>Square.377</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape378-787" v:mID="378" v:groupContext="shape" transform="translate(54.12,-162.375)">
+			<title>Square.378</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape379-790" v:mID="379" v:groupContext="shape" transform="translate(72.12,-162.375)">
+			<title>Square.379</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape380-793" v:mID="380" v:groupContext="shape" transform="translate(18.12,-144.375)">
+			<title>Square.380</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape381-796" v:mID="381" v:groupContext="shape" transform="translate(36.12,-144.375)">
+			<title>Square.381</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape382-799" v:mID="382" v:groupContext="shape" transform="translate(54.12,-144.375)">
+			<title>Square.382</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape383-802" v:mID="383" v:groupContext="shape" transform="translate(72.12,-144.375)">
+			<title>Square.383</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape384-805" v:mID="384" v:groupContext="shape" transform="translate(18.12,-126.375)">
+			<title>Square.384</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape385-808" v:mID="385" v:groupContext="shape" transform="translate(36.12,-126.375)">
+			<title>Square.385</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape386-811" v:mID="386" v:groupContext="shape" transform="translate(54.12,-126.375)">
+			<title>Square.386</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape387-814" v:mID="387" v:groupContext="shape" transform="translate(72.12,-126.375)">
+			<title>Square.387</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape388-817" v:mID="388" v:groupContext="shape" transform="translate(18.12,-108.375)">
+			<title>Square.388</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape389-820" v:mID="389" v:groupContext="shape" transform="translate(36.12,-108.375)">
+			<title>Square.389</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape390-823" v:mID="390" v:groupContext="shape" transform="translate(54.12,-108.375)">
+			<title>Square.390</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape391-826" v:mID="391" v:groupContext="shape" transform="translate(72.12,-108.375)">
+			<title>Square.391</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape392-829" v:mID="392" v:groupContext="shape" transform="translate(18.12,-90.375)">
+			<title>Square.392</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape393-832" v:mID="393" v:groupContext="shape" transform="translate(36.12,-90.375)">
+			<title>Square.393</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape394-835" v:mID="394" v:groupContext="shape" transform="translate(54.12,-90.375)">
+			<title>Square.394</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape395-838" v:mID="395" v:groupContext="shape" transform="translate(72.12,-90.375)">
+			<title>Square.395</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape396-841" v:mID="396" v:groupContext="shape" transform="translate(18.12,-72.375)">
+			<title>Square.396</title>
+			<desc>13</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text>		</g>
+		<g id="shape397-844" v:mID="397" v:groupContext="shape" transform="translate(36.12,-72.375)">
+			<title>Square.397</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape398-847" v:mID="398" v:groupContext="shape" transform="translate(54.12,-72.375)">
+			<title>Square.398</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape399-850" v:mID="399" v:groupContext="shape" transform="translate(72.12,-72.375)">
+			<title>Square.399</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape400-853" v:mID="400" v:groupContext="shape" transform="translate(18.12,-54.375)">
+			<title>Square.400</title>
+			<desc>14</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>14</text>		</g>
+		<g id="shape401-856" v:mID="401" v:groupContext="shape" transform="translate(36.12,-54.375)">
+			<title>Square.401</title>
+			<desc>13</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text>		</g>
+		<g id="shape402-859" v:mID="402" v:groupContext="shape" transform="translate(54.12,-54.375)">
+			<title>Square.402</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape403-862" v:mID="403" v:groupContext="shape" transform="translate(72.12,-54.375)">
+			<title>Square.403</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape404-865" v:mID="404" v:groupContext="shape" transform="translate(90.12,-180.375)">
+			<title>Square.404</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape405-868" v:mID="405" v:groupContext="shape" transform="translate(108.12,-180.375)">
+			<title>Square.405</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape406-871" v:mID="406" v:groupContext="shape" transform="translate(126.12,-180.375)">
+			<title>Square.406</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape407-874" v:mID="407" v:groupContext="shape" transform="translate(144.12,-180.375)">
+			<title>Square.407</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape408-877" v:mID="408" v:groupContext="shape" transform="translate(90.12,-162.375)">
+			<title>Square.408</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape409-880" v:mID="409" v:groupContext="shape" transform="translate(108.12,-162.375)">
+			<title>Square.409</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape410-883" v:mID="410" v:groupContext="shape" transform="translate(126.12,-162.375)">
+			<title>Square.410</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape411-886" v:mID="411" v:groupContext="shape" transform="translate(144.12,-162.375)">
+			<title>Square.411</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape412-889" v:mID="412" v:groupContext="shape" transform="translate(90.12,-144.375)">
+			<title>Square.412</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape413-892" v:mID="413" v:groupContext="shape" transform="translate(108.12,-144.375)">
+			<title>Square.413</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape414-895" v:mID="414" v:groupContext="shape" transform="translate(126.12,-144.375)">
+			<title>Square.414</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape415-898" v:mID="415" v:groupContext="shape" transform="translate(144.12,-144.375)">
+			<title>Square.415</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape416-901" v:mID="416" v:groupContext="shape" transform="translate(90.12,-126.375)">
+			<title>Square.416</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape417-904" v:mID="417" v:groupContext="shape" transform="translate(108.12,-126.375)">
+			<title>Square.417</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape418-907" v:mID="418" v:groupContext="shape" transform="translate(126.12,-126.375)">
+			<title>Square.418</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape419-910" v:mID="419" v:groupContext="shape" transform="translate(144.12,-126.375)">
+			<title>Square.419</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape420-913" v:mID="420" v:groupContext="shape" transform="translate(90.12,-108.375)">
+			<title>Square.420</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape421-916" v:mID="421" v:groupContext="shape" transform="translate(108.12,-108.375)">
+			<title>Square.421</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape422-919" v:mID="422" v:groupContext="shape" transform="translate(126.12,-108.375)">
+			<title>Square.422</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape423-922" v:mID="423" v:groupContext="shape" transform="translate(144.12,-108.375)">
+			<title>Square.423</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape424-925" v:mID="424" v:groupContext="shape" transform="translate(90.12,-90.375)">
+			<title>Square.424</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape425-928" v:mID="425" v:groupContext="shape" transform="translate(108.12,-90.375)">
+			<title>Square.425</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape426-931" v:mID="426" v:groupContext="shape" transform="translate(126.12,-90.375)">
+			<title>Square.426</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape427-934" v:mID="427" v:groupContext="shape" transform="translate(144.12,-90.375)">
+			<title>Square.427</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape428-937" v:mID="428" v:groupContext="shape" transform="translate(90.12,-72.375)">
+			<title>Square.428</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape429-940" v:mID="429" v:groupContext="shape" transform="translate(108.12,-72.375)">
+			<title>Square.429</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape430-943" v:mID="430" v:groupContext="shape" transform="translate(126.12,-72.375)">
+			<title>Square.430</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape431-946" v:mID="431" v:groupContext="shape" transform="translate(144.12,-72.375)">
+			<title>Square.431</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape432-949" v:mID="432" v:groupContext="shape" transform="translate(90.12,-54.375)">
+			<title>Square.432</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape433-952" v:mID="433" v:groupContext="shape" transform="translate(108.12,-54.375)">
+			<title>Square.433</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape434-955" v:mID="434" v:groupContext="shape" transform="translate(126.12,-54.375)">
+			<title>Square.434</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape435-958" v:mID="435" v:groupContext="shape" transform="translate(144.12,-54.375)">
+			<title>Square.435</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape436-961" v:mID="436" v:groupContext="shape" transform="translate(180.12,-180.375)">
+			<title>Square.436</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape437-964" v:mID="437" v:groupContext="shape" transform="translate(198.12,-180.375)">
+			<title>Square.437</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape438-967" v:mID="438" v:groupContext="shape" transform="translate(216.12,-180.375)">
+			<title>Square.438</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape439-970" v:mID="439" v:groupContext="shape" transform="translate(234.12,-180.375)">
+			<title>Square.439</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape440-973" v:mID="440" v:groupContext="shape" transform="translate(180.12,-162.375)">
+			<title>Square.440</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape441-976" v:mID="441" v:groupContext="shape" transform="translate(198.12,-162.375)">
+			<title>Square.441</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape442-979" v:mID="442" v:groupContext="shape" transform="translate(216.12,-162.375)">
+			<title>Square.442</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape443-982" v:mID="443" v:groupContext="shape" transform="translate(234.12,-162.375)">
+			<title>Square.443</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape444-985" v:mID="444" v:groupContext="shape" transform="translate(180.12,-144.375)">
+			<title>Square.444</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape445-988" v:mID="445" v:groupContext="shape" transform="translate(198.12,-144.375)">
+			<title>Square.445</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape446-991" v:mID="446" v:groupContext="shape" transform="translate(216.12,-144.375)">
+			<title>Square.446</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape447-994" v:mID="447" v:groupContext="shape" transform="translate(234.12,-144.375)">
+			<title>Square.447</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape448-997" v:mID="448" v:groupContext="shape" transform="translate(180.12,-126.375)">
+			<title>Square.448</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape449-1000" v:mID="449" v:groupContext="shape" transform="translate(198.12,-126.375)">
+			<title>Square.449</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape450-1003" v:mID="450" v:groupContext="shape" transform="translate(216.12,-126.375)">
+			<title>Square.450</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape451-1006" v:mID="451" v:groupContext="shape" transform="translate(234.12,-126.375)">
+			<title>Square.451</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape452-1009" v:mID="452" v:groupContext="shape" transform="translate(180.12,-108.375)">
+			<title>Square.452</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape453-1012" v:mID="453" v:groupContext="shape" transform="translate(198.12,-108.375)">
+			<title>Square.453</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape454-1015" v:mID="454" v:groupContext="shape" transform="translate(216.12,-108.375)">
+			<title>Square.454</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape455-1018" v:mID="455" v:groupContext="shape" transform="translate(234.12,-108.375)">
+			<title>Square.455</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape456-1021" v:mID="456" v:groupContext="shape" transform="translate(180.12,-90.375)">
+			<title>Square.456</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape457-1024" v:mID="457" v:groupContext="shape" transform="translate(198.12,-90.375)">
+			<title>Square.457</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape458-1027" v:mID="458" v:groupContext="shape" transform="translate(216.12,-90.375)">
+			<title>Square.458</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape459-1030" v:mID="459" v:groupContext="shape" transform="translate(234.12,-90.375)">
+			<title>Square.459</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape460-1033" v:mID="460" v:groupContext="shape" transform="translate(180.12,-72.375)">
+			<title>Square.460</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape461-1036" v:mID="461" v:groupContext="shape" transform="translate(198.12,-72.375)">
+			<title>Square.461</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape462-1039" v:mID="462" v:groupContext="shape" transform="translate(216.12,-72.375)">
+			<title>Square.462</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape463-1042" v:mID="463" v:groupContext="shape" transform="translate(234.12,-72.375)">
+			<title>Square.463</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape464-1045" v:mID="464" v:groupContext="shape" transform="translate(180.12,-54.375)">
+			<title>Square.464</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape465-1048" v:mID="465" v:groupContext="shape" transform="translate(198.12,-54.375)">
+			<title>Square.465</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape466-1051" v:mID="466" v:groupContext="shape" transform="translate(216.12,-54.375)">
+			<title>Square.466</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape467-1054" v:mID="467" v:groupContext="shape" transform="translate(234.12,-54.375)">
+			<title>Square.467</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape468-1057" v:mID="468" v:groupContext="shape" transform="translate(252.12,-180.375)">
+			<title>Square.468</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape469-1060" v:mID="469" v:groupContext="shape" transform="translate(270.12,-180.375)">
+			<title>Square.469</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape470-1063" v:mID="470" v:groupContext="shape" transform="translate(288.12,-180.375)">
+			<title>Square.470</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape471-1066" v:mID="471" v:groupContext="shape" transform="translate(306.12,-180.375)">
+			<title>Square.471</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape472-1069" v:mID="472" v:groupContext="shape" transform="translate(252.12,-162.375)">
+			<title>Square.472</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape473-1072" v:mID="473" v:groupContext="shape" transform="translate(270.12,-162.375)">
+			<title>Square.473</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape474-1075" v:mID="474" v:groupContext="shape" transform="translate(288.12,-162.375)">
+			<title>Square.474</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape475-1078" v:mID="475" v:groupContext="shape" transform="translate(306.12,-162.375)">
+			<title>Square.475</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape476-1081" v:mID="476" v:groupContext="shape" transform="translate(252.12,-144.375)">
+			<title>Square.476</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape477-1084" v:mID="477" v:groupContext="shape" transform="translate(270.12,-144.375)">
+			<title>Square.477</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape478-1087" v:mID="478" v:groupContext="shape" transform="translate(288.12,-144.375)">
+			<title>Square.478</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape479-1090" v:mID="479" v:groupContext="shape" transform="translate(306.12,-144.375)">
+			<title>Square.479</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape480-1093" v:mID="480" v:groupContext="shape" transform="translate(252.12,-126.375)">
+			<title>Square.480</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape481-1096" v:mID="481" v:groupContext="shape" transform="translate(270.12,-126.375)">
+			<title>Square.481</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape482-1099" v:mID="482" v:groupContext="shape" transform="translate(288.12,-126.375)">
+			<title>Square.482</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape483-1102" v:mID="483" v:groupContext="shape" transform="translate(306.12,-126.375)">
+			<title>Square.483</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape484-1105" v:mID="484" v:groupContext="shape" transform="translate(252.12,-108.375)">
+			<title>Square.484</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape485-1108" v:mID="485" v:groupContext="shape" transform="translate(270.12,-108.375)">
+			<title>Square.485</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape486-1111" v:mID="486" v:groupContext="shape" transform="translate(288.12,-108.375)">
+			<title>Square.486</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape487-1114" v:mID="487" v:groupContext="shape" transform="translate(306.12,-108.375)">
+			<title>Square.487</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape488-1117" v:mID="488" v:groupContext="shape" transform="translate(252.12,-90.375)">
+			<title>Square.488</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape489-1120" v:mID="489" v:groupContext="shape" transform="translate(270.12,-90.375)">
+			<title>Square.489</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape490-1123" v:mID="490" v:groupContext="shape" transform="translate(288.12,-90.375)">
+			<title>Square.490</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape491-1126" v:mID="491" v:groupContext="shape" transform="translate(306.12,-90.375)">
+			<title>Square.491</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape492-1129" v:mID="492" v:groupContext="shape" transform="translate(252.12,-72.375)">
+			<title>Square.492</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape493-1132" v:mID="493" v:groupContext="shape" transform="translate(270.12,-72.375)">
+			<title>Square.493</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape494-1135" v:mID="494" v:groupContext="shape" transform="translate(288.12,-72.375)">
+			<title>Square.494</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape495-1138" v:mID="495" v:groupContext="shape" transform="translate(306.12,-72.375)">
+			<title>Square.495</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape496-1141" v:mID="496" v:groupContext="shape" transform="translate(252.12,-54.375)">
+			<title>Square.496</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape497-1144" v:mID="497" v:groupContext="shape" transform="translate(270.12,-54.375)">
+			<title>Square.497</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape498-1147" v:mID="498" v:groupContext="shape" transform="translate(288.12,-54.375)">
+			<title>Square.498</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape499-1150" v:mID="499" v:groupContext="shape" transform="translate(306.12,-54.375)">
+			<title>Square.499</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape500-1153" v:mID="500" v:groupContext="shape" transform="translate(342.12,-180.375)">
+			<title>Square.500</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape501-1156" v:mID="501" v:groupContext="shape" transform="translate(360.12,-180.375)">
+			<title>Square.501</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape502-1159" v:mID="502" v:groupContext="shape" transform="translate(378.12,-180.375)">
+			<title>Square.502</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape503-1162" v:mID="503" v:groupContext="shape" transform="translate(396.12,-180.375)">
+			<title>Square.503</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape504-1165" v:mID="504" v:groupContext="shape" transform="translate(342.12,-162.375)">
+			<title>Square.504</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape505-1168" v:mID="505" v:groupContext="shape" transform="translate(360.12,-162.375)">
+			<title>Square.505</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape506-1171" v:mID="506" v:groupContext="shape" transform="translate(378.12,-162.375)">
+			<title>Square.506</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape507-1174" v:mID="507" v:groupContext="shape" transform="translate(396.12,-162.375)">
+			<title>Square.507</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape508-1177" v:mID="508" v:groupContext="shape" transform="translate(342.12,-144.375)">
+			<title>Square.508</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape509-1180" v:mID="509" v:groupContext="shape" transform="translate(360.12,-144.375)">
+			<title>Square.509</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape510-1183" v:mID="510" v:groupContext="shape" transform="translate(378.12,-144.375)">
+			<title>Square.510</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape511-1186" v:mID="511" v:groupContext="shape" transform="translate(396.12,-144.375)">
+			<title>Square.511</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape512-1189" v:mID="512" v:groupContext="shape" transform="translate(342.12,-126.375)">
+			<title>Square.512</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape513-1192" v:mID="513" v:groupContext="shape" transform="translate(360.12,-126.375)">
+			<title>Square.513</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape514-1195" v:mID="514" v:groupContext="shape" transform="translate(378.12,-126.375)">
+			<title>Square.514</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape515-1198" v:mID="515" v:groupContext="shape" transform="translate(396.12,-126.375)">
+			<title>Square.515</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape516-1201" v:mID="516" v:groupContext="shape" transform="translate(342.12,-108.375)">
+			<title>Square.516</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape517-1204" v:mID="517" v:groupContext="shape" transform="translate(360.12,-108.375)">
+			<title>Square.517</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape518-1207" v:mID="518" v:groupContext="shape" transform="translate(378.12,-108.375)">
+			<title>Square.518</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape519-1210" v:mID="519" v:groupContext="shape" transform="translate(396.12,-108.375)">
+			<title>Square.519</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape520-1213" v:mID="520" v:groupContext="shape" transform="translate(342.12,-90.375)">
+			<title>Square.520</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape521-1216" v:mID="521" v:groupContext="shape" transform="translate(360.12,-90.375)">
+			<title>Square.521</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape522-1219" v:mID="522" v:groupContext="shape" transform="translate(378.12,-90.375)">
+			<title>Square.522</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape523-1222" v:mID="523" v:groupContext="shape" transform="translate(396.12,-90.375)">
+			<title>Square.523</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape524-1225" v:mID="524" v:groupContext="shape" transform="translate(342.12,-72.375)">
+			<title>Square.524</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape525-1228" v:mID="525" v:groupContext="shape" transform="translate(360.12,-72.375)">
+			<title>Square.525</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape526-1231" v:mID="526" v:groupContext="shape" transform="translate(378.12,-72.375)">
+			<title>Square.526</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape527-1234" v:mID="527" v:groupContext="shape" transform="translate(396.12,-72.375)">
+			<title>Square.527</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape528-1237" v:mID="528" v:groupContext="shape" transform="translate(342.12,-54.375)">
+			<title>Square.528</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape529-1240" v:mID="529" v:groupContext="shape" transform="translate(360.12,-54.375)">
+			<title>Square.529</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape530-1243" v:mID="530" v:groupContext="shape" transform="translate(378.12,-54.375)">
+			<title>Square.530</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape531-1246" v:mID="531" v:groupContext="shape" transform="translate(396.12,-54.375)">
+			<title>Square.531</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape532-1249" v:mID="532" v:groupContext="shape" transform="translate(414.12,-180.375)">
+			<title>Square.532</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape533-1252" v:mID="533" v:groupContext="shape" transform="translate(432.12,-180.375)">
+			<title>Square.533</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape534-1255" v:mID="534" v:groupContext="shape" transform="translate(450.12,-180.375)">
+			<title>Square.534</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape535-1258" v:mID="535" v:groupContext="shape" transform="translate(468.12,-180.375)">
+			<title>Square.535</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape536-1261" v:mID="536" v:groupContext="shape" transform="translate(414.12,-162.375)">
+			<title>Square.536</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape537-1264" v:mID="537" v:groupContext="shape" transform="translate(432.12,-162.375)">
+			<title>Square.537</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape538-1267" v:mID="538" v:groupContext="shape" transform="translate(450.12,-162.375)">
+			<title>Square.538</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape539-1270" v:mID="539" v:groupContext="shape" transform="translate(468.12,-162.375)">
+			<title>Square.539</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape540-1273" v:mID="540" v:groupContext="shape" transform="translate(414.12,-144.375)">
+			<title>Square.540</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape541-1276" v:mID="541" v:groupContext="shape" transform="translate(432.12,-144.375)">
+			<title>Square.541</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape542-1279" v:mID="542" v:groupContext="shape" transform="translate(450.12,-144.375)">
+			<title>Square.542</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape543-1282" v:mID="543" v:groupContext="shape" transform="translate(468.12,-144.375)">
+			<title>Square.543</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape544-1285" v:mID="544" v:groupContext="shape" transform="translate(414.12,-126.375)">
+			<title>Square.544</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape545-1288" v:mID="545" v:groupContext="shape" transform="translate(432.12,-126.375)">
+			<title>Square.545</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape546-1291" v:mID="546" v:groupContext="shape" transform="translate(450.12,-126.375)">
+			<title>Square.546</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape547-1294" v:mID="547" v:groupContext="shape" transform="translate(468.12,-126.375)">
+			<title>Square.547</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape548-1297" v:mID="548" v:groupContext="shape" transform="translate(414.12,-108.375)">
+			<title>Square.548</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape549-1300" v:mID="549" v:groupContext="shape" transform="translate(432.12,-108.375)">
+			<title>Square.549</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape550-1303" v:mID="550" v:groupContext="shape" transform="translate(450.12,-108.375)">
+			<title>Square.550</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape551-1306" v:mID="551" v:groupContext="shape" transform="translate(468.12,-108.375)">
+			<title>Square.551</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape552-1309" v:mID="552" v:groupContext="shape" transform="translate(414.12,-90.375)">
+			<title>Square.552</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape553-1312" v:mID="553" v:groupContext="shape" transform="translate(432.12,-90.375)">
+			<title>Square.553</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape554-1315" v:mID="554" v:groupContext="shape" transform="translate(450.12,-90.375)">
+			<title>Square.554</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape555-1318" v:mID="555" v:groupContext="shape" transform="translate(468.12,-90.375)">
+			<title>Square.555</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape556-1321" v:mID="556" v:groupContext="shape" transform="translate(414.12,-72.375)">
+			<title>Square.556</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape557-1324" v:mID="557" v:groupContext="shape" transform="translate(432.12,-72.375)">
+			<title>Square.557</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape558-1327" v:mID="558" v:groupContext="shape" transform="translate(450.12,-72.375)">
+			<title>Square.558</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape559-1330" v:mID="559" v:groupContext="shape" transform="translate(468.12,-72.375)">
+			<title>Square.559</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape560-1333" v:mID="560" v:groupContext="shape" transform="translate(414.12,-54.375)">
+			<title>Square.560</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape561-1336" v:mID="561" v:groupContext="shape" transform="translate(432.12,-54.375)">
+			<title>Square.561</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape562-1339" v:mID="562" v:groupContext="shape" transform="translate(450.12,-54.375)">
+			<title>Square.562</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape563-1342" v:mID="563" v:groupContext="shape" transform="translate(468.12,-54.375)">
+			<title>Square.563</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape564-1345" v:mID="564" v:groupContext="shape" transform="translate(504.12,-180.375)">
+			<title>Square.564</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape565-1348" v:mID="565" v:groupContext="shape" transform="translate(522.12,-180.375)">
+			<title>Square.565</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape566-1351" v:mID="566" v:groupContext="shape" transform="translate(540.12,-180.375)">
+			<title>Square.566</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape567-1354" v:mID="567" v:groupContext="shape" transform="translate(558.12,-180.375)">
+			<title>Square.567</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape568-1357" v:mID="568" v:groupContext="shape" transform="translate(504.12,-162.375)">
+			<title>Square.568</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape569-1360" v:mID="569" v:groupContext="shape" transform="translate(522.12,-162.375)">
+			<title>Square.569</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape570-1363" v:mID="570" v:groupContext="shape" transform="translate(540.12,-162.375)">
+			<title>Square.570</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape571-1366" v:mID="571" v:groupContext="shape" transform="translate(558.12,-162.375)">
+			<title>Square.571</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape572-1369" v:mID="572" v:groupContext="shape" transform="translate(504.12,-144.375)">
+			<title>Square.572</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape573-1372" v:mID="573" v:groupContext="shape" transform="translate(522.12,-144.375)">
+			<title>Square.573</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape574-1375" v:mID="574" v:groupContext="shape" transform="translate(540.12,-144.375)">
+			<title>Square.574</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape575-1378" v:mID="575" v:groupContext="shape" transform="translate(558.12,-144.375)">
+			<title>Square.575</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape576-1381" v:mID="576" v:groupContext="shape" transform="translate(504.12,-126.375)">
+			<title>Square.576</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape577-1384" v:mID="577" v:groupContext="shape" transform="translate(522.12,-126.375)">
+			<title>Square.577</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape578-1387" v:mID="578" v:groupContext="shape" transform="translate(540.12,-126.375)">
+			<title>Square.578</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape579-1390" v:mID="579" v:groupContext="shape" transform="translate(558.12,-126.375)">
+			<title>Square.579</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape580-1393" v:mID="580" v:groupContext="shape" transform="translate(504.12,-108.375)">
+			<title>Square.580</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape581-1396" v:mID="581" v:groupContext="shape" transform="translate(522.12,-108.375)">
+			<title>Square.581</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape582-1399" v:mID="582" v:groupContext="shape" transform="translate(540.12,-108.375)">
+			<title>Square.582</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape583-1402" v:mID="583" v:groupContext="shape" transform="translate(558.12,-108.375)">
+			<title>Square.583</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape584-1405" v:mID="584" v:groupContext="shape" transform="translate(504.12,-90.375)">
+			<title>Square.584</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape585-1408" v:mID="585" v:groupContext="shape" transform="translate(522.12,-90.375)">
+			<title>Square.585</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape586-1411" v:mID="586" v:groupContext="shape" transform="translate(540.12,-90.375)">
+			<title>Square.586</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape587-1414" v:mID="587" v:groupContext="shape" transform="translate(558.12,-90.375)">
+			<title>Square.587</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape588-1417" v:mID="588" v:groupContext="shape" transform="translate(504.12,-72.375)">
+			<title>Square.588</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape589-1420" v:mID="589" v:groupContext="shape" transform="translate(522.12,-72.375)">
+			<title>Square.589</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape590-1423" v:mID="590" v:groupContext="shape" transform="translate(540.12,-72.375)">
+			<title>Square.590</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape591-1426" v:mID="591" v:groupContext="shape" transform="translate(558.12,-72.375)">
+			<title>Square.591</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape592-1429" v:mID="592" v:groupContext="shape" transform="translate(504.12,-54.375)">
+			<title>Square.592</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape593-1432" v:mID="593" v:groupContext="shape" transform="translate(522.12,-54.375)">
+			<title>Square.593</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape594-1435" v:mID="594" v:groupContext="shape" transform="translate(540.12,-54.375)">
+			<title>Square.594</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape595-1438" v:mID="595" v:groupContext="shape" transform="translate(558.12,-54.375)">
+			<title>Square.595</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape596-1441" v:mID="596" v:groupContext="shape" transform="translate(576.12,-180.375)">
+			<title>Square.596</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape597-1444" v:mID="597" v:groupContext="shape" transform="translate(594.12,-180.375)">
+			<title>Square.597</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape598-1447" v:mID="598" v:groupContext="shape" transform="translate(612.12,-180.375)">
+			<title>Square.598</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape599-1450" v:mID="599" v:groupContext="shape" transform="translate(630.12,-180.375)">
+			<title>Square.599</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape600-1453" v:mID="600" v:groupContext="shape" transform="translate(576.12,-162.375)">
+			<title>Square.600</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape601-1456" v:mID="601" v:groupContext="shape" transform="translate(594.12,-162.375)">
+			<title>Square.601</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape602-1459" v:mID="602" v:groupContext="shape" transform="translate(612.12,-162.375)">
+			<title>Square.602</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape603-1462" v:mID="603" v:groupContext="shape" transform="translate(630.12,-162.375)">
+			<title>Square.603</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape604-1465" v:mID="604" v:groupContext="shape" transform="translate(576.12,-144.375)">
+			<title>Square.604</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape605-1468" v:mID="605" v:groupContext="shape" transform="translate(594.12,-144.375)">
+			<title>Square.605</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape606-1471" v:mID="606" v:groupContext="shape" transform="translate(612.12,-144.375)">
+			<title>Square.606</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape607-1474" v:mID="607" v:groupContext="shape" transform="translate(630.12,-144.375)">
+			<title>Square.607</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape608-1477" v:mID="608" v:groupContext="shape" transform="translate(576.12,-126.375)">
+			<title>Square.608</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape609-1480" v:mID="609" v:groupContext="shape" transform="translate(594.12,-126.375)">
+			<title>Square.609</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape610-1483" v:mID="610" v:groupContext="shape" transform="translate(612.12,-126.375)">
+			<title>Square.610</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape611-1486" v:mID="611" v:groupContext="shape" transform="translate(630.12,-126.375)">
+			<title>Square.611</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape612-1489" v:mID="612" v:groupContext="shape" transform="translate(576.12,-108.375)">
+			<title>Square.612</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape613-1492" v:mID="613" v:groupContext="shape" transform="translate(594.12,-108.375)">
+			<title>Square.613</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape614-1495" v:mID="614" v:groupContext="shape" transform="translate(612.12,-108.375)">
+			<title>Square.614</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape615-1498" v:mID="615" v:groupContext="shape" transform="translate(630.12,-108.375)">
+			<title>Square.615</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape616-1501" v:mID="616" v:groupContext="shape" transform="translate(576.12,-90.375)">
+			<title>Square.616</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape617-1504" v:mID="617" v:groupContext="shape" transform="translate(594.12,-90.375)">
+			<title>Square.617</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape618-1507" v:mID="618" v:groupContext="shape" transform="translate(612.12,-90.375)">
+			<title>Square.618</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape619-1510" v:mID="619" v:groupContext="shape" transform="translate(630.12,-90.375)">
+			<title>Square.619</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape620-1513" v:mID="620" v:groupContext="shape" transform="translate(576.12,-72.375)">
+			<title>Square.620</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape621-1516" v:mID="621" v:groupContext="shape" transform="translate(594.12,-72.375)">
+			<title>Square.621</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape622-1519" v:mID="622" v:groupContext="shape" transform="translate(612.12,-72.375)">
+			<title>Square.622</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape623-1522" v:mID="623" v:groupContext="shape" transform="translate(630.12,-72.375)">
+			<title>Square.623</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape624-1525" v:mID="624" v:groupContext="shape" transform="translate(576.12,-54.375)">
+			<title>Square.624</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape625-1528" v:mID="625" v:groupContext="shape" transform="translate(594.12,-54.375)">
+			<title>Square.625</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape626-1531" v:mID="626" v:groupContext="shape" transform="translate(612.12,-54.375)">
+			<title>Square.626</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape627-1534" v:mID="627" v:groupContext="shape" transform="translate(630.12,-54.375)">
+			<title>Square.627</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape630-1537" v:mID="630" v:groupContext="shape" transform="translate(472.189,-335.711) rotate(45)">
+			<title>Sheet.630</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid1">
+						<rect width="89.024" height="228.01" id="mfid2"/>
+					</clipPath>
+					<g clip-path="url(#mfid1)">
+						<mask id="mfid3">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid4" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid3)">
+									<use xlink:href="#mfid2"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid5" width="90" height="229" xlink:href=""/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid4)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid6">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid5" clip-path="url(#mfid6)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape631-1540" v:mID="631" v:groupContext="shape" transform="translate(773.187,-98.8741) rotate(75)">
+			<title>Sheet.631</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid7">
+						<rect width="89.024" height="228.01" id="mfid8"/>
+					</clipPath>
+					<g clip-path="url(#mfid7)">
+						<mask id="mfid9">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid10" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid9)">
+									<use xlink:href="#mfid8"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid11" width="90" height="229" xlink:href=""/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid10)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid12">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid11" clip-path="url(#mfid12)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape632-1543" v:mID="632" v:groupContext="shape" transform="translate(950.873,41.6775) rotate(90)">
+			<title>Sheet.632</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid13">
+						<rect width="89.024" height="228.01" id="mfid14"/>
+					</clipPath>
+					<g clip-path="url(#mfid13)">
+						<mask id="mfid15">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid16" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid15)">
+									<use xlink:href="#mfid14"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid17" width="90" height="229" xlink:href=""/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid16)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid18">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid17" clip-path="url(#mfid18)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape633-1546" v:mID="633" v:groupContext="shape" transform="translate(1104.93,181.961) rotate(105)">
+			<title>Sheet.633</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid19">
+						<rect width="89.024" height="228.01" id="mfid20"/>
+					</clipPath>
+					<g clip-path="url(#mfid19)">
+						<mask id="mfid21">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid22" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid21)">
+									<use xlink:href="#mfid20"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid23" width="90" height="229" xlink:href=""/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid22)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid24">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid23" clip-path="url(#mfid24)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape634-1549" v:mID="634" v:groupContext="shape" transform="translate(570.995,596.312) rotate(120)">
+			<title>Sheet.634</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid25">
+						<rect width="89.024" height="228.01" id="mfid26"/>
+					</clipPath>
+					<g clip-path="url(#mfid25)">
+						<mask id="mfid27">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid28" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid27)">
+									<use xlink:href="#mfid26"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid29" width="90" height="229" xlink:href=""/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid28)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid30">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid29" clip-path="url(#mfid30)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape635-1552" v:mID="635" v:groupContext="shape" transform="translate(538.497,799.539) rotate(150)">
+			<title>Sheet.635</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid31">
+						<rect width="89.024" height="228.01" id="mfid32"/>
+					</clipPath>
+					<g clip-path="url(#mfid31)">
+						<mask id="mfid33">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid34" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid33)">
+									<use xlink:href="#mfid32"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid35" width="90" height="229" xlink:href=""/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid34)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid36">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid35" clip-path="url(#mfid36)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape636-1555" v:mID="636" v:groupContext="shape" transform="translate(398.905,-202.875)">
+			<title>Sheet.636</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid37">
+						<rect width="89.024" height="228.01" id="mfid38"/>
+					</clipPath>
+					<g clip-path="url(#mfid37)">
+						<mask id="mfid39">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid40" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid39)">
+									<use xlink:href="#mfid38"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid41" width="90" height="229" xlink:href=""/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid40)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid42">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid41" clip-path="url(#mfid42)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape637-1558" v:mID="637" v:groupContext="shape" transform="translate(838.754,-138.135) rotate(30)">
+			<title>Sheet.637</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid43">
+						<rect width="89.024" height="228.01" id="mfid44"/>
+					</clipPath>
+					<g clip-path="url(#mfid43)">
+						<mask id="mfid45">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid46" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid45)">
+									<use xlink:href="#mfid44"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid47" width="90" height="229" xlink:href=""/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid46)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid48">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid47" clip-path="url(#mfid48)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape638-1561" v:mID="638" v:groupContext="shape" transform="translate(36.12,-306.375)">
+			<title>Sheet.638</title>
+			<desc>d = 0</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 0</tspan></text>		</g>
+		<g id="shape639-1565" v:mID="639" v:groupContext="shape" transform="translate(198.12,-306.375)">
+			<title>Sheet.639</title>
+			<desc>d = 1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 1</tspan></text>		</g>
+		<g id="shape640-1569" v:mID="640" v:groupContext="shape" transform="translate(360.12,-306.375)">
+			<title>Sheet.640</title>
+			<desc>d = 2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 2</tspan></text>		</g>
+		<g id="shape641-1573" v:mID="641" v:groupContext="shape" transform="translate(522.12,-306.375)">
+			<title>Sheet.641</title>
+			<desc>d = 3</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 3</tspan></text>		</g>
+		<g id="shape642-1577" v:mID="642" v:groupContext="shape" transform="translate(36.12,-18.375)">
+			<title>Sheet.642</title>
+			<desc>d = 4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 4</tspan></text>		</g>
+		<g id="shape643-1581" v:mID="643" v:groupContext="shape" transform="translate(198.12,-18.375)">
+			<title>Sheet.643</title>
+			<desc>d = 5</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 5</tspan></text>		</g>
+		<g id="shape644-1585" v:mID="644" v:groupContext="shape" transform="translate(360.12,-18.375)">
+			<title>Sheet.644</title>
+			<desc>d = 6</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 6</tspan></text>		</g>
+		<g id="shape645-1589" v:mID="645" v:groupContext="shape" transform="translate(522.12,-18.375)">
+			<title>Sheet.645</title>
+			<desc>d = 7</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 7</tspan></text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/equ_dir_search.svg b/doc/img/equ_dir_search.svg
new file mode 100644
index 0000000..3f14e3d
--- /dev/null
+++ b/doc/img/equ_dir_search.svg
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dir_search.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="7.17726in" height="0.950904in"
+		viewBox="0 0 516.763 68.4651" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="480.013" height="31.7151" class="st1"/>
+			<image x="0" y="36.75" width="480.013" height="31.7151" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAABGAAAABKCAYAAAD0diLqAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AADUmSURBVHhe7Z3pr2VF9b/9B3zjCxJfEBITYwwxhhiiaUKAQIB8MQoEBYLMo8woyCwoo8zIPDc2NINAg0wKCkIQZLJFQFBQ
+						hm6QSQRknvYvz/au+6uurl279lD77HP78yQn3ffcc/fZVbVqrVVrrar9mUIIIYQQQgghhBBCZEUBGCGEEEIIIYQQQojMKAAjhBBCCCGE
+						EEIIkRkFYIQQQgghhBBCCCEyowCMEEIIIYQQQgghRGYUgBFCCCGEEEIIIYTIjAIwQgghhBBCCCGEEJlRAEYIIYQQQgghhBAiMwrACCFE
+						Ap9++mlx2223FbfffvvMO/UsWrSouPvuu2d+EkIIIfLw0UcfFVdeeWWxePHimXfi3HvvvcW1115b2jYhhBDDoQCMEEIk8OCDDxY/+9nP
+						Sic3FT7L3/z1r3+deUcIIYTon+uvv75YsGBBckCFz82fP7/4/e9/P/OOEEKIIVAARgghanjzzTeLfffdt/jnP/858046Tz75ZHHAAQcU
+						77777sw7QgghRH8sXbq02HPPPYvXX3995p00Xn755WK//fYr/xVCCDEMCsAIIRrz/PPPF1tttVWx/vrrD/raaKONiocffnjmLobjt7/9
+						bfHjH/+4+Pjjj2feSeeDDz4ofvjDH5bXEEKsePz73/8uzjzzzOI///nPzDtxXnnlleLUU08t3n777Zl3RFMeeeSR0l6E7EjO1xZbbFE8
+						++yzM3cxHBdeeGH5asMpp5zS+m+FENNN0+31sk/9oACMEKIxZNm+853vFJ/5zGdmX+uss05x4403FnfeeWejF4GJs846qzj++OPLF9Ui
+						XOtzn/vcMte31+GHH94qENIWAihkCDFQbbnhhhuKgw8+eND7FkJMnvfff7848sgjG29DJIBw+umnN9ryKP4/zzzzTLHGGmssYzu++93v
+						louMkB2KvdD9BCnMRu2+++7FmmuuWXz2s59d5vr2GjqY8cYbbxTbbLNNKTNteOihh4oddtiheOutt2beEUKsKLTZXi/71B0FYIQQrWBB
+						8fWvf33W6cQZvfzyy3s70O+TTz4pM4lnn3126eza9/CdbbYCtYVoP1nNp556auad5mCscP6blocLIaab3/zmN+XivaleJFh74oknFnfd
+						ddfMO6IpnG2yyiqrzNoO/t/neScsPh5//PFy8fKVr3xl9ns22WSTQXX9E088UVaktv3OF154odhss8062TghxPTRdnu97FN3FIARQrSC
+						BQUBFzcLiINLNL1vcHTJQq6++url9wyZYSR4summmxavvfbazDv/g/LLgw46qJg3b16ZJb3uuuuKb3/728X//d//Fffdd9/Mp/4HDi7X
+						4DwYIcSKATpijz32KB577LGZd5pBZcLOO++sUu+WYDeoWDH7xIuqGKpj+oYzvjgA90tf+lJpE4fccopt3HXXXSvPGcMeYZfWW2+9Yrfd
+						diuTCi5Uvmy99dbFPffcM/OOEGJFoMv2etmnbigAI4RoDeX1nG/iOrgEIV566aWZT/QLZymg8IfMMOKU7rTTTsU777wz887/IOjywAMP
+						FJdddlnpdPMzQan777+/PHvgxRdfnPlkUQZvNtxww+UCM0KIuUvXrR1dt5aI8HZZthDlWjRQtYn+33vvvUv7OATYoCOOOCJYZUWwaccd
+						dyz+9a9/lQGaH/3oR+Wjql14nwCO/74QYu7SdXu97FM3BgnA/OMf/xjFAFFqRTnwUEZxLNBeFodjiFJSHfH3v/995icxLcRkiGALQRfX
+						wT3kkEOy7Q3lIEuUPnN5CAjA+NlFtkdxbg1t52wX1/mlHHy11VZbJthCAGaDDTaY6gxjTj0+Jh3VhRXNxsi2x7n00kuLY489duan5pCV
+						RL+McWGM7mPOogvHjr9dltf555/feFtYKs8991yx8cYbDzY3CMD89Kc/nflpWTh/bMsttyxeffXVcp5QEeQfEmwBGK4zreS2IWPRdV2Z
+						pnnbFdqIz+VXfE2CMfZ71+31Y7VP9DF9TdA5xqT9huwBmHvvvbfYdtttS+GbNLZlgsXhtDv6qdBO2tvn2RxdQA6QB+RCTAcpMuTvte/7
+						PBgfHCFK+4eYx6EAjEEGgL3zOLkG2YQvfvGLyzhrYwzA0Hd33313WYLqb6/yyanHx6ajurAi2ZgV2bbzHQRYmffMIXuEL/fBvCdIbM6p
+						v6glMM0TJAgILFy4sNQJVGjwBJ1FixYtNwfOPffcysX1JKEdnH1yxhlnZAu294XJh79dts/zYHxw7NuW9jclFoBZvHjxrG1ed911g1WY
+						Yw3AYJewT8yx2LzObUPGpOu6Mk3ztgu0jTY2PVw2F0P2OwEIgqw33XRT6Y/++c9/nv1OqsgfffTR8v/YqtD2eoN5xQHy6A38V7bZT4t9
+						IujOuVixhL/ZhUn5a1kDMHQApZix0/8RBrYwHHXUUcEFTluqrosQ2kn2Y5iUObG20gdVbUUAf/3rX5eLyD7P7ohdF3mgYiImFyI/RL2/
+						9rWvzTqkKFhfEafIEDDeZBTtWrxYYOQaY76Psv4h5jALpNAWJMCAcUAwVS8GB26y3YiMo0G/jmkLEmfRcN/cK+fYcDaAu2XKJUWPtyVV
+						vqYJa9NctjEpMpHLttOnv/jFL8ozK8hKG0P0O+1gzqA3TzvttDLLNn/+/GKttdYqnWsygTh9BGZtUesfUsh5MDfffHOpVz7/+c+X54bg
+						MHMIIjrYPy+GRTFbZt57772Zd7rx4Ycflk61G4yoevEkuu2337504NG5PjitBMJzLXz7BJnA0Xbbl3O7LGOKHAyR8UZGqrYg8R7zlEUS
+						BwUzb/3tuyarY8pkM2848+3iiy8uK16322674CLJ5n0uG5LT/k2KaZq3baBNtA2Zji2sH3744bICxE2g9QHBD/TmBRdcsEwAdoh+J+D6
+						rW99q5wzFoDZZ599irXXXrsMZGI3rb0x35b7w8Zhn/j/H//4x2LzzTdfTnf0bZ+WLl1azjdXT1e92Pp/3HHHVVY4EWDnnmM6fgi/oYps
+						ARgEjbMaGMCYoKHw6UgeO9unIbTrhhaVDDALjuuvv37mnbnJ7373uzJyGTtwjslkwt7nwabudf2sCvLAYp2snz+ZxfBce+215TixF5Q9
+						oS4pMmQw51HEXMtec2GMY1kCDBlBRhxtoK2cT+OXt4/pEF7GCWeW++ax4SzEvvzlLwcPCk3V421pIl/TxFy2MZO27W7g2K8oy9nvpt9o
+						O8ElFyvl5p7scfO2qPXv8Yorrij1AJlD1/nlGty7vwjGfnKdPoNYBvfAPeOU++fUMLbMy1122aXUEVWZW/QjwfYch6/3DXLob5clSDjt
+						WwarDuG9+uqrS7tjNpixCj2Nb2yH8JIMYC7gk1x00UXlOFXpkZw2JLf9myTTNG+bQrCMw7ZjbSNgYP7qoYce2muglG2nXDc0J3P2OwGH
+						r371q8Utt9yyjLzyf6pX0ONudTbzvcq2uNXd9A2VaARj/Iq+XPbJdBL9iJ3ywRaRAGGcaXNoVwWfIehep+Mn5a9lC8DQEBpEw2IgAESx
+						cEr6VHB33HFHsfLKK5cDFyoBJdhAlNBKh+caGFgWv3X7nJk0e+65Z/nZPp1kM1zf/OY3l8lSGmZg/eBMW2gjSoXSb9EMMxb+WKTKkAvl
+						fhgXrmevSUSW+6TqEZ3oFRZbOIboOYwUBiq0NQrZJCNhgZpJwsGgZN7JmJItYTF78sknB8coVY+3oY18TRNz1cZM2rZjp6gm42DRUFA0
+						R79z/yzCbK6HYBGMvrPsIrYVxzS0qLXDD6mmMdAROMdcxyWXg2v3wD3HzqmxwBPOe8hBRQ8efvjhvd4jOjfXWXGMx5DbZYcA2fEDK1Q6
+						ceAulSEsPmgfAZlQMIH+JoDonw0zKZhDjA1nKN16661l5c5VV1213H3ntiE57d+kyTFvxwCyzoKbhXfM76T9J5xwQrmA7zsxRlXkqquu
+						OmsLXHL1OwFI2lI1Fyyg4SYMYwEY7AP6g3mIvqRq0vdrIZd9QhfNmzev1M+xynGCbaw5qp5uR2IRH7fucdmT8NeyBGBQimSBUfw5lGIf
+						UO7LoPVZ9TEmqGogKjjWx94iFyz6MG5VWx+awOQ/4IADRtvesWJKmQU5C3OXtjJERorrmYPL/3lvWqk6Kd4yBGw9oA85wwHnNmSkMMRj
+						0YeW9ca5jZFbj49dR3VlLtqYFdW22zWrtnkAcsxWJMsuYpNwTEOOHwteSsLd36EjQvMBB7fPEm+DLZIEstAFvm7zwVHHEXaddxee/IaT
+						3tejl2mznxDoC8aPQLnZJ170+zRvMWFMQk8jIYj1/e9/v9x+hA2jiim0UML2k4jrexHVBktsMC51i6acNmQadF1X+p63Y8DaFAp8j4W+
+						+x35ZD1FoNLdDu+DHrAKTaCP3CpMH3xZgrYc4o3+D21RzGWfzOZgJ7GXVdB25ij6wk1oGATkeCJd1RZGYxL+WpYADB230korjXoC2KKK
+						hZNfejvtmOOXIyrZJ5aJr3P+UqCdCsA0x8r5/TNLusgQWQeqXlwHF8U2zU+/IhPmH6iIs0u7fKfXxzIJGN1JY3qvLqsAOfX4tOioLsxF
+						G7Oi2nbLyIccPAPbQ1WOBShsIRkKJGD7qKaxigNzEkNZWwKmOQ45RG9RcYP+r3sKBm1joesfLm5YMNp17ruQMwADVtXj2igW22N4Wkob
+						WIQQ/G+7eECuCWaMAZOlOrnMbUOmQdd1pe95O2lsMe77s2Oj736361ExEqtiw464VTno8tD2ettS+8tf/rL82WxZKACTyz5ZojAluIOt
+						4LNVNp82V9kuYxL+Wu8BGJsAdVGrMUAGuG5QphFzlhDgMWN73vtQQhhgBWCaY2XzKB4UkNFVhqw02JxbU6SxCPSYoT277bZbGSU3UOqh
+						Aw196EtkcwyBBu6Ve64z1Ln1+LToqK7MJRuzItt226YZO7SVrba/+tWvyn4ykO/Q9h7e59wl7o/PUyEYOizQHF/Xae4LO6MnxeG0+crn
+						QwtS2kB1UF+ykTsAA1ayb/aJ1zRvl6U92Kg6e+RDyT32fyxbJS0pxIKShWUVOW3INOm6LvQ9byeNVfWNPaDUd7/bnCGxVrWdki2JnAPj
+						fh//Z575gU62plPN+Ze//KX8mQANgU5/i08u++Sez5Myvy0AEzrzFagKojqo7lpD+2vRAAyDQFaAQd1rr72WO3gOBx4BogTfwJBj0GNl
+						TSxEiNZTgsWTOHA+QgLTlOeff748eZr75bp1GV5+z2dzODd9gCNAVoIDxrhPSkypIuDZ5pyvwnuMj19ZYIvqWPmmjS1POSDr5o9tG7hf
+						9vsj6FwXBRNbcNskqzO0KfQVgKEvObfG7h95Rj6RJ/p0hx12CGbJOP+DQ6CQP/6WzzJuOOTu4V7unOJkcv/MGmR44403LkuFDfoVxWHz
+						BbllvvBkCvs+fsdBYlXjyOcpscbR4rO8uCZzl3v1nd0UGarD9mZyHXsNed6Hq2eQSZ7K5X73m2++WWasGQeLsHMwGvfM3mDfEWfhQX+h
+						14j4c74F12ZeVjmvXIO/mWR5e91TT9Al/jxN0eOQS0dx3bPPPruU7SqZ52W6iznGuV/MHb6TcaHfu2YyuDan7NtYc0101R/+8IfyvriH
+						mJ4bq41Bz5guYG5w0CXzhbHkZ95HP7hzYJK23b1u6N58+u53C1bwol3IJrYmdg9ApQs2w5VDy7RRFYdfxdZFzgQI2RVkDR3V1a75mONM
+						e1ho1o1RXQAG6KPQVtY2DBGAAQ6tRJ5sbJEZHh89FK7PyhkuflCchQALKzcTjewzR0O6E/+L7VWpc47PcV5Fzsdxp8D8QK/YOPivn/zk
+						J8stqFN9lFy6zvcVsQG80A3m97g6kPlttoTf8bdm17rQpn0ufc7bvqBPkEmz59ZX6EN0pesXuDBfWEDHtlfbnKO6iX/5uQ/Mt2dsQz6P
+						T5/9bok1vpu+4RxC9FhdMNZsEXPJBVnBV6cqE/vA9kTa55PLPhEYIrbA2KdUoNUFYKx/aKubaPYZ2l+rDMAgPHQsh0/RATQOgXHhJnnf
+						Lcu1jkMhhRQLA4sTi5FAWRFVS8ki18ECh8c/EulCYaI4684XMacidgDdpLB+oqSU8mR+ZiKQsWE/Hs9xf/rpp8u+80u0zEBX7QWkr+rG
+						til2vz//+c/L+8Uoct26klgWsikl0HVgcLoGYLgGExQlbvePLKPMWGCzwMOw+sqdRRpjgCyhyFgMIvsYXowBwRR+Tun30JziXsjMoeyQ
+						a0rXTzzxxOLoo48ugwjwpz/9qfwujJPvqDA2OFkYfh5Lx/3hYLA3nO8KRXzrZCgF2kw0HoXG9/DCGRjC2aMPTjrppKiewfBhAF2lbJlu
+						dIe/IKI9jO/tt98+8049ixYtKh/9NxbMaa3Sz0adHoecOgqZ57tdmWcOcH1f5g888MBycc542++Ye4wt+6Jj7YxBNQLz9YEHHiivQVCJ
+						tuBcccgf98bWNO6hSn+O0cagh7bddtvZw9H5mXv83ve+V/Yx8+Wcc84pD7F3HdxJ2XauxRxlLzq6C/2LTontn++730PVErxwdgk0hg7/
+						A9qOk+8+Ycz60Xd6Q6CjcjylB9kl8UEbUu7D7CGvKoe4L18ChgrAILOh7bLMidxgX9Bt6MkqecUP4J5sQWALCd4LzUPaQ/+HFkshSBqx
+						UG+rI3OAbaB9sUU0pPgouXRdyFfkszwxjMf1oqcYC4KsVLpdc8015e9I8PA7vhe9gay5VbVNads+lz7nbR/Q3/iNdugrP2NTaBdV1dh2
+						/HFss29fzL9hTEKYTY/NuTbg0zKX8RGY1/iPdWeO9Nnv9BE+ENfzX8jYwoULK21IaHt9Krnsk/nmzEHmYgzaztyjrVUBGOZrij8ytL8W
+						DMAwECyMTYhDCpHPWAbFFXZrQNVeaRwR/o4Bs3Kxrvv1EPJ99913dhFp0Wvug/upwiZKarkan2EBYILd5kXGK0VYiV4ec8wxpRI36H+u
+						waKCa2AA+NldQAJjN69ii4GNrZ1HYZHDrs4OSpFJYPdripDvikGbQgGApvQRgGHSWwDDAiEofCYsfUmf8p6rMJE9C774TpsFAlGABJhC
+						c8rt99CcYlzJ8tM/Jtf8HuPqyobNO5QMfWGYMWPB4lefIAMouZBSislQE7hH7p97tlesjL8v0DPMC/rCnClfz4T0GvfF/VUp8mknJHch
+						6vQ45NJR/B1BlVSZ53cEflPmQypcC32GXjNsPjNnmDuuTqjKmDS1MbkhQEU2y9VVdo+2GDDnhxf/N+pkIodtN+fb1V0pMpyj31nArL76
+						6uV3+y/6rirjiYNLn9n906cEc+qSDtw3gfaqxUQXmFvYXcaTca3D+jyWLGEhzIKYz3aFse3qk6TCnKByzx1PbHps8dQHBK6tjeYv4V+a
+						nJju84MMVrVT51tNI64PVBXoM+p8lJy6Dv1vvqKNHdf0fUDukd9RrUEywsV+V9fOKrq0z6XPedsH6CbWc+78QwfSV1RkYF8YF372q+eZ
+						T7G2MucIOAL9jl/szrk2EHTBz7AnZZm/WedD9t3v9Ndhhx22TMLTfVHVHbKF+P7+9voUctonszcp/pvrI/qJPoPxxZ+rC+gM7a8FAzCU
+						ZNEQBJtBJZJHFNfN4lgGxVeAFtWrMp5UA5izSgYLYQll7ZvAddwInjnGddEuJgeTpK2TngsEiGCC29+0DaGgv0xh8y+KyC2hox20p2ry
+						p4xtU7gOVSLuNUxB1kV3kRM+19YIGbS7awDGZJPJaoE2k1WuT4YdhWOGgc9Z1DmUabex4Pfsvazrd2QVmXXnlDteJtehR6WZQ+0rIHMk
+						UCp+NZg5Dr6yqZOhplhQg++yV+iwyT6p0zMEAwgKhIw1f8dCu6r0eFpxdUhVNsyo0+ND6agUmWfxZJUvBu3jPvzATyosMn0n0A/k4Qgu
+						WLCgfNGmEGOzMfRL6DBp+tHukW2R5513Xrl90p2jdTKRw7ajs8hYmu6yeVsnw7n6nf54/PHHi1NPPbXcdufqtKrFGt+PvmOhdMEFFxSr
+						rbZaWTmDTMfsFeNy+umnZ9GTjBP3nLL9l/unH/l8LKtrtqaPwAAyViVnOWBsWLS645lzuyz6Cp8J/cb8YJ4g025Vl+k+f4xsDriJg7mC
+						LabqqqJNJmM+ylC6znxd/7OuPfS3tWE/sXGxYEEdXdrn0ue87QMSLe48AFuQWx8zX+l3O6PEoA1VQeWUOdcGgjpupb8FvXxf3CdXv7OO
+						oNKdCncCtabP0CW0PQTyTvVqE1uTyz6ZfuOeq2yqi/Ujn3fHwScmG8bQ/lrtIbwmTP4N2URHWaI0jTrFZeC8Es3k2n0/HYR7YGL5C0uf
+						sTnHMSyjWBfBox20J2aYDKuAsKhyX9g9IB/ISQzkBHlhzOpgocNn274IAsQCcoY5AXWTlYgxTltVO93IrNu+ujlVtfgPZckM60dXYZmR
+						4f3QQsgcB3MkjCYylArtd40B89P2tefEDbS4eqaqKgbo5zFtGekLnHic+ZQtf6l63CWHjmoq84bvsPVBql1xaWpjqKLAAW3zYj6l3peL
+						9WNdJi5VJnLa9lQZG8q2k/W04HLsu9imeuaZZy535lcVZOII8lQFO7rAPLKSbf6tCzKYnePzlj0O0WRB4TrNbV4EsPqWLc5PYX7bd2Cv
+						YkG+vrBAC1sbXf+kSvcxxwnoDXFvQ2N9UZc8beuj9K3rYr6i6arQwjfmf3QhtX0uTeYtIHch+5PyahPssLFOWU+kLLLBdJo/5/ogdQya
+						9nsb2O7Gusn0GnIdAv2CvkndXp/TPlkVCvebUl2DTeKzVvVVxVQGYGyR5isiKht431+opCoumwB+AKcPqu7ZZ+jO7oIt2usyuqmGiQlH
+						1Qb9xMTrEzOiKcbFlFWVYkiFdnetgDFMMdbJps2Bqiwii1wWu77hMPn0nVnLSoYWlMBcC42Xu+BxsykYFhyZ0N+Y4xAyaqky1ATkzT0P
+						ZohtSFDl3MbmE1VNKY4CjkidQzAmrC9Sst6petwlh46ySjRfflmE+NU2RlWFWVds3voByxhNbQzjwplBd955Z+MX+sZdqKXgBihd3RFi
+						DLbdZKyuP/uy7cyZW2+9deanMKa36zKeY8ECsamyTFaRz6JDcY6r6HNBgYw10T19QCaXRSJttfHMscDwMfnxq2ir9A3jx8HxsfMNgXvn
+						3KRpkEmjbxvikkPXxXxFS6iFdBD+Bbara4WgS5P2uTSdt/htIfuT8rJtOk1o4rekBmBs0d4kUJWCjUHIL/HpQ18iO1Q3sQ2qCtP33NM0
+						BG1NBzDmjH0M8/UYy6otVsbUBWBskebftDm/NNp3jK3z6sojzajHSobaYIo5JVpqnT0NjhOGIEVhmAIgghhzljDefCa0NaUrKB4me0qm
+						mHb1oRgY974CMMg0fV1XBcGE5nNVWURzrNyy7TZzCqr+DmzB4xsoPsfnQwfVWXAoZNRSZagpZH951Cr36u+VzoWNpZ9FtLNKfD1FX7Cv
+						OpZdp7zzG9/4RpKBGBPMsdR5marHXfrWUTGZj2UXCboQfIltl2hKE7viMnYbk1pRApO27ZAqY331O99XFRA3bLFW97mxYGcPpMiyHTyd
+						cni62Zs+qgfpd15D89xzz5U6ZagEgdl9f8Fm502FbDfzMLZwx4Zx4CtjNi0yaVhSqy8b4pJD11X5F2C6yh8Dxq2v7S8uTdrn0ue8zUGT
+						9QR9HZozLrZo7ztBA03GoI9+5/toS8wPNd9lWvxV881DQU0fe9hCir5mvVZX/T20vxYNwCBACJK/SLPMeqgxJlSxqB5/TzbFSoaYVOxl
+						7CM6h4AhaEwChDOGLUJTI5Dcp2Vk275SDuElE3PxxReXE4vTzc3YIGgYBgOFz5Nw/EhrSqTPIsCWdUEREcDoQ+hQglw7xYFC+dTdawp9
+						BWDoi6qsuw/3zudCRtqqUjAc7v7fqjnF/3nPlCQ/IyumVGxehZSSv+BBRihRZkz5GxQKisXFHAcLHuEIUPlhpMhQE/gOKmBw+od4CpJh
+						TpAvi7SP9/25Q1k7v6sz9PRfioEYE6nOLZi8VenGIXRUTObNQQ5lKnz9w3fzGFPKcVPBqecJFugCrm92xZ+32BiMdZWtaWpjcsNYsbDj
+						LBJk3PqRsWMMDfoefer2bZ1MQE7bbjLmL1ZD9NHv3DtOf9184fc489NSDWeJgVDw3cUqQuhvdDdzOYYFeFPsfh1co4/rNMHaO2SCwBZH
+						vi60hYBvuxkD9Fvdwt2C13U+zNgwnyrlvutsSG5dx1hU+YoxXeWvUfheniTTVOa6tM+lz3nbFfqUBNcWW2xR8MQoMHvuV4KFdAQ/89mY
+						ffCPXiABzSG0boV0W2J+iU8f/Y7N4RHdsWCPrTlyn7vYBzZvGEPGPQZrIwIvKfqadS1+Wmgt5DK0vxYNwJiiYJK7Z1JY1CnkGJvh8JWA
+						i68oeIQjhyPZBEBIKCfiMzg3KJdUmHgIdUq0FOFlQdgkyzsEdl+0g0lqFQ6+sUHoQoqDCU3f0c8hTMjtMyg9niLC0xoMosSUufLoYl/x
+						xTCHIiW7Zg5uH5UWfG8fARgLhISCiz5WRuorCvrzlltuKX+HQ+cqvao5ZUbB5hTjwgLQ5pA5zX603Prbovn0qe21tbb4Sofr2yOouS5/
+						g8Jx99TXyVBTCLogv0MGX8AMsmvkLDjmG2reJzOVspBiHPyxGDPIpJ37kLKvtk6P59ZRUCXzYOPqL45N5i0AwNyj3SbbqXqN63J9O4vA
+						AtauY0WfsjBFd/L/EGOyMaZvaQfOCO2wbaiuw0GfcSCfaw+gD9sOyB/9z+G6MWfIh8VKanYx1u+p32/fZ456CKsQ8fX8UDDXeKISTjgL
+						sjpcPRDzkWgvMkAfIvtV8u1iQf0+norB/HZ1dm5sLlPZOFTwBcx++3rT9Klvu5lPLNb9Q8d98F2QyzofZkzEqn5CxGzIELou5ivGdJXZ
+						NUt+oqu4V9MxKfqpa/tc+py3XbFgAfdDwgi7QUCfn11fjcU3VV7+4//5DJ+N2XY/oEPfuDa87doHqvySEFX93uT7bQ64vrsLbVqwYEFS
+						kKJvkL2m63jWgawH6ZdYEJatbDyhdv311698+qCLBaT9dZdPld/Qpi0pRAMwKASUPYNHIxnMBx54oPyZDgpFqCzSFMuumJN20UUXlR2D
+						sLkLM1N+fAeLVRatqdjkYvLWwQDbAmJMIAQYIQ45og9x7lBCLLItKoyzte222wYnlUVWqyYvBgXDwlMccCAJqPEzE9/AGcApoC9jY+lj
+						AQZbuMSwSYG8dK28wQD1EYCxdoeCiz58J9tVmA+PPvro7HsoPJTn/Pnzl3PKQ3OKMd14443Lv0FB2JibYuYzVZkWm28oLZ4S448lBwtS
+						imyP012yZEn5HTyBw66HDLHIcMe/ToaacO+995ZP/kjJovYNfUxf02bG0w4lQ0bpF5wVxoQXFUQ86apOuZrcxgzE2DDn1nf0q6jT47l1
+						VEzmTX+FnG/uhXvGfuAEEwgl82PObapew8giN3fccUc5Z2gHc2zLLbcs9RpyRKbO15s+Y7Ix5rSjmxk/dNamm25azoX9999/mXkQeiJC
+						H7YdmHP0P68m+sWunWIvYv2e+v32fbwYa1930T4C2dxPTAZyYg4/r7qMIbgLmpCPxNzC7hDQ4bV48eKZ39TDnElJXKRAu3gNAeNKkAm9
+						MHSCAOzwX/wF9ApyhUyxEEX2OBiTe2Rxgr8Ry+4byH+KDzMmbPHlB52qiNmQIXRdzFc03YF98AM43C/zD/nmcG7G1GwmpOinru1z6XPe
+						dsWt1sB+46tR5YCvZjrY5kFortIG2hLThfzOEpb0G4Eud/tKqo/gY35Jqr2v6vfU77fv43NUIiNLLvQVfv+k9FqbdbwloqvkEftF5fWq
+						q65aHHjggcu1uQqTq7qkaZXf0CUmEaP2EF4aeOihh5YTgJPnycrz2E9uxHeMDSY9g17l6KMMED6uyfPxERLXueH3RHNXXnnlYvXVV0/e
+						p2eLDCJYKKU6GAwUft9noHTF7R8W5QsXLiwX9hhqFj2Mw84777xc9New811YwLj96sLCFEeMa6HsfEHGofzBD35Q3gOR/Lr9dQbOG7KB
+						cahbyJqyRF660lcABoeTPufwuhT43nPOOWc2KMnf0u/sJa/Cn1P0P0aFSc984L2zzz571miiaJl3nJ/C53yQdYJpobHkGsiN3R9jThsZ
+						XzKh/A2Oth9FTpGhFFh8892TyhADjyrcfvvty7byot04TBglAl8oXJtnoXvEKcYJJuJOPx922GFl31Rl6bgGgQky01dddVUpCzzSGAOA
+						sSdQxoF0LOYZ7+OOO650lqqou15IJnzMADVxKGJ6nHvKqaNiMo9zzMJ+q622CgZ53XmEjue+jFS9xndy/8iGLUSZV8wvrkub2bJXt/Ae
+						m40xvY8uwI7zyG9ezA/aylxlLKvksattBw4ZRkaw0010P1kp7pvr1xHr99TvZ4GEDDD3kQHmP0Fk5i6LZRwxnHlXvoaGcUAnfeELXyir
+						90Lb7JhLyC19V/dCrgnGs8gP6cIqbMHKQqCPYNSQARjsALIwiQQB0M8EgOyx5sjmokWLSn/W9S1iATH0FVv90GnMZeQWO1fVHsbooIMO
+						KtZaa61yPnBdKgqwMdg3zmvDB2JRz3sshGP+HHKILUWnELC2v2WOXHfddUn9ih/DOISCFiHqbEhuXRfzFbFB/I4DUn1c34sxf/jhh2d+
+						8z9S9VPX9kHf87YrjCOybz4E+pV5gB3BntAu9DB9Fhpz2kBbYluAzLavtNJKZV/RZy5co83ah+sij/hZftWTT6zfU7+fABXjz9MP6Sf6
+						jK3WrMt5EaQi0VqlM3KDXktZx9P/pvvqXugrAor4lSk6xWA+Mt51Wzer/IbUtjQlGoBB4fpK15ReLArEzRFhxKh0gQ4+8cQTKxWgj1Vf
+						pCwy+D2f6/MU8rFAe2gXk7OuCqUOlB/GIiUjASgCJkpKZpPPWCS6KyjbPgIw4n/0IUMYDrIXLJbHYNzbgA7CMcdgE7RBEbOYiekYsl/I
+						NsFIIvXMCZwgCxATVODJKlwbecWoxALGZAW4HtUYoeulLFQsI9ckoNaXHg/Rp45qQ1O91oa5aGP6lAnmR+oi27J9yDCyHCO13+u+n8Ct
+						6S0CG/gXBF+YuwQo6raBDAnBYBzESWG+V5OAWoyhAjAkCNh2RHl5k4DTmMDOsii1ShlsCXM05oNRTYBdIahJdSqBc2ScRSSZXgL7Ns+Q
+						9diiEltEgsAW/wRerCKYOZZa0cL94jumjntuG5LT/qXQRD+2pe95OwZoC8EntvF1oamPEKt68knp97rvJ0j6t7/9bdafw1axQ4XgC4EG
+						fMtQQH5ouL8m6/gc4DNXJWSMFL+h77ZUBmAsc03U3RUAlBFKMvbIJxQy2yy6RlXpEK5DpC8Ei24UrwkZgse9pSwyMD5UX6RUykwjlDQS
+						Qa+L+NWBomDBGYom0+/0v/2OBSrlmLGsgYGMkGnj2n04Pow3Tug0ldyOnS4yxLwnws9rWoMvgH5AT7h7bFHmsQqvBQsWlPOGTD2ZKVuo
+						WcCEhZLpJ2SW6H9szzsBIKpeqq5X5aSxgORe2QrEdgOyR03Gsi89XkVfOqoNMb3WF3PRxvQpE1QQVW2nwCZgW8yRtQouMoJV/oCR2u+x
+						7582WLwOEbCoAr9wnXXWafWY2RAEFZijObEEwSSrM7tiCQH3nCL6jaBHlfx/+OGHZYUtfj12zPXBLIvvBh0IBPAevwtB4AWbxxxFBtyH
+						DsQCMNhAqj04f4Nrk31umpDLaUNy2786htBPfc/bMUB1BFVbXYNKdT7Cf//733KNar4c/llqADGl34fwUYagbh2fG/wIgrTnn39+NC6Q
+						4jf03ZbKAIwJk3uWB1tGEOyURz7ZwgUF3Bb+lv2QoYUO348zxqKC8iKElLIzyrbc/ZQhTLG6hmeuYYY5VoqXAsqkSpFRhk3/s8impI7x
+						IgJM35ozUAVjRIBvLi1O5hptZQhnhb9L0RNjBmVNMNetdqkLehhWYkoAxMCgYnTdDD4Lp1g1jZF6PQMDgX5Eh+Mck0khS2q6PJU+9HgV
+						femoNsT0Wh/MZRvTh0ywDQ95DmWksPeUUiO7LMqA8eJnFiUxJyq132PfP20wd8i69lFJ2gZ0CrqlzsEdEyz4SRYhA9OcICDRRcLLrXZh
+						XqYEKhk3/Hv3b7El2BRsi4HNScnq44ezPkj9W7t35jXJU+xqiu/oktuG5LR/MYbQT9M4b1OgLbSJ7Tf0Y1tiPsLjjz9eyi6ywbqY7+H7
+						kP+qbddGar/n9lGGgrlTtY4fAg5ZJoAcC3al+g19t6UyAINSpjTRSgnJoqKsEZyUaCmChePvZmybwMKNMkj/bAoDZW/7HLlH9oURCSey
+						WKdM2E9IRIwDaOcytA+l4GYkmkAVFEYxlPmwBSHGk2ooDBXKB6emKlNi4PCwpQP5mEuKfy7SVIZQXmQUmZfITy6GcFCsoovghclpLOjh
+						Yhl713Hj/24pN0qcDGRKxV7oerHgDfc5b968soKRv0F3tzmIraser6OrjmpDTK/1xVy2MV1lAh2BvWbbQwhbGGLfOcOAbT+cp4BzVLdY
+						Tun3uu+fJhgLKuRSDg/PgckC9nxaAhncJ7Yjd4KAAIN/uH3fELjg7Ae3gjI1YILPxrka7t+yJck9VNZsYF3CAUjaoldJUgB2CfvkBnhc
+						bMsGfj4VLDzRpI3PkNOG5LZ/IYbQT9M4b5tAm2hb2+BSnY9ARTFrH2zN008/Xey1115lQIYHTsRI7fchfJQhqFvH5wbdwBjhF8RI8Rty
+						tKUyAEM0+fTTTy8dHxwhDthi73OT6DTXOPLII1uVeN54441leWIVCC8LI0oWMUAYnLrII9DBHN5YN1HmChg5HI2mho3tRZw2HdtKRBCM
+						8yvof7ZRXHPNNbXyYQooxZkW4yBVhhhbFgOcU+IGCvrGZKjJeSZtMAfSdT5pF9F0AkA4q+bA43S6+22rMok4/gQvwYI5lD6SvWZfPnAd
+						c2IN/3rMM5x7rsk8uuSSS8rScoN+4fBD9DeHhmFg2vZVFz2eQlsd1YYUvdaVFcHGdJEJkjk4xlV/h5xyECPZRWwLvgfnH7nzK0Rqv9d9
+						/zTB1g78tEnZUuYsZ1pNysFuCmOOzJIsyqkD+B7sU0rgogsEN9wtPlahiW3iDAw7IDZkU/wAPgEb/GirOgN0Mz4e12IOcsYE0D6zY2B/
+						iz0ysGtms7Cb/mGgXIOtUBy2GjtgOIWcNiS3/fMZQj9N27xtA3LHeURNE08pPgKBEYIo2CdePEghJViS0u9D+ChDUbeOzwnz55hjjinO
+						OOOM6FxK9RtytCV6CG8foLyOPvropMdC54YJwmMjUdYrEpwYvttuu40imopDgGOCXIjpoU6GWDTxyF+yADkfeWffQ5AHBy8nZJQ5kMvO
+						bKHyDyePcmf+T3kon8FxZOujuz0CB9jNBiLv/B1ZQoP93QRzqG4566yzZgPIGAx/K6V/PRxuHG+uQRaUV05y6/Ex6agurEg2RrZ9xYbS
+						+x122GGZIPOYwQlHZ3Pobs5gr33PvHnzOh8EWgeBe+wA38PCjcQECVPsws033zy7Lc23KdgpEphudadtW3VtCQEk7A4JBw6fpDqNYAtP
+						qHO3WzD/sGVu4oW/xeaRpOBvc1eQ5LQhY9J1XZm2edsF2oj8jiHQtCL1+6TBLycYTCArFnyZtN+QPQAjhJj7DPEoT6o+zjzzzDJjlnJu
+						Sh+goMl0kMnbY489yqAP301W3jIUGFZKualuscAi2TLOSDJwXPk7N8vHtTngDwfajawTpMRZZvuF4V8Po8LWP5xggjoxIyOEECsy2CRs
+						EzYqZ4KALTtU9xIESdkG1BXaRaUj9oltezySluAPW4B5MpHZBd+mUL3CtgmqWgzsGDaKA98N3uO6VBKYvWNxQ1KC7UOWNeZzZJHdLV18
+						nqAM78/lSgshhGiDAjBCiE6QTcThy1WiS1CD7TkEOdh3y2tsh5Ph0HJPFoDpCg5t1d55IYQQ6eROEFDdQYCc7QjYJwIwk3i6W4y+bQpb
+						0FeUrfxCCNE3CsAIIVpDxottOfvss0/rcwgI2lAhwrYaXgR0cF55PCWHflPxYoEXXk0fVTkES5YsWaZCpSuUgWs7hRBCdAN7wrYjKgbb
+						JgioZDH7xItKRnQ0h6Xa0zhdG0Xlx9iesNWnTbGKIm2nEEKIdigAI4RoBeXW7J90Hc8hXhw+2+Qw8Nzg1FP9kvJ0uBQIamlbkRBCdMPO
+						7QrZkZwvOzdsLPRtUwhq8ZSeMbVRCCGmCQVghBCNodrFHkM+9ItHzY+J0BMmukC2NffZAUIIMZex6syQDcn54myU3AfEN6Vvm8KWWyUI
+						hBCiPQrACCEag/Plbhsa6sV3yvETQggRw982NNQLG0VQXgghhKhCARghhBBCCCGEEEKIzCgAI4QQQgghhBBCCJEZBWCEEEIIIYQQQggh
+						MqMAjBBCCCGEEEIIIURmFIARQgghhBBCCCGEyIwCMEIIIYQQQgghhBCZUQBGCCGEEEIIIYQQIjMKwAghhBBCCCGEEEJkRgEYIUQn7r77
+						7mL99dcv9t9//+LVV18tzjrrrGKjjTYq1lprreK+++4rnn/++WLfffct1ltvvWKrrbYqlixZMvOXy/PRRx8Vp556arHGGmsUV111VfHc
+						c8+Vf7v22msXO+64Y/HKK6/MfFIIIYSIU2dTXn755eLOO+8sttxyy2LNNdcsjjvuuOL999+f+evlefvtt4uDDjqotG/YvsWLFxe77LJL
+						ef3DDjus/L0QQggRQwEYIURr3njjjeK0004rgyoEXbbbbrti6dKl5e8uvfTS0kk96aSTinfffbd87brrrsW5555b/j7E/fffX9xwww3F
+						HXfcUay66qrFKaecUjrD77zzTrHTTjsVl1122cwnhRBCiDgPPfRQaVPuuuuuoE3ZfPPNi1tvvbX49NNPiyeffLK0WY888sjMXy8P18JO
+						XXTRRcVqq61WXHHFFcUnn3xSJgdIMtxzzz0znxRCCCHCKAAjhGgN2b9bbrmldFjnzZu3jONKAGaTTTYpXn/99fJnc3h5v4rLL7+8zFAS
+						pNlmm22KN998s3xfARghhBBNWbBgQfHss89W2pSTTz65DL7AE088Uay77rrlvyE+/PDD8jrYtIMPPrg45JBDygobUABGCCFEKgrACCE6
+						c+WVVxabbbZZWREDH3zwQbHffvuV2UYDJ3idddYpM5Ix3nvvvWL33Xdv9bdCCCGES6pNobrFtWNVEICh4pPPG1yH63FdIYQQIoYCMEKI
+						Tnz88cdlNvCII46YzSRaNvC2224rfwb+v+GGG5bnxMR44YUXyv35lIwbqY6xEEII4RKyKfyf9/gdmB076qijZu1YFVR6cl6MWylDcIek
+						A8kHIYQQIoYCMEKIThAUITjiZwM5lPCpp54qf7aKGMq933rrreK8886bPeiQUnD20Bt+JpHP7b333qWDywGHl1xySVkKLoQQQtQRqk7B
+						nlAVQ3UMWEUM57s89thjxU033VS+j23CRrn4CQEqYthuy/svvvhiWREqhBBCVKEAjBCiE6F985zVsvXWW5fBFnjttdeKDTbYoMw68sQJ
+						c245Q2aVVVYpD/K1rCNnxLA335xe+1v21lNF41bVCCGEEDF8mxI6EB77QtUmFTE8ye+ZZ54p3z/jjDNKG/Xggw+WP2OnqPZ0Kz45vJcn
+						AfIvZ87EDvEVQgghFIARQnSCR03zGE738ZvHHnts6bgalHcTZKFsm0eC2sGFVMjwHplInGI4/vjji/nz55f/Bz57wgknlBU1OMz2t0II
+						IUQdvk2hYoUKFhIABttmeYofwX4eL21cffXVZQBm4cKF5c9UzOy1117lk/oMbB+PtiYRsWjRototTEIIIVZsFIARQkwUHNoLL7xwNgAj
+						hBBCjAUSBe4WWyGEEKILCsAIISbKkiVLlslOCiGEEGOBba/u05KEEEKILigAI4SYGGwnovpl6dKlM+8IIYQQ4+Cll17S1lchhBC9ogCM
+						EGJihJ4wIYQQQowBnuCnR0sLIYToEwVghBBCCCGEEEIIITKjAIwQQgghhBBCCCFEZhSAEUIIIYQQQgghhMiMAjBCCCGEEEIIIYQQmVEA
+						RgghhBBCCCGEECIzCsAIIYQQQgghhBBCZEYBGCGEEEIIIYQQQoisFMX/A2IP+9+ZsJeHAAAAAElFTkSuQmCC"/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="480.013" height="31.7151" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/equ_dual_self_guided.svg b/doc/img/equ_dual_self_guided.svg
new file mode 100644
index 0000000..c936f46
--- /dev/null
+++ b/doc/img/equ_dual_self_guided.svg
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dual_self_guided.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.89143in" height="0.748518in"
+		viewBox="0 0 208.183 53.8933" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="171.433" height="17.1433" class="st1"/>
+			<image x="0" y="36.75" width="171.433" height="17.1433" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAAZAAAAAoCAYAAADQUaxgAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AAAv7SURBVHhe7Z35yw5dGMf9A37xm5+UkiRJkiJFhJAtW9lly5Z935UQsv0gITsRRWSJ3pdEsmXf9zU72bfz9jnvnMc8Y+Z+
+						7plz7u25r09NmGee+54z5zrne53rus6ooARBEAQhASIggiAIQiJEQARBEIREiIAIgiAIiRABEQRBEBIhAiIIgiAkQgREEARBSIQIiCAI
+						gpAIERDBmq9fv6p//vlHffz40TsjuOL06dPq5s2b3r/KP2JLmeP58+f62f769cs7Y48IiGAFA33y5Mlq8+bN6vfv395ZwRUM+t69e6sT
+						J054Z8ovYkuZ5cePH2r+/Plq+fLl+u8uiC0g3ECFChVCj23btulrvn//riZNmhR6zerVq/U1hcidO3dU/fr1Q9vVunVr9ebNG33dhQsX
+						VLVq1f66plmzZurp06f6mvIARjhv3jw1Z86clAZZbDaDHbRr1y60LXXr1lW3bt3S1z169Eg1btz4r2tq1KihLl++rK+Bq1ev6s/jz0KB
+						lcSWLVtKxkv16tXV0qVL1efPn70rSiO2FI5rW0Kkhw4d6kykE69A7t69W2IcEydO1J3mh5tD6TCcjRs3RhpOIULbFi1aVNJB586d837y
+						hwcPHqgGDRqoXr16aeEpjx7VkSNHVNOmTdW9e/e8M6kpRpvZuXNnyWA+cODAX3bw9u1b1bVrV9WmTRsdrgoLL/A7q1atUp07dy5xUvKZ
+						Fy9eqEGDBpXqw2fPnqlu3bqpMWPGaHEJIrZUNi5sCXBweXZcY0tiAfn586eaNm2abkyYZ228pjNnznhnyheoOuJB+4NejfGmZs6c6dRw
+						nzx5kjfxcCYyJjQmtqAhR1GMNkMbaSttpu08AwPPDU9w4MCB6vXr197ZcMznbNq0yTtjB9/NRMKk4xI83FGjRql///3XO/OH2bNna6fq
+						/v373pn/EVtKD1e2ZJ4dIm87P1nlQE6dOqUqV66sG4Q6GojbooRhRlRewIsaMWKEbnuHDh1KPEPTkSwTXScCmTxcTSC20N+1a9dW169f
+						986kR7HZDPZgVqt4fXjOBtpKm2l7WZjPCZssk8DEMW7cuNj9VxbYJx5/UAj494wZM0qFXQxiS+nhypaAZ1elShV1+PBh70wyrATk3bt3
+						qlOnTrpBffr00ROm6xhbPrN7927d9ooVK5Z0RNyOjEO+CAiTD95LEg+mGG0GL9hMdGa1miSvYT7n0KFD3pnkZEJAEDacqrAwG+dwtPi5
+						P4QlthQPV7Zknh25Iv9KJi5WAgI0gsYQzjl//rwO3XCkSoSVF4JLykuXLsXuyDjki4Aw6eAxrly50jsTj2KzGSY1Jjgz0d2+fTuRh0xu
+						AXuzHfSQCQHBoQqGc4FY/Lp167TXHBwbYkvxcGVLZkVI4p3QeFKsBcSfzKpatWpGQjf5in9JWalSJVWrVq2MLptdC8irV690otH0H0va
+						KVOm6IkqFXjAXH/06FHvTDyK0WYIsdBeVqtU6CXxkL98+aKGDBmiPUc8SBtcCwiCNmvWLJ1XQUhM/5LEbtSokWrbtq26du2ad/UfxJbi
+						48KWgKo1VjM2eSJrAfHnAqiiyEToJp/xx2HXrFnjnc0MrgQEY2NDEYMbL+T9+/f6PMJBpQyrKNOP/Izkpz8sgbeIWIZNCOlQjDbjn+im
+						Tp2aeDMXfRGWR4iLawF5+fKlrrBC2Pbv36/at2+vmjdvrr3kvn376raHrczFluLjypaOHz+uP8OUPyfBWkBg+/bt+kb8uYBMgrdDvTjf
+						mfQYPnx4qVhsUqh4YMLlMxksmfR+XAkIqyRWG2E19/Qf/cjKCqFBIMePH6++ffvmXRFdTROHJDZDf+3atUstWLDAO1M4cO/Dhg3TbbZJ
+						hK9fv157nXj6NrgWELxY7CnMEzb5j7AS3lzY0sOHD9XIkSN1PzRp0kR79IUU8nJlS4g24p00fAjWAoJXwRLVeOEu4rOFAkZHvNVsGnQx
+						sFPhQkAoA8Z74cCTCWJi0myMZEVC+/yDkomHhGeLFi10CCwJcW2GPTUMmB49eqiGDRvqSaeQYFIlzIB9MMnR5qSJcPqf38d7tMG1gHBf
+						hK7CwPmgtDcYb8+FLWHTOERmpYIzZRMGyjYubcmMdZvxZCUg3ECrVq10J0SVl5VXEA88YTwZszGH9hvPPQmmQ/mcJAd5GFYMUXBfpp+i
+						BhoDmQFNmGTPnj164jYhLrAd9DY2Y747icGfPHmyJMEa97BZVfPMt27dqkODTHZ44rQ5WI2ULnEEhHJark16sLIOq6gKgkBMnz49UoxM
+						v2Hb/mtyYUuInN8JM/fgf5NEWeCEhdlJOseOHTsSO9iubcnMNzkREBTcn/1PtbGuvGG8AJOwwyBSbWpyBYbvN/64MEAYKNxnlLdoBAQP
+						hxg2O4T92Ax6W5uxERC+m7xPkuPx48fep8SHtvrLuv1VQ/5XTKRLHAFJBc/S1QqEVcXYsWPVhw8fvDOlMSWjwVBVLmwJ22HSvHLlinfm
+						z7l0nwXtOXbsWKitlHWQu0rqYLq2pZwJCJNmsNYaBTTJrJ49e0YaEzdNgg0VpRO4liUoccxCIdiR4E+mu6jTD8NWQIzBpEpaGgGhHXhM
+						wdiwCUcglGVVa/mxsRmDjYDkArzEYFm3PwGaJPZM/xO6YEVlg0sB4V7If0TBCh2HhL7jew25sCWeH/k/Sn7B3EPLli11IUC+kglbMvPB
+						3LlzvTPxiS0gdBRfGDa5mPIyJtKwUAqdtWzZMv3ir/79++sHwkMgDBQ0rlTg8ecqiR6118MkCvnspEvKsrAVELwfQlOpBqwRkFQeWVyP
+						zcZm/BSSgLBq8XvIBv9Eh72kGzYx8BzjPPsoXAoIk1dUnzDBE15C9A4ePOid/UO2bYn7oRzawPcyJvxilG9kypYQfvrFZk6JJSB0GB1H
+						B4ZNkMGNdcFYH0vdDRs26D+pfiC2jCFTihYVUsknjBeAiIRhu6QsC1sBwXOjUiyq6oX+ZSDhoaVapXAPDNJ06sdtbcZPoQiICa8Q/gub
+						lLB7Bi5HnPwKz4bcVVyPPQxXAmL6JKoCkUkPewqb8CFXtgR8Hq+Pj7q3fCBTtgS2e3AgbQGhcyZMmKDq1KkTObHQYXQIN5UqmYXyISA2
+						OyCzCcbFw65Xr57eNxHlqbBUN3FYm2R6FLYCAsTOGdD+QcjKECPq0qWLWrt2rVqxYoVuA20+e/as9nL8k4PxXMoSfZc2A/kuINTj84ZT
+						3obKCvvTp0/eT0rDczGv3o6zWiUsQ3iGzYR+LzoJrgQER2TAgAF6rwcOlGkLn8+KAEcEe4pqY65sibHJ6+YXLlyYl+KRaVsCVo62e4pS
+						CgjVNxgsNxc89u7d6131P9TnUwUUvA7j4C2bfrjxOCGrXLFv377QNuER+CuTWDryNtHgdRx4RDZJWD8uBAQuXryoBzxto3/YgMWOdJPI
+						RCxGjx6tf4YBB98AjNHSrrC6/0zZDOSrgBBPZ1NmsB2UHLPnwMCkT7I5eB1HzZo1db+kwoQgUyWJ08WVgOCQLFmyRIsb/1eHeQ5M9NgH
+						JdipyJUt4TAxDxnxoG/yQUiyZUv8Po6I7d61REl0G0zSavHixd4ZIV1cCYgteHqECJLEXW3IVwHJFnjprsKjrgSEcWxTNJILW6LNFO0Y
+						weAeEEHbsGAhwcqRULatM5J1AaGT8Dhs4m7FCvFQmx27LmF5TWVN3LirDcUsIIQmCFEQonHhKePtEwoqq/ItFfwuBTC2NplNW2IMdezY
+						UYfQKVPn4O/YVVSYqDxCeJF220ZHsi4gLMO7d+9eqgRWKDyYxJjMshGKxDMlDozBE6Ygh4MTQvGFbS6gUGCSJa6fyTcdxAXhIC9hO/Fm
+						05bI/YWFfYrpDRom5B7nP/CKIusCIpQf+G9KectqWHmm4A5i1CSq2VluO+BdQujKVShabCk7YD/YEfZkk/swiIAIVlB+GbYvRnCDGfB4
+						6C4GvEtu3LjhNJIgtpR5eLYUAbn6r7FFQARrKPUdPHhwUSUhswXJXiqUMh3ayRfEljIH6YN+/fo5zaOKgAiCIAiJEAERBEEQEiECIgiC
+						ICRCBEQQBEFIhAiIIAiCkAgREEEQBCERIiCCIAhCApT6DyBwOP/MSHc/AAAAAElFTkSuQmCC"/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="171.433" height="17.1433" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/equ_dual_self_para.svg b/doc/img/equ_dual_self_para.svg
new file mode 100644
index 0000000..d294bca
--- /dev/null
+++ b/doc/img/equ_dual_self_para.svg
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dual_self_para.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.99855in" height="0.813996in"
+		viewBox="0 0 143.896 58.6077" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="107.146" height="21.8577" class="st1"/>
+			<image x="0" y="36.75" width="107.146" height="21.8577" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAAPoAAAAzCAYAAAC+CxVBAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AAAsuSURBVHhe7Z3pyw1tHMfvf8AbL5RXXkmSJC+IFBGKkKzJfqPsW1kiWbKFhJCkbKHsb+xZEonIEiL7viRk366nz/XM777n
+						zJmZc86ca44zZ65PTY8z59xnZq75fX/bdc15qpTFYql4rNAtlhRghW6xpAArdIslBVihWywpwArdYkkBVugWSwqwQrdYUoAVusWSAqzQ
+						LZYUkGih37x5UzVu3Fg1b95cderUSXXs2FE1aNBAb/ybfa1atVL169dXly9fdv7KYkkfiRb65s2b1bZt29SfP3/06+fPn6t27dqpBQsW
+						qL9//+p9Hz58UBMmTFCPHj3Sr9PKpk2bVJ06dVTbtm21A+S/vBYnydaoUSPVu3dvPWZJ5PPnz2r9+vVq586dzp74SFqQSazQv379qtas
+						WaM+fvzo7FF6QBnYY8eOOXv+/9zChQvV+/fvnT3pAwFMnjxZ3bhxw9mj9Bgh9HPnzjl7lLpy5YqaNWuW+vHjh7MnGVy/fl0NHjxYDR06
+						VItv+/btzjvxkbQgk1ih37lzJ8tzb9myRTVr1ky/J3z69Ek7hG/fvjl70gfjsWPHjhoDhJUrV2rDxEAFPrd69WrnVfJ49+6d6ty5c+xC
+						T2KQSazQiTruyPP79281c+bMrNQTj/vlyxfnVXnDuRJh37x54+wxw969e3WqKTAe1dXVavTo0doYhWvXrqn9+/c7r8zy6tUrderUqZoI
+						GAelEnoSg0yia3Q3eM2uXbuquXPnZkSupPDr1y9tFEuXLtX/jgJiOnDgQIZ4AWG7BSZpJlHdjdd5FgrnfebMmYwSQeA9ro1rjHp9uTAt
+						dMbx8OHDWal3EoNMWQv94cOHauTIkapu3bq6nuzTp4+6evWq824mRKOGDRuqgwcPOnuSA46J1JoISz0dBYySv8fQMfgwLly4oMcTUZqE
+						7yN9DRIa1zZ27NisMsIUpoXO91RVVWX0MfwoJMjgFHB27oZoKWy2LIXOYJFuIvL79+/r13jHPXv2aDFfunTJ+WQtu3bt0u8h+KRx69Yt
+						1aZNG9/ryhciOUbTokULde/ePWevPxs2bFCtW7c22iTC2HHECGPRokXO3my4Py1btizqWoMwKXSCDPeE68G2wigkyGDLZDyHDh3S49C0
+						aVN1+/Zt5934KEuhnz59WneJvdGN+oc6yJtySuqEV01ad/379+9q6tSputsdNaV9+fKlvnaM0lsneqFeHDNmjBo0aJCuIU2A8dKB5vhs
+						8+fPd97Jhns1Z86crP6AsG7duprpvrBt69atzl/UYkro3Ae653I9ub4vSpARWy7VdGbZCf3169dq0qRJ2ni94PnwgN6IwUAxYIgdQ0oS
+						Fy9e1HOvudLDIBDZqlWr9HwuaTNRndQ8CKnPw6JuoRD9EB+GizBwJGENKLnmEydOOHvMYEroZBuMZ5MmTfT1hI1V1CBDd57vdk/HxUnZ
+						CZ2FHaTtfrAfY8ZQ3IgDoPOZJLjB3OguXbqot2/fOnsLg7QfIyNtFKGFOQ2/aaBikOi3bNkyHak5flC0FuJyzCaEThbJ+WOH/JfrCctQ
+						olyL3He+u1Q9pbISOoM2ceJE9eLFC2dPLU+fPtUGvWTJEm1c3BBSXiIJK5AYNFmllJS5YMSNyKMavKT9pM1kQqzIYhzCDB1naLIuJPph
+						6GRgfDfHR2yILggMncaVdx6/WEwInV4HAqes4Ry5njDHFSXIiHPIVWaZJKfQaYKdP39eDRs2THe/ufAOHTqo48ePG58TpcaZN2+eNlrS
+						P+m2051kMNeuXauNu1KQJk7UTOTo0aOqX79+OmUUI/cTOlG/Z8+e2gmyzFXGlNfFLM9kwQir0RAHcNx8hA7UtWQWJpaH0rDt27dvTReb
+						a4zi8J89e6aDiTQKieR+Qi82yEh9PmDAAJ2loifOO2xWqVhChY5XGz9+vO4OUvchbLzxvn37VL169TKmSY4cOVJzw6OCoZDKYCTTp0/X
+						A8fAM6gMjETzSkHqtCjTXIh74MCB6uTJk/q1TK/xfWGppkkk+knTlJKB4+cTqeSzuTrapQI7ptdBSi02VojjKgRsnO/F0S1fvlyPH9pi
+						NgSt4ZhNEyh0Dk5U5aZ5D8x7eHJO6sGDBzUpZDHemQUI1HhBKSWejwZOHNMybtzd4yhbjx498m7KYEhRohpGuXHjRj3mkuG4hV6KRUPe
+						6Aci3nxKA0l5Me5ygOyK66GxKIjQKYlMrVbkvkhJ4A1cMn5xNOh8hU69SHOFg65YscL3oJLW4JHx3izeF88eBepVDJf6xQ9pIpWLYZiA
+						MYxSp929e1cbpffvglJN04h90I129xYkJeUcwhqCIJ8tVfYRBmM1btw4LWw3hWQo+SL1ud86BjleHPfPV+hEaaJ12NygeDtuFJ6JerEY
+						EDLfE+TJZBDKwTBMEUXoRADm3Ino3rESoVdXV8e69BKboOYnqrtxCz1XOVJOQsd2yVDdD6mA2JzJ5qX0ZfzukWiqZEInSnNAPE9QhJWT
+						ogHCvHcx0RyI1GFTPlLXeBfLJJkoQpdlpoxF0Ga6pnTjLhHCNm909FIuQqfx2717d99rcG+5MpR8CbJjMiNmX3jP9LQj+ApdIkNYrSdC
+						N9E8YHHFjBkzAg1eBoHOpKkBLwe42YVEC2p/VrQF9SnEiExPW7mhATdt2jTf2Q9JSzmHXCWWCN3kwp1CwbZpwLH52TmpNSk212Nq3YFo
+						y5vxyFQrNm56IRH4Cp3B52TCpn3cqXuxYJS9evUKFLqUEvy4gDe9Mk2pm3H8TT7OC0Pk3KiNg7y9pJqFZgn5wlz5kCFDdI/AD3e0z2UX
+						8mBNrsgfJwQopo2DGm1hU5ZRYVz87g/3jvGIy8Z9hY5H4aBBafKTJ0/0HCADEBT18VjMZ44YMSJnGslFMl3n58mk+x/XtMO/RIRJJM4F
+						4mLO3FsXuxHxmJqfdoNzwcn49QYEWUfPNfGsQtgjr8VMLZoAp8TUcVhvCYdN05PzNFUy4jC8vS9snAe4vDbOfhaQoaN8bCQMX6EzCHQh
+						ObA8W8zNxaMz79e/f3919uxZPe1AKomQaRDxwwKCpCj5GDKDOHv2bB3VWfwAHI+pjlGjRqlu3boZa4aUEzxlxtNmuYyIVYGMM7MSP3/+
+						dPZmw72iFDAtIBqAu3fv1pEozIEwF8x95Pi5GoKk9vk8aRcH2DfpOv2lx48fO3uzYR3J8OHD9fWYmrKUp+Jkao1jULaiNe/YSnnD8cP6
+						ZfngK3RgMPihPU6AA+FVuHmskpMVcYgdw2IFG08duecE5T28F+uGg+A4OAkGgIhEN5eoxHeyUogH/ytpNZwbPDbd3qAuK+PG2Ls3v5kQ
+						KQH8Nu5hVFiKLMtq3Rv9ErfDkfrS+zk2+gU4KjcS+bn2Ypu4hSA/6Og9R+8YuQXm3VikVOxTf2RnpOjYObpavHixb/nA2EyZMkWvH2F8
+						+WGRqAQK3RRElrD6hoYHXftiBy+pIGacKX2ItCBNrrAAYKmFzIiMopiZlNiFTooU1myiVjNV/yQRSeXSZPSscmzfvn1ov8FSC46RrNcv
+						68uXWIUuD6f4PY0mIPJ/1ZApB6j7aHAxl8t4VTo0uCjJwpp6lkzIiIsNBLEJnXqdHwMM+zF90nXSdu9SwLQhXddKN36ujSlCrrWUtXmS
+						oQtPb6zYtfaxCZ3uOYbrbtB54WbTWAr7TFrA2THrwM9oVSoYLVOEQfPwlkxoevM/ijCxJiL2Gt2SP4gd712JQmAajamqtGdv/wordIsl
+						BVihWywpwArdYkkBVugWSwqwQrdYUoAVusWSAqzQLZYUYIVusaQAK3SLJQVYoVssKcAK3WJJAVboFkvFo9R/WJivBGHVOxsAAAAASUVO
+						RK5CYII="/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="107.146" height="21.8577" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/equ_edge_direction.svg b/doc/img/equ_edge_direction.svg
new file mode 100644
index 0000000..d36634d
--- /dev/null
+++ b/doc/img/equ_edge_direction.svg
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_edge_direction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.2307in" height="1.4152in"
+		viewBox="0 0 160.61 101.895" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="123.86" height="65.1446" class="st1"/>
+			<image x="0" y="36.75" width="123.86" height="65.1446" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAASEAAACYCAYAAACrr18SAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AABsPSURBVHhe7Z3nzxRVG4f5B/jCBxI/GBMTQ/xgCDEGgzFANGKE2CIIQRELIh1sAQyCEKkiKHaNitIjVUikWbECooCCImBB
+						RFFUxAKW8+Y6zL3v7LJ9Z3Zm9/ldyQSe2dkyM+f8zt3OmVZOCCESRCIkhEgUiZAQIlEkQkKIRJEICSESRSIkhEgUiZAQIlEkQkKIRJEI
+						CSESRSIkhEgUiZAQIlEkQjXw559/usGDB7tWrVpltk2bNgWvCiHKQSIUAfv27XMdO3Z0nTp1cl9++WWwVwhRDhKhCHj33Xdd69at3cCB
+						A90ff/wR7BVClINEKAKeeOIJ74o9+OCDwR4hRLlIhGrE4kJYQooHCVE5EqEa+fbbb12XLl1afDzo77//dv/++2/wlxDlIxGqgB9//NHN
+						mTPHB6GxfAYMGODWrFnjTjvttBYXD+JcOWfLCnbr1s398MMP7tVXX3VXXnmlvz5t2rRxN954o9u7d687duyYe+yxx/y14/h27dq5cePG
+						uZ9++in4xJP89ddf7qWXXnKXXnqp/wyO5T28l88IM3HixMz328bv4D7NmzfvlNfYVw6cB7+N38j7LrroIrdx40b333//BUeIKJEIlcnW
+						rVvdBRdc4MaPH+87A6P+iy++mOkoxIVaIt9884278MIL/TZ69Gh/TRCo8PW55JJL3GWXXeYWL16csZief/55f93o7P/880/wac4tXbrU
+						77/++uvdr7/+6jv+559/7rp37+6uuOIKd+jQoeDI//Ppp596oTr//PPd/v37/b7Dhw+7Pn36+O/evHlz2VYan3/VVVe5Z555xgsi8H7O
+						b+fOnf5vES0SoTKwRt6vXz/fMYwjR474BktHI0OWFnbv3u3at2/vO3M1GxbM+++/H3xacbA6sD543+zZs7OsBXNVee2pp57Kes1+o1ku
+						xurVq/317Nmzp7++xrp16/zn5IoW8Lnz58/375s6daoXuu+++87dcMMN/t6VC58zc+bMrN/0yy+/uGuuucZbu1u2bPH7CsH7165dmxFC
+						UR4SoRKY20ED37BhQ7D3JNbJ2Ph/1NCoP/zww1g+OypMhM466yz38ccfB3tPYq+dc845bteuXcHekxQSIcDSREjC2PGF3F7eQ4LgjDPO
+						8C7ysGHD3IoVK4JXy8PuNd/zzjvveOuJ34EFt2rVqlN+UxjcxQ4dOnjBQrhE+UiESoDwIEAXX3yxO3jwYLD3JIyMjJCjRo1yx48fD/ZG
+						B7EJvveNN94I9qQPExo6LkIRpthrxUSIzk6mccyYMd4N4vqblVYs9mYWK8eNHTu2qGjkg3vIvbTvQtBuvvlm99FHH2VZcYXAJec351pq
+						ojgSoRJMnjzZN8h8jcvqg+KKByFyxKH27NkT7Ekf1YqQCWyuCJFhxMXFJZw+fbq3ArFITLSKiRBC8dxzz/l7cscdd2RiOpVA/KlHjx4Z
+						IWJDjEqVX1ipxqJFi4I9olwkQkUw85yGOHfu3GDvSazRlRMrqBa+kwDt0aNHgz3po1oRstfCIkS8jbgblg8xnrD1UY4I8X4ylgSo+YxK
+						3TFzAxE9RJL4DrEp7j+ZuGIglohXrksqSiMRKkJYhHJHQosHMZrTYHHVRowYkYkH0KCJE5DV4Tg+h+PgzTff9GnfkSNHeitn+PDh/nMm
+						TJiQeR9/MwITZyDDs3DhQv/ecrAOayN5pVs1gekoRMjcW7umYejcxJ24jqT1CUCT+jcQLALj7N++fbt3y9iwbMphyZIl/rzvvvvuLNea
+						WBYxLSziMFhZCxYs8FYb9weR4lxyf7cojUSoBEzFoGOGRYiR0tLPFg8idsSxdAbSvNdee61P8zKysg9hwUWgA5EpQsSohRk6dKi3dGjU
+						iBHvARoznZGsUJqJQ4TyBfotdY8Icfxdd92V9Zmvv/66T8mbCCDa3B+sVYS9GOHBBhcw7HYjxtQLhUWZe4rYITzcN7Pg4ooNNjsSoRIw
+						kjKiTpo0yTc+GiwjLjUrWCrEirB+sGqoI+EYju3fv39W4ye4jKi89dZbvkMxshN0tawRQkUNEp0GV4/Xu3btmup4EJhYYi3s2LEj2HuS
+						8Gu52TGzMMJWD9cWUUYM7r//fv83go/lSGyM643l8cUXX3jBxvrkelMewesIvblwCEPv3r29EOHWclwhzIoaMmRIxvVln8WHEKbw+3HT
+						qJBn9QSwgHZLrRWrFYlQGWzbts3HBmjQdBxG2RMnTrjly5f7vxkply1b5hsuozMjP0ITho4QHuFXrlyZlc6l8RP/sQA4Ac40p3vD1kN4
+						49yxGrFwcl9j32effVbwNSwcc0cty8W1nTJlihcqOj9/I94IT27FdNjiyldNze/ld+eD/Q899JB3fzkW14x7TgU4QmjYeYetHn4bv79U
+						8FrkRyIUMVg8uTUzNlJaJ0BkEBssHxu5yQoxujK1IN/rIh1YLDCcqMCNxGrVWlLVIRGKGHO7zMUAzHYEhpEcsG6wcrCGDCwnGvKBAwcy
+						r1u6l3lLbCJ5bLAIWz0I0i233OIrvIn3lYpBiWwkQhFDnIIJnFQ6A24WcZ5wXMEyPTaVAeEhkE1wFSxoS0MnyM171bDTAfeBeJ+528TB
+						cB1x/xhsyLKJypAIxQCTXQmgkobPF1fAArr88st9hoX0LrEg3mOuF/8Sd6LehQBs2KpqaXDuzKpPEwSssVSxeCmrYLDo3Lmzr67ON8FW
+						FEciVGcs3mMBaJEfJoEi4ASIw26raD4kQnUmN94jCkMQWFXIzY9EqI7QmXDRSPXjalVSBd0SIQ2P2xpe0kM0HxIhkVo0K71lIBESqYE4
+						EFXLZAYHDRrkK57D9TiiOZEIiVTAWkBM9mXyKVBTxTSNuFYoEOlBIiQSxyaAsrSqlSkgPlhEuRNZRfMhERKJQ51NrtWDG1ZsrpdoHiRC
+						InEIQIenuthcOz3RtmUgERKJg9iErR7ECFFiHh7r+KR5jW1ROxIhkTgsCEcBJ/VALBLGKoa2LMcLL7xwygMGRHMhERKJw8Re1hBinSAm
+						/7JiABNCWaiMCaFazqS5kQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohE
+						kQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkDgF
+						Hru8detW9+WXXwZ7hIgPiZDIwDPhv/jiC3ffffe5Nm3auE2bNgWvCBEfEiHhdu/e7dq3b+86dOjg+vbt6y644ALXqlUriZCoCxIhkcUf
+						f/zhBg4cKBESdUMiJLKQCIl6IxESWUiERL2RCIksJEKi3kiERBYSIVFvJEIiC4mQqDcSoTx8/PHHrnv37u6SSy6p69a7d+/ECwQbUYSO
+						Hz/uJk6cmPeaxr098cQTvrhTVI9EKA/79+/P1MrY1qtXL7d+/Xr32muvVbStW7fOPfjgg27q1Kl+Gzx4sLvwwgtd69atsz7ftmeeeSb4
+						FcnQiCKECMyePTvrOrZt29YLRL57UmqbP39+5n4hbldeeaVr165d1ufbdvHFF7uDBw8Gv0RUg0SoAK+//ro744wzMo2N/7MvKqhO/uST
+						T9y0adPcOeeck/meq666yh05ciQ4qv40qjt27NgxL/B2Hdn4m/1RwX1ZvXq169OnT9YgsnTp0uCIxoY2uWTJEnf11Ve7jh07uhEjRriv
+						v/46eDU+JEIF4IYwEoYbNdYRVlLU0PFffPFFP9rSuDds2BC8Un8aOSb06aef+s4TvmdYSFG7S3ze3r17Xb9+/fx39O/fP1KxS4pFixa5
+						BQsW+PMzUb/iiivcoUOHgiPiQSJUBEa+nj17ZjXqqEfXMMSDiEUNHz7c/fXXX8He+tLIIgQrVqzIslKitmDDMFDhPvMd77//frC3Mfnl
+						l1/cHXfc4Q4fPhzscf7+cy0RpziRCJUg3+j61FNPxRaM/Oqrr3wMguB4EqRFhBBh4jOVCn4+C7Zr167uwIEDwRHR8u+///rY09ixY/13
+						p5Fff/3VrV27tujAZvMHmbz8zz//ZO0jLhYnEqESIDYEKus1ugIN5t577800hnpy9OhRd9NNN/nzJP4Rl9gWA+GhU3Pdq/l+3AfcCLtf
+						bHFasHTyQYMGuZ07dwZ70oW1Ya5poWvw/fffu8svv9zdfffdPtsIW7ZscaeddpqbN2+e/zsuJEJlwAjHDQw36jh9ZUZXzGP+rQc//vij
+						69atW9b55W5xN0TDLJlJkybVZFlgSZ599tmZ388gUq2olQMWJFtasevKVui6njhxIvMaA+DkyZMVE0oT+UZXfOikYjfNysaNG91FF11U
+						cwKgkAW7efPm4IiWBy4pJQXEzUqBiFMHRTgibiRCFUCMhIZsjTru0bWlYYmAqGJuWCZDhw7N3C+2eozsaYZAOm4X7lchuD633nqrjwnV
+						A4lQBdAxSKWHGzWBu3qMFi0B6m24nlE2/u+++85nHMP3rFZXr5HZt2+fT7QUKoolZsT1+fzzz/3fhAR+//13//+4kAhVCDcptyiOAsMf
+						fvghOEJUg2Xl2KKOreQWnmLBLly4MHi1ZUHQedSoUe7666/3SYgwCPPcuXOzpg7hlilFn0LyTesoFvATpbF0MOnuqMGCzZ3WgTXQUi1Y
+						hOass87KKgOxGNq5556bNTeOJX/feOON4Kh4qFmEMNeYU3XppZf6EYYR55577nE//fRTcERzkm90JbUuqoM5dlzHuBp8PguWimfS62kA
+						EaBNUSNGW+rRo4d79913fZZ03Lhx/sEDzDlkX63wGXzHypUrgz3O7dmzxwtQ+Pqw5YpVHNQkQqaeXDDK2AFTmnQ2Fy6JOpd6gdWTWxTX
+						kkfXWsECYg7drl27gj3Rk6/wNA0WrPWjCRMmeLHkb2KPWIYE6skYEki++eabfXyr1rmFZnWSgk8DNYmQBbnCNSSoKzcXv9OKngpBHOWl
+						l14K/ipNblC40o3MSJSTQxlFbf6QbXEWxTUzVOV26tQp9qVMsFaxAux+Yc3GWXhaDlgaI0eOzGo3WIT8PpvCw0oM/H3NNdd466gW6Hek
+						6seMGZMKQ6EmESJlzYXB8jGzFovo4YcfLjr7lpgKCo+JGTYJG5F8o2uc0zqaEQtKUzBJ4WSc5LNg45zWUQ78ntxJy1iG/DYb4GlnCNH2
+						7dv937VgxalxJAGqoSYRsrJuu5mdO3d2jz32WFmWwLfffuvduLj9zXpApiV3dI3Cd28p1FOEAEuAjKbdLzaWrfjzzz+DI5LFrkdc8Zim
+						EiFGFUQHiyZ8QykQK3VydFKKpqJ0j5Iid3Stl0sWvuaNsuWj3iIEYQs2DS5ZGFxSXNMoXK98NI0I0fGso3EiZi5yQ8spOMPcTItPGgXM
+						fqfhtPSK3Gqw2hXiFPWqt8JdnjNnjrdg01b1bktoxNU/TIQYLNNg/VUlQjSUa6+91gvOtm3bgr0nb+z48eN9qo+UXxjiQEOGDPEnz4xj
+						1lOmXqES0haYNswSUnaseghMR10tXQyzhJLOjtFnWK2A/mDz2iwInRsvJT5kMSJ+86xZs/w5sBAZwkWcldqe5cuXFxVVS8fHvURHuVQl
+						QhaQ5gJYeTcQxSean7soFzec1KIF1chQIGDElBodbjYj6fnnny8BqgE6F/HFerQJm4yctAABsdEuXbr4/kRlMoOkxavoZ4bN57KJvSwb
+						smbNGn8M140Bmpo9MtYM9MWWFSHORLypUiPA4JpNnz7dfy8eTa3WWlUixIXgRFl0yhQXl4ysEAVV4dHM0tgzZ87MHEtD4/3cgEaG87H5
+						TmmKKTQi+Qro4gArnikLaSmlMBEiw8yqhogJAskgbes58ZuJs4bbGMkQ+hkicMstt2Tmd3Esbm2xqRYUhnKtq02emDuHUBKCqLWsouqY
+						EG4YC35bUJry7ilTppzi06PUuVYPCpyWoFgt0CgYUdIWU2hEeGIFnYfJk3FdS0QH8UlT3I5zxX2iUJO+hCuGoFCgaE9lwc3aunXrKdfF
+						Ymm8xzArB6EpBEWKXOtqnxKCJYRRcfrpp7vzzjuv5sXcqhahckGpwwHHfBeuEcH1wgXDLE3apA9j8YTwVqh6HcuVRh4+lr/JeNYbfh+/
+						M66njXCPcL+Y81evuFPcmBUVnuqCJVkstka2jaxbFDMaEMUZM2bUfD1jFyE6RdjqMXORC8fi4HFPjouDNMUUCmGV62zE7ogV5IM4Alki
+						zocMX5IQmGUUj/ppI3SWZozb4V1QaGnukMVkce0KtUv6HEHpKOqPELTcxfGrIXYRokGhvIxuXCRMQVPqF154oeEeHIeIpimmkA86HW4N
+						7jGCjxCFY3K5YK2yJQ0dhw4UpavOOTdr3I57xhK2CArniQtH1rqQq0n/QzSKiVQlYECQYUskMF0JnCzmPf4tM4S5UPxwzGIetFaoY6SR
+						esUUEGhGtGpFjnViyKTQOBEfRKhQDIA6EYKeaanwZhEyilijWpGgHnE7OiFWcT2fF2dhDRamHzZsmE/N42LlxmTD0Pdwd7nGtUL7Z0Jt
+						ODteLbGLULOAmNYjpsD3YMWEJwVXCnUg1113nTeTCRragu/5nhRKXAHLLk2ZStwMhL5W16lecTu+hyRNrW5JJVg8qFgAOgzC07dvX/fO
+						O+8Ee2rj5Zdfdm+++WbwV21IhMqABsxymHHHFOx7SHsWiuGUAw3TVjGwOAEilO9JoVhAWEJpmTdlkA267bbbio7sxSBOQm1a3HE7+55i
+						7m4cINQMiLlFwfngGlIgHM5QpwmJUAloWJjymPRxxhRwofDVyU6ZgFQLyYBwIRpuAp9LcVnuk0LTEg+KEkscxBm3o128/fbbPkVdryJL
+						4+mnn/YlMaT0cV3jtMzrgUSoBHHHFCjmfP75532dCNYKYlFLbIEak9xRL1yFG07Npi0eFAVxx+3IJu7YscPdeOONmfKGZnkWfVJIhIoQ
+						RUwBi4YKU9so8sRdmjZtWmZJXBqybbUUkQHuAfGgXDcGN4/PJz5kxWWVxIMQYILFNm0gF86NJ7cyQtdqyVUL96jWuB3nSerZ7tc333zj
+						Yx8MQohbu3btsu4XW6EnV4jykAgVgEWuGE1zG1zcW62xBSrUScPmioCtghn+Diwg4i6l0uFkNxGXYktLYCEgZkzRScK94/uZ8pAr6nFv
+						xWqwRHlIhPJgMYV8jS7OLV/MplIKxXisIpnvMWurkngQx5VaWgIrjKB6vQtQEVQslXoLEFvuZG1RORKhPOS6UPXaiN0wolcLFg2WTaEY
+						DwKH0NF5sBqIB4VnaheC2BGuSKnnT/G9WAblZGyiJNeFqufW6PMf04BEqIkoFeOhw1CNjAhRPEpmpZx4EMeUsxQv1lJcqwGK5kUi1ESU
+						U/Nj6XqEqND0CNwLFsoio0YlLhXuLN1QrGYH65GANIvaYZngmlG1TQ1NFFW1onmRCDUR5cR4wun6fMdahgnhQYxsPahSGS8EilgTE2fJ
+						vvHZ1NGQTSrH5RMtF4lQk8CjlrA6SBeXiitxDNZQPnEgDR+u2DYLp5S44apRuMfjnp599lkvZsyKv/POO33sRIhCSIQaHNLn+bJCjz/+
+						eHDEqSAwvXr1OiUeZDGjsNWDhYMrVsqaIWjN97K0BNaQivdEuUiEIoSnydIJzzzzzNTO0ykGosSkyPCUD84jvGZNPogBEQsihY/7xvQT
+						VRGLcpEIRQiWBOsC05HLyTqlDavzCVs9CBJrGBNLYiVGhAV3z9Y0BrJhZMVs5j91QlhPuGGvvPJK1hNZ0gZuLFYhsSvmYiG4BOPZKDcY
+						MGBA0acJi9qRCEUMk0fDC483EggMFowt+cEcKToiQWpcONZ/AlZiZN1we0QNdUFMlTDrD3cMty4sXGmGAkysuNzyAuJaWHWs8cy5iHiQ
+						CEWIFfWxemSjQjqdzkima8KECd4q4vHeLGBlE0IRI0SIND5QGsCKftZROY6MGrVFhQon44ZpNwTgyfTZxu/Nl+EzS87KC8JgCRJzS+o8
+						WgISoQixmAqWQLOD9ZPG86Ss4JFHHnEPPPCAn3xarKzA2LVrl1/FILci3Cykej6UsSUiEYoQ3BHcFxo1pjxTI4iN4KIVm3PViLASQD2D
+						74gAywPzpFKsNVaf5NqGFz7jGs+ePbvidZ8QU5Zrya0IZ34dFiETgjU/LD4kQhGC6Y5Z//333/u6GlYHHD16dMHK5EYFd4vzQ2jrAeKC
+						a4WLxbUkRsP/cZ1YEcDWSKJIEuHPdamKwbEIGsuqhJdnxbXk3jENJlwpTiAb1zOf60Yg/vbbb09sKZNGRSIUEWa6k03BFWC0RniIDzWb
+						e0YHq2cnM9FDeHKfs4XwU6SJaKxatcpboZXAipYIDY/BYZoJIseCZawZvWzZsrwWEL+l0GRe7nW4xEGURiIUEXQCOkPbtm39yIz7UMmI
+						LEqTW7Nkwo+IICY8GhnxQEjybbNmzTpFVIhtIUClKsINkg+4Z/km85pVVU83tRmQCEUEIzAz02mc7733nm/Y9XwETEsAoQiXP1hWi46P
+						AFRjCRHbKpX9oi6Kz2ZKyr333lvwyRr8nhEjRvgEBU+joLQhzTVSaUEiFBGY4RZXoJPQWSjeo4KYznPixIngSFENuH/EWgjyG7lPE60m
+						JsTnlSou5XldDz30kP9cXMFCMR+s37vuustbZMSOcO8kQqWRCEUEsR9rnDZCU2PzwQcfuNdeey04SlSLlT/Y00Nxv6jJCj+AgP3MpSvX
+						HbJ4EJ9TaPkTCi2J89lnEgsq5LphVeGO85uUTSsfiVAE0IBZx2fFihX+bzrF4sWLfRUxJrnmUNWOxYPIhhF7I12/fv36U1YMIBlAYoAn
+						mBSqcuYYxAz3GVeMKRtYsR9++GFwxP8h/mQPDiAGxft4gGA+yxarigGHe09VOaIoC7g0EiHREGB9WAC6FAwCuEaPPvqot3JsLhgDQqVZ
+						PcQHS4j0O08x5f9YtzyB4+eff/ZTXBhkcMFZtoRANxYw38W0F9wyURyJkEg9Fg9KYjoMgobQsEbSk08+6WuHEERif1999ZW3zghqU0ZA
+						pg5Bwnri/8QJ61VL1chIhESqYQY7xYHMcOexQzNmzPBuUVrA4vroo4+Cv0Q1SISEqBKsJOKAhWJPojwkQkJUCa7Wb7/9FvwlqkUiJIRI
+						FImQECJRJEJCiESRCAkhEkUiJIRIFImQECJRJEJCiESRCAkhEkUiJIRIFImQECJRJEJCiESRCAkhEkUiJIRIFImQECJBnPsfh6LP/cPu
+						K/UAAAAASUVORK5CYII="/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="123.86" height="65.1446" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/equ_guided_filter.svg b/doc/img/equ_guided_filter.svg
new file mode 100644
index 0000000..021c194
--- /dev/null
+++ b/doc/img/equ_guided_filter.svg
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_guided_filter.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.42115in" height="0.772328in"
+		viewBox="0 0 102.323 55.6076" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="65.5731" height="18.8576" class="st1"/>
+			<image x="0" y="36.75" width="65.5731" height="18.8576" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAAJkAAAAsCAYAAAB2Wxp8AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AAAZ7SURBVHhe7ZvvK15vHMf9A57sgdoDeSTtgSRJ8URtZUWhaPPAj7C2FpOE2APlx2JCnnkg+bGhlZUUTbS2VsuoJUIYkl9h
+						MjHMj8+398e59r3dzn3f577Pdcx2X6866fy4z65znff1vj6fz3XmQwqFxSiRKSxHiUxhOUpkCstRIlNYjhKZwnKUyBSWo0SmsBwlMoXl
+						KJEpLEeJTGE5SmReyM+fP+nr1680NDREg4OD9OXLFz4mODo6ovHxcTo9PdWOmMMrRPb582fy9fUlHx8ft7ZXr15pd/j7OT8/p6mpKcrL
+						yyN/f39KTU2l169f0/v376m9vZ2SkpJoYGCABfbixQveZOFVTra0tESRkZEsIIxgPX78+EFlZWV8TW9vr3b072Z/f5/Ky8vJz8+PiouL
+						aXNzUzvzP3t7e3wuPT3daf94gleJ7MOHD9yBYWFhNDc3px29ihDjp0+ftCPXx+rqKs3Ozmp75oGg4FoBAQH05s0bOjs7085cZWRkhG7f
+						vk0hISE0MzOjHTWPV4msvr6eRYZOx8h1xPb2Nj18+FBqRxsFU7SsaXpjY4OSk5M5VMDUiCnTGRDkvXv3XPaPu3iNyBDYPn78mEVWUVHh
+						tMMhsszMTHaV60aWyE5OTqi0tJSfNycn51Jg74iDgwPKzs7mwSgTrxEZBBMdHc2dbh9r4RxexO7uLu/DAXJzc3/vXyeyRPbu3Tt2MEyT
+						o6Oj2lHnwL3S0tKkhwmGRIZ0NzExkRuNzv/+/bt25gLEMHiBNTU12pGbh8gwg4ODaXp6Wjt6AVJ5jHpnKTucoaenh+7evcv3wctA7LS+
+						vk6PHj3iY+gjs/GUDJHt7OzwNIkB9ezZM84YjQC36+rqYieXiUuRodOysrJoZWWFFY6Gd3d3a2cvgDPguGyblUlTUxO3Eam6rUN9+/aN
+						YmNjnWaSEBgGUHNzM78w7GPKjYqKopSUFJqYmKD5+Xm+z9OnT+nw8FD7pfvIEBkGDUSP58XA+NM4FRlGNlJfZGVAvKi2tjbeB7impKSE
+						j4vrHIFr8XJwracbnNToyBQcHx9Tfn6+7v2wBQYGcvHREZh6qqqqWFwC9AF+W1dXx+0R98df/HueIkNkqHGhLXqu/SdwKrLl5WUemRj5
+						qLVkZGTQnTt3aHJyUruC+BzcASk/ps2biKN4DOl8a2srOxCmGD3gSoWFhZeeWQwsuIWIX/AXTo4+M4NZkdkmOM6e6zoxHPiPjY1xDQUP
+						YJupwAHgBLLTXpmItuuNbIgDgnEWj9mztbVF9+/fZ+G6m4GiLII6FETgyXbr1i2uZzlCmAGutX9XtiDksb+32MxO+fYYFpmoMdmPMtFY
+						mcsQshFTm308BhBn2ceYrhCiNTs16iHTybBy4ao2BhAS4XpnojSDIZHBoeBU9pVg23hM5jKETGzjMXcdyxEQAe6HGFU2ZkVm+06MOJJ4
+						t7jeqsTNkMhETGPvBJjvMe+7WqYRoAOuO/AXUxt+68laJIL9lpYWnoKQiQrRwsngaAI4RmVlpekak1mRAZHto6TiKiYzsp5rFkMiEw1B
+						NRhVYcHw8DB39t8Qj7nKIB0hYk4E+ai1LSwsUERExBVXxxcOT548MR1oyxAZ2iCWk5AZO6O/v58F5sgoMMhevnzJfQjn9mQmMCQyOEdB
+						QQF3LupmGLX4Bgn7aOBNrY+hnR0dHdxGrMnpfX3gCogMCUNDQwO7eHV1NTsEsmxRSYfD4esFCM0sMkQGMABQx8M7snVcgf071ItXAQqz
+						MTExfI2nFQTDgT+q/M+fP+dlCmQ4RUVFXPXGP37T4jGsUAQFBXHb9LYHDx7wJz1GwEjGCMZz456dnZ0cHKMyDvGhL1DxX1xc1H5hDlki
+						AxhU+HwHLoSg/u3bt/z9GP6iiAxhQYAYiJjq9ZIEPD9qgfgGLTw8/FIpxyiGRAaLtLfJtbU1dgdP1a3QR6bIBJg+ESvCDLAaAAezdS2I
+						69evX9qePrimtrbWoy9TXIoMUwAsNS4u7tKaFpYr4AqYrz2ZpxX6YHH+Jg5aiBIhExIpd3EpMrGUZFs9RoCI+T4+Pp47RfHvgyVD1NMs
+						CfyRDoeGhvIiMBALyshesGiu+PeBkeAjCU+/MHEpMgS5jY2NHPQiJU5ISPj9Hw4U3kFfXx99/PhR23MfQ4G/QmEGJTKF5SiRKSxHiUxh
+						OUpkCstRIlNYjhKZwnKUyBSWo0SmsBwlMoXFEP0Ht6gu9OfTLrAAAAAASUVORK5CYII="/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="65.5731" height="18.8576" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/equ_wiener_filter.svg b/doc/img/equ_wiener_filter.svg
new file mode 100644
index 0000000..fcea1c8
--- /dev/null
+++ b/doc/img/equ_wiener_filter.svg
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_wiener_filter.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.45687in" height="0.790186in"
+		viewBox="0 0 104.895 56.8934" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="68.1446" height="20.1434" class="st1"/>
+			<image x="0" y="36.75" width="68.1446" height="20.1434" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAAJ8AAAAvCAYAAAD90RiVAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AAAWxSURBVHhe7ZrLK25fGMf9AyZmRkpJBjIwIFKKHMXAwG1CopBbyUAUQilCuRQDA3KdGCgpYqCUASUnRSSSW0jI/bZ+fddZ
+						y9ney7bfd2/v2s7v+dTqeNde+z1r7fV9nvVd691+jCAUQeIjlEHiI5RB4iOUQeIjlEHiI5RB4iOUQeKzKbe3t6yvr4+Nj4+Lmn8PEp/N
+						+P37N8vNzWV5eXksLCyMjYyMiCv/HiQ+m3JxccF+/fpF4iN8D4mPUAaJzwDLy8vM39+f+fn5eVTs+lBXV1dZYGCgyz4HBQWxtbU13q6n
+						p4cFBAS4bJeens6urq54O2/xVHyHh4csMTHRqS9zc3OihT7r6+vcY2rvjY+PZ1tbW6KF9ViW+fb391lMTIzugK+vr1lDQwNvMzU1JWrt
+						iXY8k5OTotYZjBVtIiIiLJ0obzPf/f09Ky4uZtHR0bxfQ0ND4op7sLMuKSlhoaGh/J6ZmRlx5XuxTHyLi4u845GRkWxnZ0fUOiMndWlp
+						SdTYE5nRw8PD2ebmpqh1pr+/n487JyeH3dzciNo/TE9Ps6SkpC9La2sre35+Fnf9wVvxHR0dsezsbFZfX8/71dTUJK64B8c5tbW1LC4u
+						7sv5sxLLxNfZ2el2ErTgoeLhfGc6twIpqpSUFHZ5eSlqP/P09MQqKyt5u+bmZvb+/i6umMdb8cE2lJWVfWRkZLSHhwdx1Zm9vT1WWlrK
+						28NufDV/VmKJ+JDqi4qKDE0CHmp+fj6PULuiFRX+xWdXnJ+fs+TkZN7OqLcyirfiwzKLRAAPFxISous/X15eWGNjI1+FZLC1tLSIq9+P
+						JeKDkGBO0XlHL4driET5AE5PT1l5eblpQ/6dnJ2dfZh3Pc8kJ9hqvwe8EZ8MGlggaW8wL+4CfWFhgQsVyUMGm9VBpIcl4tPzR/Pz89xP
+						vL6+ihr7I3e8KPjbHRMTE3zCrNjdSnZ3d1lmZib3X3im2ATAF3Z3d4sW7kEmxi8jEJ4Ur7vAQIBVVVWxk5OTj+TxHUGkhyXikynbcRLw
+						IOGZ7L6zdQTZBuPRyxqwFnLnbrXf8xZkYqwqd3d3n6yQ4+YOfYWYZ2dn+WcZbFYGkRFMi0/rj1wVLEt4KJ6CTIlJdfWdRgsm4vHxUXyj
+						MfD/1tTUuPw+d8UuwYVMjGUUaMeBei2Yj7q6uo9nI5OHr4PItPjc+b23tzc2ODiou1u0I9pNhJ7fktniq6MYXyHFpvVs8gQC4pLgTK+6
+						upptb2/zz9rk4esgMi0+vUlAuscD+Ul+T24ivvJ7cmn29VLlDvQBh8vwexLZR9gDZDSUsbExXmSGk8lDRRCZFh92g+4mYWBgwCnl2x05
+						Ych+yIKu0GYLuwQXgqawsPDTGR2CH32E94MHRLZD1kP2k8jNooogMiU+O06CGbQ+CePC+FyhXZrtElxYMh3P6GQWh/U5Pj7mPs/Rf0u/
+						p2L+TIlPOwlW+wU8CF9vODz1e95upqxGBo3jGZ0860Pp7e3lK5F2Q4FfPvALCMarIohMic9uk2AWmSmM+j29pdmXYLksKChwOqPDRg9Z
+						D33NyMjgn7VIv2dk/vDdsbGxLDU1lR+hWYHX4kMEDQ8P84Hh1wAcWv5kjI4HmbSiooK301uafQVOFbDqoM/azQaQZ314FWxlZUXU/gWe
+						EH7PyPzJgEORxzlm8Vh8eJ9NvnrjqmRlZfFXp34KGA8mx9VYtO/v4dgIE+WqnYrjJIglLS3NqS9RUVGfMhN8YFtb24efOzg4YAkJCU73
+						oWjH6wgyH+4LDg7m3hGiN4upZZf4/4HjmI6ODvHJHCQ+wiOwxOttxjyBxEcYRr79srGxIWrMQeIjDIEN2ejoKGtvb7fsPJDERxgCr2h1
+						dXV9+nXELCQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhGM/Qd0+F8Wgj3WpQAAAABJRU5E
+						rkJggg=="/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="68.1446" height="20.1434" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/inter_motion_field.svg b/doc/img/inter_motion_field.svg
new file mode 100644
index 0000000..091ae11
--- /dev/null
+++ b/doc/img/inter_motion_field.svg
@@ -0,0 +1,219 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_motion_field.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.60417in" height="1.72563in"
+		viewBox="0 0 403.5 124.245" xml:space="preserve" color-interpolation-filters="sRGB" class="st21">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:url(#ptrn11-12_10);shape-rendering:crispEdges;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st3 {marker-start:url(#mrkr5-20);stroke:#923931;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st4 {fill:#923931;fill-opacity:1;stroke:#923931;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st5 {fill:#ffffff;stroke:none;stroke-linecap:butt;stroke-width:7.2}
+		.st6 {fill:#923931;font-family:Arial;font-size:0.666664em}
+		.st7 {baseline-shift:-32.4939%;font-size:0.649878em}
+		.st8 {stroke:#923931;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st9 {marker-start:url(#mrkr10-32);stroke:#923931;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st10 {fill:url(#ptrn17-38_36);shape-rendering:crispEdges;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st11 {marker-end:url(#mrkr10-44);stroke:#923931;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st12 {marker-end:url(#mrkr10-56);stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st13 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st14 {marker-start:url(#mrkr5-62);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st15 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.34246575342466}
+		.st16 {fill:#0070c0;font-family:Arial;font-size:0.666664em}
+		.st17 {marker-end:url(#mrkr10-70);stroke:#0070c0;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st18 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st19 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st20 {fill:#000000;font-family:Arial;font-size:0.499992em}
+		.st21 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Patterns_And_Gradients">
+		<pattern id="ptrn11-12" v:fillPattern="11" v:foreground="#002060" v:background="#ffffff" patternUnits="userSpaceOnUse"
+				width="6" height="6" viewBox="0 0 64 64">
+			<image x="0" y="0" width="64" height="64" image-rendering="optimizeSpeed"
+					xlink:href=""/>
+		</pattern>
+		<pattern id="ptrn17-38" v:fillPattern="17" v:foreground="#923931" v:foregroundOpacity="0.47" v:background="#ffffff"
+				v:backgroundOpacity="0.47" patternUnits="userSpaceOnUse" width="6" height="6" viewBox="0 0 64 64">
+			<image x="0" y="0" width="64" height="64" image-rendering="optimizeSpeed"
+					xlink:href=""/>
+		</pattern>
+	</defs>
+	<defs id="Markers">
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-20" class="st4" v:arrowType="5" v:arrowSize="1" v:setback="5.47" refX="5.47" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(3.4) "/>
+		</marker>
+		<g id="lend10">
+			<path
+					d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+					style="stroke:none"/>
+		</g>
+		<marker id="mrkr10-32" class="st4" v:arrowType="10" v:arrowSize="1" v:setback="2.07" refX="2.07" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(3.4) "/>
+		</marker>
+		<marker id="mrkr10-44" class="st4" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+		<marker id="mrkr10-56" class="st13" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+		<marker id="mrkr5-62" class="st15" v:arrowType="5" v:arrowSize="0" v:setback="4.63" refX="4.63" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(2.92) "/>
+		</marker>
+		<marker id="mrkr10-70" class="st18" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(47.13,0.12) rotate(-90) scale(-1,1)">
+			<title>Parallelogram</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(155.13,0.12) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.2</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(-60.87,0.12) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.3</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape4-7" v:mID="4" v:groupContext="shape" transform="translate(26.88,31.62) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.4</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape5-9" v:mID="5" v:groupContext="shape" transform="translate(134.88,49.62) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.5</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<pattern id="ptrn11-12_10" patternUnits="userSpaceOnUse" patternTransform="rotate(-90) scale(-1,1)"
+					xlink:href="#ptrn11-12"/>
+			<path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st2"/>
+		</g>
+		<g id="shape6-13" v:mID="6" v:groupContext="shape" transform="translate(-81.12,13.62) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.6</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape7-15" v:mID="7" v:groupContext="shape" transform="translate(56.8008,-95.4345) rotate(9.46232)">
+			<title>Sheet.7</title>
+			<desc>MVref</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="41.0586" cy="124.245" width="82.12" height="0"/>
+			<path d="M4.1 124.25 L4.46 124.25 L82.12 124.25" class="st3"/>
+			<rect v:rectContext="textBkgnd" x="32.0251" y="118.245" width="18.067" height="12.0287" class="st5"/>
+			<text x="32.03" y="127.25" class="st6" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV<tspan
+						dy="-0.287em" class="st7" v:baseFontSize="8">ref</tspan></text>		</g>
+		<g id="shape9-24" v:mID="9" v:groupContext="shape" transform="translate(164.801,-77.4345) rotate(9.4623)">
+			<title>Sheet.9</title>
+			<path d="M0 124.25 L82.12 124.25" class="st8"/>
+		</g>
+		<g id="shape12-27" v:mID="12" v:groupContext="shape" transform="translate(123.949,167.675) rotate(-170.538)">
+			<title>Sheet.12</title>
+			<path d="M1.55 124.25 L1.91 124.25 L27.37 124.25" class="st9"/>
+		</g>
+		<g id="shape13-33" v:mID="13" v:groupContext="shape" transform="translate(263.13,0.12) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.13</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape14-35" v:mID="14" v:groupContext="shape" transform="translate(242.88,67.62) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.14</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<pattern id="ptrn17-38_36" patternUnits="userSpaceOnUse" patternTransform="rotate(-90) scale(-1,1)"
+					xlink:href="#ptrn17-38"/>
+			<path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st10"/>
+		</g>
+		<g id="shape8-39" v:mID="8" v:groupContext="shape" transform="translate(353.801,-45.9345) rotate(9.46229)">
+			<title>Sheet.8</title>
+			<path d="M0 124.25 L25.71 124.25" class="st11"/>
+		</g>
+		<g id="shape15-45" v:mID="15" v:groupContext="shape" transform="translate(272.557,-59.475) rotate(9.46231)">
+			<title>Sheet.15</title>
+			<desc>MVref</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="41.1819" cy="124.245" width="82.37" height="0"/>
+			<path d="M0 124.25 L82.36 124.25" class="st8"/>
+			<rect v:rectContext="textBkgnd" x="32.1485" y="119.445" width="18.067" height="10.0769" class="st5"/>
+			<text x="32.15" y="126.64" class="st6" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV<tspan
+						dy="-0.287em" class="st7" v:baseFontSize="8">ref</tspan></text>		</g>
+		<g id="shape16-51" v:mID="16" v:groupContext="shape" transform="translate(245.314,-64.0156) rotate(9.46229)">
+			<title>Sheet.16</title>
+			<path d="M0 124.25 L25.71 124.25" class="st12"/>
+		</g>
+		<g id="shape17-57" v:mID="17" v:groupContext="shape" transform="translate(163.726,-75.3635) rotate(9.46229)">
+			<title>Sheet.17</title>
+			<desc>MV0</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="41.6032" cy="124.245" width="83.21" height="0"/>
+			<path d="M3.47 124.25 L3.83 124.25 L83.21 124.25" class="st14"/>
+			<rect v:rectContext="textBkgnd" x="33.3787" y="119.445" width="16.449" height="9.59985" class="st5"/>
+			<text x="33.38" y="126.64" class="st16" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV0</text>		</g>
+		<g id="shape19-65" v:mID="19" v:groupContext="shape" transform="translate(245.326,-61.7636) rotate(9.46229)">
+			<title>Sheet.19</title>
+			<path d="M0 124.25 L25.71 124.25" class="st17"/>
+		</g>
+		<g id="shape21-71" v:mID="21" v:groupContext="shape" transform="translate(225.375,-0.375)">
+			<title>Sheet.21</title>
+			<desc>Current frame</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="29.25" cy="117.495" width="58.5" height="13.5"/>
+			<rect x="0" y="110.745" width="58.5" height="13.5" class="st19"/>
+			<text x="10.74" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Current frame</text>		</g>
+		<g id="shape22-74" v:mID="22" v:groupContext="shape" transform="translate(331.125,-0.375)">
+			<title>Sheet.22</title>
+			<desc>Reference frame 1 (R1)</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="36" cy="117.495" width="72.01" height="13.5"/>
+			<rect x="0" y="110.745" width="72" height="13.5" class="st19"/>
+			<text x="4.49" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame 1 (R1)</text>		</g>
+		<g id="shape23-77" v:mID="23" v:groupContext="shape" transform="translate(119.625,-0.375)">
+			<title>Sheet.23</title>
+			<desc>Reference frame 0</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="29.25" cy="117.495" width="58.5" height="13.5"/>
+			<rect x="0" y="110.745" width="58.5" height="13.5" class="st19"/>
+			<text x="4.41" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame 0</text>		</g>
+		<g id="shape24-80" v:mID="24" v:groupContext="shape" transform="translate(0.375,-0.375)">
+			<title>Sheet.24</title>
+			<desc>Reference frame of R1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="36" cy="117.495" width="72.01" height="13.5"/>
+			<rect x="0" y="110.745" width="72" height="13.5" class="st19"/>
+			<text x="5.65" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame of R1</text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/inter_obmc.svg b/doc/img/inter_obmc.svg
new file mode 100644
index 0000000..a69084b
--- /dev/null
+++ b/doc/img/inter_obmc.svg
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">
+<!-- 由 Microsoft Visio 11.0, SVG Export, v1.0 生成 inter_obmc.svg 页-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.98609in"
+		height="2.98609in" viewBox="0 0 214.998 214.998" xml:space="preserve" color-interpolation-filters="sRGB" class="st4">
+	<v:documentProperties v:langID="2052" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.2}
+		.st2 {fill:#000000;font-family:Times New Roman;font-size:1.16666em}
+		.st3 {fill:#8c8c8c;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.2}
+		.st4 {fill:none;fill-rule:evenodd;font-size:12;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>页-1</title>
+		<v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+				v:shadowOffsetY="-4.25197"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(1.2,-1.2)">
+			<title>工作表.1</title>
+			<desc>4</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="35.4331" cy="144.132" width="70.87" height="141.732"/>
+			<rect x="0" y="73.2661" width="70.8661" height="141.732" class="st1"/>
+			<text x="31.93" y="148.33" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape2-4" v:mID="2" v:groupContext="shape" transform="translate(72.0661,-1.2)">
+			<title>工作表.2</title>
+			<desc>0</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="70.8661" cy="144.132" width="141.74" height="141.732"/>
+			<rect x="0" y="73.2661" width="141.732" height="141.732" class="st1"/>
+			<text x="67.37" y="148.33" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape3-7" v:mID="3" v:groupContext="shape" transform="translate(107.499,-142.932)">
+			<title>工作表.3</title>
+			<desc>2</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="35.4331" cy="179.565" width="70.87" height="70.8661"/>
+			<rect x="0" y="144.132" width="70.8661" height="70.8661" class="st1"/>
+			<text x="31.93" y="183.77" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape4-10" v:mID="4" v:groupContext="shape" transform="translate(178.365,-142.932)">
+			<title>工作表.4</title>
+			<desc>3</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="17.7165" cy="197.282" width="35.44" height="35.4331"/>
+			<rect x="0" y="179.565" width="35.4331" height="35.4331" class="st1"/>
+			<text x="14.22" y="201.48" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape5-13" v:mID="5" v:groupContext="shape" transform="translate(72.0661,-142.932)">
+			<title>工作表.5</title>
+			<desc>1</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="17.7165" cy="197.282" width="35.44" height="35.4331"/>
+			<rect x="0" y="179.565" width="35.4331" height="35.4331" class="st1"/>
+			<text x="14.22" y="201.48" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape6-16" v:mID="6" v:groupContext="shape" transform="translate(72.0661,-72.0661)">
+			<title>工作表.6</title>
+			<rect x="0" y="144.132" width="35.4331" height="70.8661" class="st3"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/inter_spatial_mvp.svg b/doc/img/inter_spatial_mvp.svg
new file mode 100644
index 0000000..aa2e88a
--- /dev/null
+++ b/doc/img/inter_spatial_mvp.svg
@@ -0,0 +1,215 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_spatial_mvp.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="3.50333in" height="3.01208in"
+		viewBox="0 0 252.24 216.87" xml:space="preserve" color-interpolation-filters="sRGB" class="st10">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st3 {marker-end:url(#mrkr5-45);marker-start:url(#mrkr10-43);stroke:#ea700d;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+		.st4 {fill:#ea700d;fill-opacity:1;stroke:#ea700d;stroke-opacity:1;stroke-width:0.3315649867374}
+		.st5 {marker-end:url(#mrkr5-54);marker-start:url(#mrkr10-52);stroke:#f59d56;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+		.st6 {fill:#f59d56;fill-opacity:1;stroke:#f59d56;stroke-opacity:1;stroke-width:0.3315649867374}
+		.st7 {marker-end:url(#mrkr5-54);stroke:#f59d56;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+		.st8 {marker-end:url(#mrkr5-70);marker-start:url(#mrkr10-68);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+		.st9 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.3315649867374}
+		.st10 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend10">
+			<path
+					d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+					style="stroke:none"/>
+		</g>
+		<marker id="mrkr10-43" class="st4" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(3.016) "/>
+		</marker>
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-45" class="st4" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+		</marker>
+		<marker id="mrkr10-52" class="st6" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(3.016) "/>
+		</marker>
+		<marker id="mrkr5-54" class="st6" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+		</marker>
+		<marker id="mrkr10-68" class="st9" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(3.016) "/>
+		</marker>
+		<marker id="mrkr5-70" class="st9" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape33-1" v:mID="33" v:groupContext="shape" transform="translate(72.12,-0.75)">
+			<title>Square.33</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="72.87" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape38-3" v:mID="38" v:groupContext="shape" transform="translate(72.12,-144.75)">
+			<title>Square.38</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape39-5" v:mID="39" v:groupContext="shape" transform="translate(108.12,-144.75)">
+			<title>Square.39</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape40-7" v:mID="40" v:groupContext="shape" transform="translate(144.12,-144.75)">
+			<title>Square.40</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape41-9" v:mID="41" v:groupContext="shape" transform="translate(180.12,-144.75)">
+			<title>Square.41</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape42-11" v:mID="42" v:groupContext="shape" transform="translate(36.12,-108.75)">
+			<title>Square.42</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape43-13" v:mID="43" v:groupContext="shape" transform="translate(36.12,-72.75)">
+			<title>Square.43</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape44-15" v:mID="44" v:groupContext="shape" transform="translate(36.12,-36.75)">
+			<title>Square.44</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape45-17" v:mID="45" v:groupContext="shape" transform="translate(36.12,-0.75)">
+			<title>Square.45</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape46-19" v:mID="46" v:groupContext="shape" transform="translate(0.12,-108.75)">
+			<title>Square.46</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape47-21" v:mID="47" v:groupContext="shape" transform="translate(0.12,-72.75)">
+			<title>Square.47</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape48-23" v:mID="48" v:groupContext="shape" transform="translate(0.120005,-36.75)">
+			<title>Square.48</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape49-25" v:mID="49" v:groupContext="shape" transform="translate(0.120005,-0.75)">
+			<title>Square.49</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape50-27" v:mID="50" v:groupContext="shape" transform="translate(72.12,-180.75)">
+			<title>Square.50</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape51-29" v:mID="51" v:groupContext="shape" transform="translate(108.12,-180.75)">
+			<title>Square.51</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape52-31" v:mID="52" v:groupContext="shape" transform="translate(144.12,-180.75)">
+			<title>Square.52</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape53-33" v:mID="53" v:groupContext="shape" transform="translate(180.12,-180.75)">
+			<title>Square.53</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape54-35" v:mID="54" v:groupContext="shape" transform="translate(36.12,-144.75)">
+			<title>Square.54</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape55-37" v:mID="55" v:groupContext="shape" transform="translate(90.12,-162.75)">
+			<title>Sheet.55</title>
+			<path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st3"/>
+		</g>
+		<g id="shape56-46" v:mID="56" v:groupContext="shape" transform="translate(270.99,90.12) rotate(90)">
+			<title>Sheet.56</title>
+			<path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st5"/>
+		</g>
+		<g id="shape58-55" v:mID="58" v:groupContext="shape" transform="translate(-81.3576,28.773) rotate(-38.6598)">
+			<title>Sheet.58</title>
+			<path d="M0 216.87 L223.91 216.87" class="st7"/>
+		</g>
+		<g id="shape59-60" v:mID="59" v:groupContext="shape" transform="translate(216.12,-144.75)">
+			<title>Square.59</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape60-62" v:mID="60" v:groupContext="shape" transform="translate(54.12,-162.75)">
+			<title>Sheet.60</title>
+			<path d="M1.74 215.13 L2 214.87 L36 180.87 L137.4 180.87" class="st8"/>
+		</g>
+		<g id="shape61-71" v:mID="61" v:groupContext="shape" transform="translate(234.99,90.12) rotate(90)">
+			<title>Sheet.61</title>
+			<path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st8"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/inter_tmvp_positions.svg b/doc/img/inter_tmvp_positions.svg
new file mode 100644
index 0000000..87f8dfa
--- /dev/null
+++ b/doc/img/inter_tmvp_positions.svg
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_tmvp_positions.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.00333in" height="1.51208in"
+		viewBox="0 0 144.24 108.87" xml:space="preserve" color-interpolation-filters="sRGB" class="st4">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+		.st3 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st4 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape31-1" v:mID="31" v:groupContext="shape" transform="translate(0.12,-0.12)">
+			<title>Square.31</title>
+			<desc>B4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B4</text>		</g>
+		<g id="shape30-4" v:mID="30" v:groupContext="shape" transform="translate(108.12,-36.12)">
+			<title>Square.30</title>
+			<desc>B6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B6</text>		</g>
+		<g id="shape32-7" v:mID="32" v:groupContext="shape" transform="translate(108.12,-0.12)">
+			<title>Square.32</title>
+			<desc>B5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B5</text>		</g>
+		<g id="shape25-10" v:mID="25" v:groupContext="shape" transform="translate(36.12,-36.12)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="36.87" width="72" height="72" class="st3"/>
+		</g>
+		<g id="shape26-12" v:mID="26" v:groupContext="shape" transform="translate(36.12,-72.12)">
+			<title>Square.26</title>
+			<desc>B0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B0</text>		</g>
+		<g id="shape27-15" v:mID="27" v:groupContext="shape" transform="translate(72.12,-72.12)">
+			<title>Square.27</title>
+			<desc>B1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B1</text>		</g>
+		<g id="shape28-18" v:mID="28" v:groupContext="shape" transform="translate(36.12,-36.12)">
+			<title>Square.28</title>
+			<desc>B2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B2</text>		</g>
+		<g id="shape29-21" v:mID="29" v:groupContext="shape" transform="translate(72.12,-36.12)">
+			<title>Square.29</title>
+			<desc>B3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B3</text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/inter_tx_partition.svg b/doc/img/inter_tx_partition.svg
new file mode 100644
index 0000000..6f853c6
--- /dev/null
+++ b/doc/img/inter_tx_partition.svg
@@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_tx_partition.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.52083in" height="2.02083in"
+		viewBox="0 0 325.5 145.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {stroke:#000000;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st3 {stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st4 {marker-end:url(#mrkr5-22);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.22935779816514}
+		.st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-22" class="st5" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape24-1" v:mID="24" v:groupContext="shape" transform="translate(0.75,-0.75)">
+			<title>Square.24</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape25-3" v:mID="25" v:groupContext="shape" transform="translate(180.75,-0.75)">
+			<title>Square.25</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape26-5" v:mID="26" v:groupContext="shape" transform="translate(180.75,-72.75)">
+			<title>Sheet.26</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape27-8" v:mID="27" v:groupContext="shape" transform="translate(398.25,0.75) rotate(90)">
+			<title>Sheet.27</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape28-11" v:mID="28" v:groupContext="shape" transform="translate(252.75,-108.75)">
+			<title>Sheet.28</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape29-14" v:mID="29" v:groupContext="shape" transform="translate(434.25,0.750007) rotate(90)">
+			<title>Sheet.29</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape30-17" v:mID="30" v:groupContext="shape" transform="translate(170.739,-101.283) rotate(-18.4349)">
+			<title>Sheet.30</title>
+			<path d="M0 145.5 L51.2 145.5" class="st4"/>
+		</g>
+		<g id="shape31-23" v:mID="31" v:groupContext="shape" transform="translate(270.75,-126.75)">
+			<title>Sheet.31</title>
+			<path d="M0 145.5 L30.28 145.5" class="st4"/>
+		</g>
+		<g id="shape32-28" v:mID="32" v:groupContext="shape" transform="translate(409.634,121.634) rotate(135)">
+			<title>Sheet.32</title>
+			<path d="M0 145.5 L45.06 145.5" class="st4"/>
+		</g>
+		<g id="shape33-33" v:mID="33" v:groupContext="shape" transform="translate(270.844,-90.8438)">
+			<title>Sheet.33</title>
+			<path d="M0 145.5 L30.18 145.5" class="st4"/>
+		</g>
+		<g id="shape34-38" v:mID="34" v:groupContext="shape" transform="translate(381.705,179.364) rotate(148.992)">
+			<title>Sheet.34</title>
+			<path d="M0 145.5 L99.28 145.5" class="st4"/>
+		</g>
+		<g id="shape35-43" v:mID="35" v:groupContext="shape" transform="translate(216.75,-36.75)">
+			<title>Sheet.35</title>
+			<path d="M0 145.5 L66.28 145.5" class="st4"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/intra_cfl.svg b/doc/img/intra_cfl.svg
new file mode 100644
index 0000000..1153a28
--- /dev/null
+++ b/doc/img/intra_cfl.svg
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export CfL_prediction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="6.52269in" height="1.90714in"
+		viewBox="0 0 469.634 137.314" xml:space="preserve" color-interpolation-filters="sRGB" class="st13">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ff00ff;fill-opacity:0;stroke:#000000;stroke-opacity:0;stroke-width:0.75}
+		.st2 {fill:#ffffff;stroke:#000000;stroke-width:0.75}
+		.st3 {fill:#000000;font-family:Calibri;font-size:0.75em}
+		.st4 {marker-end:url(#mrkr4-22);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+		.st5 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.28409090909091}
+		.st6 {fill:none;stroke:#000000;stroke-width:0.75}
+		.st7 {fill:#000000;font-family:Calibri;font-size:1.99999em}
+		.st8 {fill:#000000;font-family:Calibri;font-size:1.5em}
+		.st9 {fill:none;stroke:none;stroke-width:0.25}
+		.st10 {font-size:1em}
+		.st11 {fill:#000000;font-family:SimSun;font-size:0.75em}
+		.st12 {font-family:Calibri;font-size:1em}
+		.st13 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend4">
+			<path d="M 2 1 L 0 0 L 2 -1 L 2 1 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr4-22" class="st5" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+				v:shadowOffsetY="-4.25197"/>
+		<v:layer v:name="Flowchart" v:index="0"/>
+		<v:layer v:name="Connector" v:index="1"/>
+		<g id="group5-1" transform="translate(111.581,-86.9232)" v:mID="5" v:groupContext="group" v:layerMember="0">
+			<v:custProps>
+				<v:cp v:nameU="Cost" v:lbl="Cost" v:type="7" v:format="@" v:langID="1033"/>
+				<v:cp v:nameU="Duration" v:lbl="Duration" v:type="2" v:langID="1033"/>
+				<v:cp v:nameU="Resources" v:lbl="Resources" v:langID="1033"/>
+			</v:custProps>
+			<v:userDefs>
+				<v:ud v:nameU="ScaleFactor" v:val="VT0(1):26"/>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<title>Tagged process</title>
+			<g id="shape6-2" v:mID="6" v:groupContext="shape" transform="translate(0.566929,0)">
+				<title>Sheet.6</title>
+				<path d="M53.15 137.31 L70.87 137.31 L70.87 128.46 L70.87 116.05 L0 116.05 L0 137.31 L53.15 137.31 Z" class="st1"/>
+			</g>
+			<g id="shape7-4" v:mID="7" v:groupContext="shape" v:layerMember="0" transform="translate(54.9213,0)">
+				<title>Sheet.7</title>
+			</g>
+			<g id="shape8-6" v:mID="8" v:groupContext="shape" v:layerMember="0">
+				<title>Sheet.8</title>
+				<desc>Sub-Sample</desc>
+				<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+				<v:textRect cx="35.4331" cy="126.684" width="70.87" height="21.2598"/>
+				<path d="M0 137.31 L70.87 137.31 L70.87 121.37 L70.87 116.05 L0 116.05 L0 137.31 Z" class="st2"/>
+				<text x="13.81" y="129.38" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Sub-Sample</text>			</g>
+		</g>
+		<g id="group9-9" transform="translate(224.967,-86.9232)" v:mID="9" v:groupContext="group" v:layerMember="0">
+			<v:custProps>
+				<v:cp v:nameU="Cost" v:lbl="Cost" v:type="7" v:format="@" v:langID="1033"/>
+				<v:cp v:nameU="Duration" v:lbl="Duration" v:type="2" v:langID="1033"/>
+				<v:cp v:nameU="Resources" v:lbl="Resources" v:langID="1033"/>
+			</v:custProps>
+			<v:userDefs>
+				<v:ud v:nameU="ScaleFactor" v:val="VT0(1):26"/>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<title>Tagged process.9</title>
+			<g id="shape10-10" v:mID="10" v:groupContext="shape" transform="translate(0.566929,0)">
+				<title>Sheet.10</title>
+				<path d="M53.15 137.31 L70.87 137.31 L70.87 128.46 L70.87 116.05 L0 116.05 L0 137.31 L53.15 137.31 Z" class="st1"/>
+			</g>
+			<g id="shape11-12" v:mID="11" v:groupContext="shape" v:layerMember="0" transform="translate(54.9213,0)">
+				<title>Sheet.11</title>
+			</g>
+			<g id="shape12-14" v:mID="12" v:groupContext="shape" v:layerMember="0">
+				<title>Sheet.12</title>
+				<desc>Average</desc>
+				<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+				<v:textRect cx="35.4331" cy="126.684" width="70.87" height="21.2598"/>
+				<path d="M0 137.31 L70.87 137.31 L70.87 121.37 L70.87 116.05 L0 116.05 L0 137.31 Z" class="st2"/>
+				<text x="20.48" y="129.38" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Average</text>			</g>
+		</g>
+		<g id="shape27-17" v:mID="27" v:groupContext="shape" transform="translate(182.447,-97.5531)">
+			<title>Sheet.27</title>
+			<path d="M0 137.31 L35.48 137.31" class="st4"/>
+		</g>
+		<g id="shape28-23" v:mID="28" v:groupContext="shape" transform="translate(295.833,-97.5531)">
+			<title>Sheet.28</title>
+			<path d="M0 137.31 L35.48 137.31" class="st4"/>
+		</g>
+		<g id="shape29-28" v:mID="29" v:groupContext="shape" transform="translate(341.47,-86.9232)">
+			<title>Sheet.29</title>
+			<desc>-</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+			<ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+			<text x="7.52" y="133.32" class="st7" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>- </text>		</g>
+		<g id="shape34-31" v:mID="34" v:groupContext="shape" v:layerMember="1" transform="translate(147.014,-101.663)">
+			<title>Dynamic connector</title>
+			<path d="M0 130.79 L0 109.53 L205.65 109.53 L205.65 122.62" class="st4"/>
+		</g>
+		<g id="shape35-36" v:mID="35" v:groupContext="shape" transform="translate(34.2657,-97.5531)">
+			<title>Sheet.35</title>
+			<path d="M0 137.31 L70.27 137.31" class="st4"/>
+		</g>
+		<g id="shape36-41" v:mID="36" v:groupContext="shape" transform="translate(341.329,-43.2697)">
+			<title>Sheet.36</title>
+			<desc>×</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+			<ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+			<text x="6.71" y="131.52" class="st8" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>× </text>		</g>
+		<g id="shape37-44" v:mID="37" v:groupContext="shape" transform="translate(34.2657,-53.5676)">
+			<title>Sheet.37</title>
+			<path d="M0 137.31 L300.06 137.31" class="st4"/>
+		</g>
+		<g id="shape38-49" v:mID="38" v:groupContext="shape" transform="translate(489.499,50.3067) rotate(89.9693)">
+			<title>Sheet.38</title>
+			<path d="M0 137.31 L14.24 137.31" class="st4"/>
+		</g>
+		<g id="shape39-54" v:mID="39" v:groupContext="shape" transform="translate(341.329,-0.75)">
+			<title>Sheet.39</title>
+			<desc>+</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+			<ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+			<text x="6.71" y="131.52" class="st8" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+  </text>		</g>
+		<g id="shape40-57" v:mID="40" v:groupContext="shape" transform="translate(34.2657,-11.9539)">
+			<title>Sheet.40</title>
+			<path d="M0 137.31 L300.02 137.31" class="st4"/>
+		</g>
+		<g id="shape41-62" v:mID="41" v:groupContext="shape" v:layerMember="1" transform="translate(345.51,-86.9234)">
+			<title>Dynamic connector.41</title>
+			<path d="M7.09 137.31 L7.09 151.53" class="st4"/>
+		</g>
+		<g id="shape74-67" v:mID="74" v:groupContext="shape" v:layerMember="1" transform="translate(345.439,-43.2697)">
+			<title>Dynamic connector.74</title>
+			<path d="M7.09 137.31 L7.09 150.4" class="st4"/>
+		</g>
+		<g id="shape75-72" v:mID="75" v:groupContext="shape" transform="translate(363.722,-11.9551)">
+			<title>Sheet.75</title>
+			<path d="M0 137.31 L35.48 137.31" class="st4"/>
+		</g>
+		<g id="shape78-77" v:mID="78" v:groupContext="shape" transform="translate(3.08465,-17.2788)">
+			<title>Sheet.78</title>
+			<desc>Chroma DC Prediction</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="70.3916" cy="131.314" width="140.79" height="12"/>
+			<rect x="0" y="125.314" width="140.783" height="12" class="st9"/>
+			<text x="30.02" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Chroma DC Prediction</text>		</g>
+		<g id="shape82-80" v:mID="82" v:groupContext="shape" transform="translate(0.25,-60.75)">
+			<title>Sheet.82</title>
+			<desc>Scaling parameter α</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="70.3916" cy="131.314" width="140.79" height="12"/>
+			<rect x="0" y="125.314" width="140.783" height="12" class="st9"/>
+			<text x="33.74" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Scaling parameter α </text>		</g>
+		<g id="shape83-83" v:mID="83" v:groupContext="shape" transform="translate(30.0138,-102.514)">
+			<title>Sheet.83</title>
+			<desc>Luma reconstructed samples</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="43.937" cy="131.314" width="87.88" height="12"/>
+			<rect x="0" y="125.314" width="87.874" height="12" class="st9"/>
+			<text x="7.25" y="128.61" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Luma reconstructed <tspan
+						x="29.03" dy="1.2em" class="st10">samples</tspan></text>		</g>
+		<g id="shape84-87" v:mID="84" v:groupContext="shape" transform="translate(398.518,-5.47437)">
+			<title>Sheet.84</title>
+			<desc>CfL Prediction</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="35.4331" cy="131.314" width="70.87" height="12"/>
+			<rect x="0" y="125.314" width="70.8661" height="12" class="st9"/>
+			<text x="10.04" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>CfL Prediction</text>		</g>
+		<g id="shape85-90" v:mID="85" v:groupContext="shape" transform="translate(354.581,-72.75)">
+			<title>Sheet.85</title>
+			<desc>“AC” contribution</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="40.3937" cy="131.314" width="80.79" height="12"/>
+			<rect x="0" y="125.314" width="80.7874" height="12" class="st9"/>
+			<text x="2.62" y="134.31" class="st11" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>“<tspan class="st12">AC</tspan>”<tspan
+						class="st12"> </tspan><tspan class="st12">contribution</tspan></text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/intra_directional.svg b/doc/img/intra_directional.svg
new file mode 100644
index 0000000..3a08007
--- /dev/null
+++ b/doc/img/intra_directional.svg
@@ -0,0 +1,192 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_directional.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.24969in" height="4.20313in"
+		viewBox="0 0 305.978 302.625" xml:space="preserve" color-interpolation-filters="sRGB" class="st13">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {marker-start:url(#mrkr5-8);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st3 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.37313432835821}
+		.st4 {fill:#ffffff;stroke:none;stroke-linecap:butt;stroke-width:7.2}
+		.st5 {fill:#2f4f4f;font-family:Consolas;font-size:0.791656em}
+		.st6 {font-size:1em}
+		.st7 {fill:#ffffff;stroke:none;stroke-linecap:butt}
+		.st8 {marker-end:url(#mrkr5-49);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st9 {marker-end:url(#mrkr5-65);stroke:#000000;stroke-dasharray:2.25,2.25;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st10 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.22935779816514}
+		.st11 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st12 {fill:#000000;font-family:Calibri;font-size:0.666664em}
+		.st13 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-8" class="st3" v:arrowType="5" v:arrowSize="2" v:setback="4.45" refX="4.45" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(2.68) "/>
+		</marker>
+		<marker id="mrkr5-49" class="st3" v:arrowType="5" v:arrowSize="2" v:setback="4.69" refX="-4.69" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-2.68,-2.68) "/>
+		</marker>
+		<marker id="mrkr5-65" class="st10" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(8.98899,-0.75)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="14.625" width="288" height="288" class="st1"/>
+		</g>
+		<g id="shape5-3" v:mID="5" v:groupContext="shape" transform="translate(222.977,-200.113) rotate(45)">
+			<title>Sheet.5</title>
+			<desc>D135_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="101.823" cy="302.625" width="203.65" height="0"/>
+			<path d="M6.68 302.62 L7.03 302.62 L203.65 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="78.3191" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+			<text x="78.32" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">13</tspan>5_PRED</text>		</g>
+		<g id="shape6-12" v:mID="6" v:groupContext="shape" transform="translate(8.98899,-144.75)">
+			<title>Sheet.6</title>
+			<desc>H_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="302.625" width="144" height="0"/>
+			<path d="M6.67 302.62 L7.03 302.62 L144 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="56.3305" y="295.425" width="31.3391" height="14.4001" class="st4"/>
+			<text x="56.33" y="306.23" class="st5" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>H<tspan class="st6"
+						v:langID="1033">_PRED</tspan></text>		</g>
+		<g id="shape8-20" v:mID="8" v:groupContext="shape" transform="translate(367.241,-107.423) rotate(66.3706)">
+			<title>Sheet.8</title>
+			<desc>D113_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+			<path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st7"/>
+			<text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">113</tspan>_PRED</text>		</g>
+		<g id="shape9-28" v:mID="9" v:groupContext="shape" transform="translate(130.287,-182.377) rotate(23.6294)">
+			<title>Sheet.9</title>
+			<desc>D157_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+			<path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+			<text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">157</tspan>_PRED</text>		</g>
+		<g id="shape10-36" v:mID="10" v:groupContext="shape" transform="translate(-112.309,-56.3771) rotate(-23.6294)">
+			<title>Sheet.10</title>
+			<desc>D203_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+			<path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+			<text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">203</tspan>_PRED</text>		</g>
+		<g id="shape11-44" v:mID="11" v:groupContext="shape" transform="translate(-60.9992,-56.1132) rotate(-45)">
+			<title>Sheet.11</title>
+			<desc>D45_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="101.823" cy="302.625" width="203.65" height="0"/>
+			<path d="M0 302.62 L196.61 302.62" class="st8"/>
+			<rect v:rectContext="textBkgnd" x="80.9308" y="295.425" width="41.7854" height="14.4001" class="st7"/>
+			<text x="80.93" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D45_PRED</text>		</g>
+		<g id="shape12-52" v:mID="12" v:groupContext="shape" transform="translate(-149.636,157.875) rotate(-90)">
+			<title>Sheet.12</title>
+			<desc>V_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="302.625" width="144" height="0"/>
+			<path d="M0 302.62 L136.96 302.62" class="st8"/>
+			<rect v:rectContext="textBkgnd" x="56.3305" y="295.425" width="31.3391" height="14.4001" class="st7"/>
+			<text x="56.33" y="306.23" class="st5" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>V<tspan class="st6"
+						v:langID="1033">_PRED</tspan></text>		</g>
+		<g id="shape13-60" v:mID="13" v:groupContext="shape" transform="translate(-117.687,22.537) rotate(-63.4349)">
+			<title>Sheet.13</title>
+			<path d="M0 302.62 L155.27 302.62" class="st9"/>
+		</g>
+		<g id="shape14-66" v:mID="14" v:groupContext="shape" transform="translate(-110.772,9.50969) rotate(-60.6422)">
+			<title>Sheet.14</title>
+			<path d="M0 302.62 L159.5 302.62" class="st9"/>
+		</g>
+		<g id="shape15-71" v:mID="15" v:groupContext="shape" transform="translate(-103.636,-2.51593) rotate(-57.9946)">
+			<title>Sheet.15</title>
+			<path d="M0 302.62 L164.09 302.62" class="st9"/>
+		</g>
+		<g id="shape16-76" v:mID="16" v:groupContext="shape" transform="translate(-130.368,51.6163) rotate(-69.444)">
+			<title>Sheet.16</title>
+			<path d="M0 302.62 L148.07 302.62" class="st9"/>
+		</g>
+		<g id="shape17-81" v:mID="17" v:groupContext="shape" transform="translate(-135.861,67.6095) rotate(-72.646)">
+			<title>Sheet.17</title>
+			<path d="M0 302.62 L145.14 302.62" class="st9"/>
+		</g>
+		<g id="shape18-86" v:mID="18" v:groupContext="shape" transform="translate(-140.6,84.4777) rotate(-75.9638)">
+			<title>Sheet.18</title>
+			<path d="M0 302.62 L142.71 302.62" class="st9"/>
+		</g>
+		<g id="shape30-91" v:mID="30" v:groupContext="shape" transform="translate(-124.263,36.5772) rotate(-66.3706)">
+			<title>Sheet.30</title>
+			<desc>D67_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+			<path d="M0 302.62 L150.14 302.62" class="st8"/>
+			<rect v:rectContext="textBkgnd" x="57.6964" y="295.425" width="41.7854" height="14.4001" class="st4"/>
+			<text x="57.7" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">67</tspan>_PRED</text>		</g>
+		<g id="shape31-99" v:mID="31" v:groupContext="shape" transform="translate(214.864,-288.75)">
+			<title>Sheet.31</title>
+			<desc>+1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+1</text>		</g>
+		<g id="shape32-102" v:mID="32" v:groupContext="shape" transform="translate(224.989,-288.75)">
+			<title>Sheet.32</title>
+			<desc>+2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+2</text>		</g>
+		<g id="shape33-105" v:mID="33" v:groupContext="shape" transform="translate(238.489,-288.75)">
+			<title>Sheet.33</title>
+			<desc>+3</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+3</text>		</g>
+		<g id="shape34-108" v:mID="34" v:groupContext="shape" transform="translate(197.989,-288.75)">
+			<title>Sheet.34</title>
+			<desc>-1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-1</text>		</g>
+		<g id="shape35-111" v:mID="35" v:groupContext="shape" transform="translate(188.989,-288.75)">
+			<title>Sheet.35</title>
+			<desc>-2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-2</text>		</g>
+		<g id="shape36-114" v:mID="36" v:groupContext="shape" transform="translate(177.739,-288.75)">
+			<title>Sheet.36</title>
+			<desc>-3</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-3</text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/intra_paeth.svg b/doc/img/intra_paeth.svg
new file mode 100644
index 0000000..f7a831f
--- /dev/null
+++ b/doc/img/intra_paeth.svg
@@ -0,0 +1,181 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_paeth.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.52083in" height="2.52083in"
+		viewBox="0 0 181.5 181.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-dasharray:2.25,2.25;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:#000000;font-family:Calibri;font-size:1.00001em}
+		.st3 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st4 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+		.st5 {font-size:1em}
+		.st6 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:2.25}
+		.st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape211-1" v:mID="211" v:groupContext="shape" transform="translate(0.375,-73.125)">
+			<title>Square.211</title>
+			<desc>L</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="163.5" width="36" height="36"/>
+			<path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+			<text x="15.48" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>L</text>		</g>
+		<g id="shape212-4" v:mID="212" v:groupContext="shape" transform="translate(108.375,-145.125)">
+			<title>Square.212</title>
+			<desc>T</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="163.5" width="36" height="36"/>
+			<path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+			<text x="15.08" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>T</text>		</g>
+		<g id="shape213-7" v:mID="213" v:groupContext="shape" transform="translate(0.375007,-145.125)">
+			<title>Square.213</title>
+			<desc>TL</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="163.5" width="36" height="36"/>
+			<path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+			<text x="12.55" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>TL</text>		</g>
+		<g id="group214-10" transform="translate(36.375,-1.12501)" v:mID="214" v:groupContext="group">
+			<title>Sheet.214</title>
+			<g id="shape183-11" v:mID="183" v:groupContext="shape" transform="translate(6.86646E-06,-108)">
+				<title>Square.183</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape184-13" v:mID="184" v:groupContext="shape" transform="translate(36,-108)">
+				<title>Square.184</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape185-15" v:mID="185" v:groupContext="shape" transform="translate(72,-108)">
+				<title>Square.185</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape186-17" v:mID="186" v:groupContext="shape" transform="translate(108,-108)">
+				<title>Square.186</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape199-19" v:mID="199" v:groupContext="shape" transform="translate(1.37329E-05,-72)">
+				<title>Square.199</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape200-21" v:mID="200" v:groupContext="shape" transform="translate(36,-72)">
+				<title>Square.200</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape201-23" v:mID="201" v:groupContext="shape" transform="translate(72,-72)">
+				<title>Square.201</title>
+				<desc>Current Pixel</desc>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<v:textBlock v:margins="rect(0,0,0,0)"/>
+				<v:textRect cx="18" cy="163.5" width="36" height="36"/>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+				<text x="2.43" y="160.5" class="st4" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Current <tspan
+							x="8.47" dy="1.2em" class="st5">Pixel</tspan></text>			</g>
+			<g id="shape202-27" v:mID="202" v:groupContext="shape" transform="translate(108,-72)">
+				<title>Square.202</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape203-29" v:mID="203" v:groupContext="shape" transform="translate(0,-36)">
+				<title>Square.203</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape204-31" v:mID="204" v:groupContext="shape" transform="translate(36,-36)">
+				<title>Square.204</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape205-33" v:mID="205" v:groupContext="shape" transform="translate(72,-36)">
+				<title>Square.205</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape206-35" v:mID="206" v:groupContext="shape" transform="translate(108,-36)">
+				<title>Square.206</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape207-37" v:mID="207" v:groupContext="shape" transform="translate(6.86646E-06,0)">
+				<title>Square.207</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape208-39" v:mID="208" v:groupContext="shape" transform="translate(36,0)">
+				<title>Square.208</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape209-41" v:mID="209" v:groupContext="shape" transform="translate(72,0)">
+				<title>Square.209</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape210-43" v:mID="210" v:groupContext="shape" transform="translate(108,0)">
+				<title>Square.210</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+		</g>
+		<g id="shape215-45" v:mID="215" v:groupContext="shape" transform="translate(36.375,-1.125)">
+			<title>Square.215</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="37.5" width="144" height="144" class="st6"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/intra_recursive.svg b/doc/img/intra_recursive.svg
new file mode 100644
index 0000000..adc4193
--- /dev/null
+++ b/doc/img/intra_recursive.svg
@@ -0,0 +1,710 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_recursive.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.52015in" height="4.46693in"
+		viewBox="0 0 325.45 321.619" xml:space="preserve" color-interpolation-filters="sRGB" class="st9">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st3 {marker-end:url(#mrkr10-184);marker-start:url(#mrkr10-182);stroke:#0070c0;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st4 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.34246575342466}
+		.st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st6 {marker-end:url(#mrkr10-235);marker-start:url(#mrkr10-233);stroke:#bf9000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st7 {fill:#bf9000;fill-opacity:1;stroke:#bf9000;stroke-opacity:1;stroke-width:0.34246575342466}
+		.st8 {fill:#bf9000;fill-opacity:1;stroke:#bf9000;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st9 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend10">
+			<path
+					d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+					style="stroke:none"/>
+		</g>
+		<marker id="mrkr10-182" class="st4" v:arrowType="10" v:arrowSize="0" v:setback="1.71" refX="1.71" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(2.92) "/>
+		</marker>
+		<marker id="mrkr10-184" class="st5" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+		<marker id="mrkr10-233" class="st7" v:arrowType="10" v:arrowSize="0" v:setback="1.71" refX="1.71" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(2.92) "/>
+		</marker>
+		<marker id="mrkr10-235" class="st8" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="group149-1" transform="translate(0.12,-214.583)" v:mID="149" v:groupContext="group">
+			<title>Sheet.149</title>
+			<g id="shape142-2" v:mID="142" v:groupContext="shape" transform="translate(0,-71.2776)">
+				<title>Square.142</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape143-4" v:mID="143" v:groupContext="shape" transform="translate(36.0645,-71.2776)">
+				<title>Square.143</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape144-6" v:mID="144" v:groupContext="shape" transform="translate(72.129,-71.2776)">
+				<title>Square.144</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape145-8" v:mID="145" v:groupContext="shape" transform="translate(108.193,-71.2776)">
+				<title>Square.145</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape146-10" v:mID="146" v:groupContext="shape" transform="translate(144.258,-71.2776)">
+				<title>Square.146</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape147-12" v:mID="147" v:groupContext="shape" transform="translate(0,-35.6388)">
+				<title>Square.147</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape148-14" v:mID="148" v:groupContext="shape">
+				<title>Square.148</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+		</g>
+		<g id="group64-16" transform="translate(36.1845,-214.583)" v:mID="64" v:groupContext="group">
+			<title>Sheet.64</title>
+			<g id="shape38-17" v:mID="38" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group63-19" v:mID="63" v:groupContext="group">
+				<title>Sheet.63</title>
+				<g id="shape46-20" v:mID="46" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape47-22" v:mID="47" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape48-24" v:mID="48" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape49-26" v:mID="49" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape50-28" v:mID="50" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape51-30" v:mID="51" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape52-32" v:mID="52" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape53-34" v:mID="53" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group65-36" transform="translate(180.442,-214.583)" v:mID="65" v:groupContext="group">
+			<title>Sheet.65</title>
+			<g id="shape66-37" v:mID="66" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group67-39" v:mID="67" v:groupContext="group">
+				<title>Sheet.67</title>
+				<g id="shape68-40" v:mID="68" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape69-42" v:mID="69" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape70-44" v:mID="70" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape71-46" v:mID="71" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape72-48" v:mID="72" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape73-50" v:mID="73" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape74-52" v:mID="74" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape75-54" v:mID="75" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group76-56" transform="translate(36.1845,-143.305)" v:mID="76" v:groupContext="group">
+			<title>Sheet.76</title>
+			<g id="shape77-57" v:mID="77" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group78-59" v:mID="78" v:groupContext="group">
+				<title>Sheet.78</title>
+				<g id="shape79-60" v:mID="79" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape80-62" v:mID="80" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape81-64" v:mID="81" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape82-66" v:mID="82" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape83-68" v:mID="83" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape84-70" v:mID="84" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape85-72" v:mID="85" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape86-74" v:mID="86" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group87-76" transform="translate(180.442,-143.305)" v:mID="87" v:groupContext="group">
+			<title>Sheet.87</title>
+			<g id="shape88-77" v:mID="88" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group89-79" v:mID="89" v:groupContext="group">
+				<title>Sheet.89</title>
+				<g id="shape90-80" v:mID="90" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape91-82" v:mID="91" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape92-84" v:mID="92" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape93-86" v:mID="93" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape94-88" v:mID="94" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape95-90" v:mID="95" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape96-92" v:mID="96" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape97-94" v:mID="97" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group98-96" transform="translate(36.1845,-72.0276)" v:mID="98" v:groupContext="group">
+			<title>Sheet.98</title>
+			<g id="shape99-97" v:mID="99" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group100-99" v:mID="100" v:groupContext="group">
+				<title>Sheet.100</title>
+				<g id="shape101-100" v:mID="101" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape102-102" v:mID="102" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape103-104" v:mID="103" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape104-106" v:mID="104" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape105-108" v:mID="105" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape106-110" v:mID="106" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape107-112" v:mID="107" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape108-114" v:mID="108" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group109-116" transform="translate(180.442,-72.0276)" v:mID="109" v:groupContext="group">
+			<title>Sheet.109</title>
+			<g id="shape110-117" v:mID="110" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group111-119" v:mID="111" v:groupContext="group">
+				<title>Sheet.111</title>
+				<g id="shape112-120" v:mID="112" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape113-122" v:mID="113" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape114-124" v:mID="114" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape115-126" v:mID="115" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape116-128" v:mID="116" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape117-130" v:mID="117" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape118-132" v:mID="118" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape119-134" v:mID="119" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group120-136" transform="translate(36.1845,-0.75)" v:mID="120" v:groupContext="group">
+			<title>Sheet.120</title>
+			<g id="shape121-137" v:mID="121" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group122-139" v:mID="122" v:groupContext="group">
+				<title>Sheet.122</title>
+				<g id="shape123-140" v:mID="123" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape124-142" v:mID="124" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape125-144" v:mID="125" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape126-146" v:mID="126" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape127-148" v:mID="127" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape128-150" v:mID="128" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape129-152" v:mID="129" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape130-154" v:mID="130" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group131-156" transform="translate(180.442,-0.75)" v:mID="131" v:groupContext="group">
+			<title>Sheet.131</title>
+			<g id="shape132-157" v:mID="132" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group133-159" v:mID="133" v:groupContext="group">
+				<title>Sheet.133</title>
+				<g id="shape134-160" v:mID="134" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape135-162" v:mID="135" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape136-164" v:mID="136" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape137-166" v:mID="137" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape138-168" v:mID="138" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape139-170" v:mID="139" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape140-172" v:mID="140" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape141-174" v:mID="141" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="shape150-176" v:mID="150" v:groupContext="shape" transform="translate(244.217,-210.826) rotate(44.6598)">
+			<title>Sheet.150</title>
+			<path d="M1.28 321.62 L1.64 321.62 L49.49 321.62" class="st3"/>
+		</g>
+		<g id="shape151-185" v:mID="151" v:groupContext="shape" transform="translate(-266.901,54.0731) rotate(-90)">
+			<title>Sheet.151</title>
+			<path d="M1.28 321.62 L1.64 321.62 L34.22 321.62" class="st3"/>
+		</g>
+		<g id="shape152-192" v:mID="152" v:groupContext="shape" transform="translate(319.501,243.543) rotate(134.544)">
+			<title>Sheet.152</title>
+			<path d="M1.28 321.62 L1.64 321.62 L48.79 321.62" class="st3"/>
+		</g>
+		<g id="shape153-199" v:mID="153" v:groupContext="shape" transform="translate(271.203,305.09) rotate(153.231)">
+			<title>Sheet.153</title>
+			<path d="M1.28 321.62 L1.64 321.62 L78.31 321.62" class="st3"/>
+		</g>
+		<g id="shape154-206" v:mID="154" v:groupContext="shape" transform="translate(264.717,322.853) rotate(161.452)">
+			<title>Sheet.154</title>
+			<path d="M1.28 321.62 L1.64 321.62 L111.68 321.62" class="st3"/>
+		</g>
+		<g id="shape155-213" v:mID="155" v:groupContext="shape" transform="translate(18.1522,-267.546)">
+			<title>Sheet.155</title>
+			<path d="M1.28 321.62 L1.64 321.62 L34.65 321.62" class="st3"/>
+		</g>
+		<g id="shape156-220" v:mID="156" v:groupContext="shape" transform="translate(-204.714,-142.665) rotate(-43.8643)">
+			<title>Sheet.156</title>
+			<path d="M1.28 321.62 L1.64 321.62 L48.8 321.62" class="st3"/>
+		</g>
+		<g id="shape157-227" v:mID="157" v:groupContext="shape" transform="translate(388.475,-68.2707) rotate(44.6598)">
+			<title>Sheet.157</title>
+			<path d="M1.28 321.62 L1.64 321.62 L99.49 321.62" class="st6"/>
+		</g>
+		<g id="shape158-236" v:mID="158" v:groupContext="shape" transform="translate(-53.2468,375.362) rotate(-116.517)">
+			<title>Sheet.158</title>
+			<path d="M1.28 321.62 L1.64 321.62 L77.74 321.62" class="st6"/>
+		</g>
+		<g id="shape159-243" v:mID="159" v:groupContext="shape" transform="translate(556.158,160.495) rotate(90)">
+			<title>Sheet.159</title>
+			<path d="M1.28 321.62 L1.64 321.62 L69.37 321.62" class="st6"/>
+		</g>
+		<g id="shape160-250" v:mID="160" v:groupContext="shape" transform="translate(557.58,305.696) rotate(116.838)">
+			<title>Sheet.160</title>
+			<path d="M1.28 321.62 L1.64 321.62 L77.97 321.62" class="st6"/>
+		</g>
+		<g id="shape161-257" v:mID="161" v:groupContext="shape" transform="translate(532.733,389.26) rotate(135.34)">
+			<title>Sheet.161</title>
+			<path d="M1.28 321.62 L1.64 321.62 L99.49 321.62" class="st6"/>
+		</g>
+		<g id="shape162-264" v:mID="162" v:groupContext="shape" transform="translate(303.283,-92.4976) rotate(25.977)">
+			<title>Sheet.162</title>
+			<path d="M1.28 321.62 L1.64 321.62 L78.32 321.62" class="st6"/>
+		</g>
+		<g id="shape163-271" v:mID="163" v:groupContext="shape" transform="translate(162.41,-89.8469)">
+			<title>Sheet.163</title>
+			<path d="M1.28 321.62 L1.64 321.62 L70.22 321.62" class="st6"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/intra_tx_partition.svg b/doc/img/intra_tx_partition.svg
new file mode 100644
index 0000000..69575d4
--- /dev/null
+++ b/doc/img/intra_tx_partition.svg
@@ -0,0 +1,142 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_tx_partition.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="7.02083in" height="2.02083in"
+		viewBox="0 0 505.5 145.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {stroke:#000000;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st3 {stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st4 {marker-end:url(#mrkr5-36);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.22935779816514}
+		.st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-36" class="st5" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.75,-0.75)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape4-3" v:mID="4" v:groupContext="shape" transform="translate(180.75,-0.75)">
+			<title>Square.4</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape5-5" v:mID="5" v:groupContext="shape" transform="translate(398.25,0.75) rotate(90)">
+			<title>Sheet.5</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape6-8" v:mID="6" v:groupContext="shape" transform="translate(180.75,-72.75)">
+			<title>Sheet.6</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape7-11" v:mID="7" v:groupContext="shape" transform="translate(360.75,-0.75)">
+			<title>Square.7</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape8-13" v:mID="8" v:groupContext="shape" transform="translate(578.25,0.75) rotate(90)">
+			<title>Sheet.8</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape9-16" v:mID="9" v:groupContext="shape" transform="translate(432,-108.5)">
+			<title>Sheet.9</title>
+			<path d="M0 145.5 L72.75 145.5" class="st3"/>
+		</g>
+		<g id="shape10-19" v:mID="10" v:groupContext="shape" transform="translate(360.75,-72.75)">
+			<title>Sheet.10</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape11-22" v:mID="11" v:groupContext="shape" transform="translate(360.75,-36.75)">
+			<title>Sheet.11</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape12-25" v:mID="12" v:groupContext="shape" transform="translate(542.25,0.750007) rotate(90)">
+			<title>Sheet.12</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape13-28" v:mID="13" v:groupContext="shape" transform="translate(614.25,0.75) rotate(90)">
+			<title>Sheet.13</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(216.75,-108.75)">
+			<title>Sheet.14</title>
+			<path d="M0 145.5 L66.28 145.5" class="st4"/>
+		</g>
+		<g id="shape15-37" v:mID="15" v:groupContext="shape" transform="translate(391.634,139.634) rotate(135)">
+			<title>Sheet.15</title>
+			<path d="M0 145.5 L96.1 145.5" class="st4"/>
+		</g>
+		<g id="shape16-42" v:mID="16" v:groupContext="shape" transform="translate(216.75,-36.75)">
+			<title>Sheet.16</title>
+			<path d="M0 145.5 L66.28 145.5" class="st4"/>
+		</g>
+		<g id="shape17-47" v:mID="17" v:groupContext="shape" transform="translate(378.75,-126.75)">
+			<title>Sheet.17</title>
+			<path d="M0 145.5 L102.28 145.5" class="st4"/>
+		</g>
+		<g id="shape18-52" v:mID="18" v:groupContext="shape" transform="translate(378.75,-90.75)">
+			<title>Sheet.18</title>
+			<path d="M0 145.5 L102.28 145.5" class="st4"/>
+		</g>
+		<g id="shape19-57" v:mID="19" v:groupContext="shape" transform="translate(378.75,-54.75)">
+			<title>Sheet.19</title>
+			<path d="M0 145.5 L102.28 145.5" class="st4"/>
+		</g>
+		<g id="shape20-62" v:mID="20" v:groupContext="shape" transform="translate(378.75,-18.75)">
+			<title>Sheet.20</title>
+			<path d="M0 145.5 L102.28 145.5" class="st4"/>
+		</g>
+		<g id="shape21-67" v:mID="21" v:groupContext="shape" transform="translate(532.761,156.783) rotate(161.565)">
+			<title>Sheet.21</title>
+			<path d="M0 145.5 L108.12 145.5" class="st4"/>
+		</g>
+		<g id="shape22-72" v:mID="22" v:groupContext="shape" transform="translate(532.761,192.783) rotate(161.565)">
+			<title>Sheet.22</title>
+			<path d="M0 145.5 L108.12 145.5" class="st4"/>
+		</g>
+		<g id="shape23-77" v:mID="23" v:groupContext="shape" transform="translate(532.761,228.783) rotate(161.565)">
+			<title>Sheet.23</title>
+			<path d="M0 145.5 L108.12 145.5" class="st4"/>
+		</g>
+		<g id="shape36-82" v:mID="36" v:groupContext="shape" transform="translate(360.75,-108.5)">
+			<title>Sheet.36</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape37-85" v:mID="37" v:groupContext="shape" transform="translate(432.75,-36.75)">
+			<title>Sheet.37</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape38-88" v:mID="38" v:groupContext="shape" transform="translate(542.25,72.75) rotate(90)">
+			<title>Sheet.38</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape39-91" v:mID="39" v:groupContext="shape" transform="translate(614.25,72.75) rotate(90)">
+			<title>Sheet.39</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/loop_restoration.svg b/doc/img/loop_restoration.svg
new file mode 100644
index 0000000..cdeb76a
--- /dev/null
+++ b/doc/img/loop_restoration.svg
@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export loop_restoration.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.47917in" height="2.49905in"
+		viewBox="0 0 394.5 179.932" xml:space="preserve" color-interpolation-filters="sRGB" class="st11">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#bfbfbf;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {marker-end:url(#mrkr4-8);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+		.st3 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.28409090909091}
+		.st4 {stroke:#000000;stroke-dasharray:0,3.75;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st5 {marker-end:url(#mrkr4-27);stroke:#4bacc6;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+		.st6 {fill:#4bacc6;fill-opacity:1;stroke:#4bacc6;stroke-opacity:1;stroke-width:0.28409090909091}
+		.st7 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st8 {fill:#000000;font-family:Times New Roman;font-size:1.00001em}
+		.st9 {baseline-shift:-32.4941%;font-size:0.649882em}
+		.st10 {font-size:1em}
+		.st11 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend4">
+			<path d="M 2 1 L 0 0 L 2 -1 L 2 1 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr4-8" class="st3" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+		</marker>
+		<marker id="mrkr4-27" class="st6" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<v:layer v:name="Connector" v:index="0"/>
+		<g id="shape24-1" v:mID="24" v:groupContext="shape" transform="translate(34.9607,-40.8257)">
+			<title>Parallelogram</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 179.93 L222.58 179.93 L288.29 110.74 L65.71 110.74 L0 179.93 Z" class="st1"/>
+		</g>
+		<g id="shape28-3" v:mID="28" v:groupContext="shape" transform="translate(-95.504,15.1931) rotate(-46.4754)">
+			<title>Sheet.28</title>
+			<path d="M0 179.93 L40.67 179.93" class="st2"/>
+		</g>
+		<g id="shape29-9" v:mID="29" v:groupContext="shape" transform="translate(34.9607,-40.8257)">
+			<title>Sheet.29</title>
+			<path d="M0 179.93 L48.37 179.93" class="st2"/>
+		</g>
+		<g id="shape33-14" v:mID="33" v:groupContext="shape" transform="translate(-10.6429,-34.9507) rotate(-14.6817)">
+			<title>Sheet.33</title>
+			<path d="M0 179.93 L180.5 179.93" class="st2"/>
+		</g>
+		<g id="shape36-19" v:mID="36" v:groupContext="shape" transform="translate(36.2288,91.5749) rotate(-90)">
+			<title>Sheet.36</title>
+			<path d="M0 179.93 L57.25 179.93" class="st4"/>
+		</g>
+		<g id="shape37-22" v:mID="37" v:groupContext="shape" transform="translate(-55.1147,-16.6562) rotate(-30.0403)">
+			<title>Sheet.37</title>
+			<path d="M0 179.93 L202.28 179.93" class="st5"/>
+		</g>
+		<g id="shape38-28" v:mID="38" v:groupContext="shape" transform="translate(18.375,-33.5132)">
+			<title>Sheet.38</title>
+			<desc>X</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="5.34375" cy="174.026" width="10.69" height="11.8125"/>
+			<rect x="0" y="168.119" width="10.6875" height="11.8125" class="st7"/>
+			<text x="4" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X</text>		</g>
+		<g id="shape43-31" v:mID="43" v:groupContext="shape" transform="translate(31.875,-69.5132)">
+			<title>Sheet.43</title>
+			<desc>X1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="14.3438" cy="174.026" width="28.69" height="11.8125"/>
+			<rect x="0" y="168.119" width="28.6875" height="11.8125" class="st7"/>
+			<text x="8.06" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+						class="st9" v:baseFontSize="12">1</tspan></text>		</g>
+		<g id="shape52-35" v:mID="52" v:groupContext="shape" transform="translate(72.375,-20.0132)">
+			<title>Sheet.52</title>
+			<desc>X2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="14.3438" cy="174.026" width="28.69" height="11.8125"/>
+			<rect x="0" y="168.119" width="28.6875" height="11.8125" class="st7"/>
+			<text x="8.06" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+						class="st9" v:baseFontSize="12">2</tspan></text>		</g>
+		<g id="shape53-39" v:mID="53" v:groupContext="shape" transform="translate(205.688,-148.826)">
+			<title>Sheet.53</title>
+			<desc>Y</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="10.6875" cy="174.026" width="21.38" height="11.8125"/>
+			<rect x="0" y="168.119" width="21.375" height="11.8125" class="st7"/>
+			<text x="6.35" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Y</text>		</g>
+		<g id="shape54-42" v:mID="54" v:groupContext="shape" transform="translate(200.625,-60.1114)">
+			<title>Sheet.54</title>
+			<desc>Xr = X + α(X1 – X) + β(X2 – X)</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="87.75" cy="170.932" width="175.5" height="18"/>
+			<rect x="0" y="161.932" width="175.5" height="18" class="st7"/>
+			<text x="12.79" y="174.53" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+						class="st9" v:baseFontSize="12">r </tspan><tspan dy="0.181em" class="st10">= X + </tspan>α(X<tspan
+						dy="-0.279em" class="st9" v:baseFontSize="12">1 </tspan><tspan dy="0.181em" class="st10">–</tspan> X) + β(X<tspan
+						dy="-0.279em" class="st9" v:baseFontSize="12">2 </tspan><tspan dy="0.181em" class="st10">–</tspan> X)  </text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/partition_codingblock.svg b/doc/img/partition_codingblock.svg
new file mode 100644
index 0000000..872692d
--- /dev/null
+++ b/doc/img/partition_codingblock.svg
@@ -0,0 +1,225 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export partition_codingblock.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="8.02083in" height="8.51563in"
+		viewBox="0 0 577.5 613.125" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st3 {fill:#000000;font-family:Consolas;font-size:1.16666em}
+		.st4 {font-size:1em}
+		.st5 {stroke:#0070c0;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.75,-468.375)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape6-3" v:mID="6" v:groupContext="shape" transform="translate(216.75,-468.375)">
+			<title>Square.6</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape15-5" v:mID="15" v:groupContext="shape" transform="translate(432.75,-468.375)">
+			<title>Square.15</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape24-7" v:mID="24" v:groupContext="shape" transform="translate(0.75,-252.375)">
+			<title>Square.24</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape30-9" v:mID="30" v:groupContext="shape" transform="translate(216.75,-252.375)">
+			<title>Square.30</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape34-11" v:mID="34" v:groupContext="shape" transform="translate(432.75,-252.375)">
+			<title>Square.34</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape38-13" v:mID="38" v:groupContext="shape" transform="translate(0.75,-36.375)">
+			<title>Square.38</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape42-15" v:mID="42" v:groupContext="shape" transform="translate(216.75,-36.375)">
+			<title>Square.42</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape47-17" v:mID="47" v:groupContext="shape" transform="translate(432.75,-36.375)">
+			<title>Square.47</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape50-19" v:mID="50" v:groupContext="shape" transform="translate(0.75,-436.875)">
+			<title>Sheet.50</title>
+			<desc>PARTITION_SPLIT</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="14.27" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_SPLIT</text>		</g>
+		<g id="shape51-22" v:mID="51" v:groupContext="shape" transform="translate(216.75,-436.875)">
+			<title>Sheet.51</title>
+			<desc>PARTITION_VERT_4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_4</text>		</g>
+		<g id="shape52-25" v:mID="52" v:groupContext="shape" transform="translate(432.75,-436.875)">
+			<title>Sheet.52</title>
+			<desc>PARTITION_HORZ_4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_4</text>		</g>
+		<g id="shape60-28" v:mID="60" v:groupContext="shape" transform="translate(0.75,-220.875)">
+			<title>Sheet.60</title>
+			<desc>PARTITION_HORZ_B</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_<tspan
+						class="st4" v:langID="2052">B</tspan></text>		</g>
+		<g id="shape61-32" v:mID="61" v:groupContext="shape" transform="translate(216.75,-220.875)">
+			<title>Sheet.61</title>
+			<desc>PARTITION_VERT_A</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_A</text>		</g>
+		<g id="shape62-35" v:mID="62" v:groupContext="shape" transform="translate(432.75,-220.875)">
+			<title>Sheet.62</title>
+			<desc>PARTITION_HORZ_A</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_A</text>		</g>
+		<g id="shape63-38" v:mID="63" v:groupContext="shape" transform="translate(0.75,-0.375)">
+			<title>Sheet.63</title>
+			<desc>PARTITION_VERT_B</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_<tspan
+						class="st4" v:langID="2052">B</tspan></text>		</g>
+		<g id="shape64-42" v:mID="64" v:groupContext="shape" transform="translate(216.75,-0.375)">
+			<title>Sheet.64</title>
+			<desc>PARTITION_HORZ</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="18.12" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ</text>		</g>
+		<g id="shape65-45" v:mID="65" v:groupContext="shape" transform="translate(432.75,-0.375)">
+			<title>Sheet.65</title>
+			<desc>PARTITION_VERT</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="18.12" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_<tspan
+						class="st4" v:langID="2052">VERT</tspan></text>		</g>
+		<g id="shape66-49" v:mID="66" v:groupContext="shape" transform="translate(685.875,0.75) rotate(90)">
+			<title>Sheet.66</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape67-52" v:mID="67" v:groupContext="shape" transform="translate(0.75,-540.375)">
+			<title>Sheet.67</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape68-55" v:mID="68" v:groupContext="shape" transform="translate(865.875,0.750007) rotate(90)">
+			<title>Sheet.68</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape69-58" v:mID="69" v:groupContext="shape" transform="translate(901.875,0.750007) rotate(90)">
+			<title>Sheet.69</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape70-61" v:mID="70" v:groupContext="shape" transform="translate(937.875,0.750007) rotate(90)">
+			<title>Sheet.70</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape71-64" v:mID="71" v:groupContext="shape" transform="translate(432.75,-504.375)">
+			<title>Sheet.71</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape72-67" v:mID="72" v:groupContext="shape" transform="translate(432.75,-540.375)">
+			<title>Sheet.72</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape73-70" v:mID="73" v:groupContext="shape" transform="translate(432.75,-576.375)">
+			<title>Sheet.73</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape74-73" v:mID="74" v:groupContext="shape" transform="translate(0.75,-324.375)">
+			<title>Sheet.74</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape75-76" v:mID="75" v:groupContext="shape" transform="translate(685.875,288.75) rotate(90)">
+			<title>Sheet.75</title>
+			<path d="M0 613.13 L72 613.13" class="st5"/>
+		</g>
+		<g id="shape76-79" v:mID="76" v:groupContext="shape" transform="translate(901.875,216.75) rotate(90)">
+			<title>Sheet.76</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape77-82" v:mID="77" v:groupContext="shape" transform="translate(216.75,-324.375)">
+			<title>Sheet.77</title>
+			<path d="M0 613.13 L72 613.13" class="st5"/>
+		</g>
+		<g id="shape78-85" v:mID="78" v:groupContext="shape" transform="translate(432.75,-324.375)">
+			<title>Sheet.78</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape79-88" v:mID="79" v:groupContext="shape" transform="translate(1117.88,216.75) rotate(90)">
+			<title>Sheet.79</title>
+			<path d="M0 613.13 L72 613.13" class="st5"/>
+		</g>
+		<g id="shape80-91" v:mID="80" v:groupContext="shape" transform="translate(685.875,432.75) rotate(90)">
+			<title>Sheet.80</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape81-94" v:mID="81" v:groupContext="shape" transform="translate(72.75,-108.375)">
+			<title>Sheet.81</title>
+			<path d="M0 613.13 L72 613.13" class="st5"/>
+		</g>
+		<g id="shape82-97" v:mID="82" v:groupContext="shape" transform="translate(216.75,-108.375)">
+			<title>Sheet.82</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape83-100" v:mID="83" v:groupContext="shape" transform="translate(1117.88,432.75) rotate(90)">
+			<title>Sheet.83</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+	</g>
+</svg>
diff --git a/doc/img/primary_tap.svg b/doc/img/primary_tap.svg
new file mode 100644
index 0000000..8cd2a18
--- /dev/null
+++ b/doc/img/primary_tap.svg
@@ -0,0 +1,1589 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export primary_tap.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="11.2533in" height="6.63188in"
+		viewBox="0 0 810.24 477.495" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:#000000;font-family:Calibri;font-size:1.00001em;font-style:italic}
+		.st3 {font-size:1em;font-style:normal}
+		.st4 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st5 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st6 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+		.st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.12,-423.375)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(54.12,-423.375)">
+			<title>Square.2</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(90.12,-423.375)">
+			<title>Square.3</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape4-7" v:mID="4" v:groupContext="shape" transform="translate(126.12,-423.375)">
+			<title>Square.4</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape5-9" v:mID="5" v:groupContext="shape" transform="translate(162.12,-423.375)">
+			<title>Square.5</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape6-13" v:mID="6" v:groupContext="shape" transform="translate(18.12,-387.375)">
+			<title>Square.6</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape7-15" v:mID="7" v:groupContext="shape" transform="translate(54.12,-387.375)">
+			<title>Square.7</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape8-17" v:mID="8" v:groupContext="shape" transform="translate(90.12,-387.375)">
+			<title>Square.8</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape9-19" v:mID="9" v:groupContext="shape" transform="translate(126.12,-387.375)">
+			<title>Square.9</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape10-23" v:mID="10" v:groupContext="shape" transform="translate(162.12,-387.375)">
+			<title>Square.10</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape11-25" v:mID="11" v:groupContext="shape" transform="translate(18.12,-351.375)">
+			<title>Square.11</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape12-27" v:mID="12" v:groupContext="shape" transform="translate(54.12,-351.375)">
+			<title>Square.12</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape13-29" v:mID="13" v:groupContext="shape" transform="translate(90.12,-351.375)">
+			<title>Square.13</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(126.12,-351.375)">
+			<title>Square.14</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape15-33" v:mID="15" v:groupContext="shape" transform="translate(162.12,-351.375)">
+			<title>Square.15</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape16-35" v:mID="16" v:groupContext="shape" transform="translate(18.12,-315.375)">
+			<title>Square.16</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape17-37" v:mID="17" v:groupContext="shape" transform="translate(54.12,-315.375)">
+			<title>Square.17</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape18-41" v:mID="18" v:groupContext="shape" transform="translate(90.12,-315.375)">
+			<title>Square.18</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape19-43" v:mID="19" v:groupContext="shape" transform="translate(126.12,-315.375)">
+			<title>Square.19</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape20-45" v:mID="20" v:groupContext="shape" transform="translate(162.12,-315.375)">
+			<title>Square.20</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape21-47" v:mID="21" v:groupContext="shape" transform="translate(18.12,-279.375)">
+			<title>Square.21</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape22-51" v:mID="22" v:groupContext="shape" transform="translate(54.12,-279.375)">
+			<title>Square.22</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape23-53" v:mID="23" v:groupContext="shape" transform="translate(90.12,-279.375)">
+			<title>Square.23</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape24-55" v:mID="24" v:groupContext="shape" transform="translate(126.12,-279.375)">
+			<title>Square.24</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape25-57" v:mID="25" v:groupContext="shape" transform="translate(162.12,-279.375)">
+			<title>Square.25</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape30-59" v:mID="30" v:groupContext="shape" transform="translate(216.12,-423.375)">
+			<title>Square.30</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape31-61" v:mID="31" v:groupContext="shape" transform="translate(252.12,-423.375)">
+			<title>Square.31</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape32-63" v:mID="32" v:groupContext="shape" transform="translate(288.12,-423.375)">
+			<title>Square.32</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape33-65" v:mID="33" v:groupContext="shape" transform="translate(324.12,-423.375)">
+			<title>Square.33</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape34-67" v:mID="34" v:groupContext="shape" transform="translate(360.12,-423.375)">
+			<title>Square.34</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape35-69" v:mID="35" v:groupContext="shape" transform="translate(216.12,-387.375)">
+			<title>Square.35</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape36-71" v:mID="36" v:groupContext="shape" transform="translate(252.12,-387.375)">
+			<title>Square.36</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape37-73" v:mID="37" v:groupContext="shape" transform="translate(288.12,-387.375)">
+			<title>Square.37</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape38-75" v:mID="38" v:groupContext="shape" transform="translate(324.12,-387.375)">
+			<title>Square.38</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape39-77" v:mID="39" v:groupContext="shape" transform="translate(360.12,-387.375)">
+			<title>Square.39</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape40-81" v:mID="40" v:groupContext="shape" transform="translate(216.12,-351.375)">
+			<title>Square.40</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape41-83" v:mID="41" v:groupContext="shape" transform="translate(252.12,-351.375)">
+			<title>Square.41</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape42-87" v:mID="42" v:groupContext="shape" transform="translate(288.12,-351.375)">
+			<title>Square.42</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape43-89" v:mID="43" v:groupContext="shape" transform="translate(324.12,-351.375)">
+			<title>Square.43</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape44-93" v:mID="44" v:groupContext="shape" transform="translate(360.12,-351.375)">
+			<title>Square.44</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape45-95" v:mID="45" v:groupContext="shape" transform="translate(216.12,-315.375)">
+			<title>Square.45</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape46-99" v:mID="46" v:groupContext="shape" transform="translate(252.12,-315.375)">
+			<title>Square.46</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape47-101" v:mID="47" v:groupContext="shape" transform="translate(288.12,-315.375)">
+			<title>Square.47</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape48-103" v:mID="48" v:groupContext="shape" transform="translate(324.12,-315.375)">
+			<title>Square.48</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape49-105" v:mID="49" v:groupContext="shape" transform="translate(360.12,-315.375)">
+			<title>Square.49</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape50-107" v:mID="50" v:groupContext="shape" transform="translate(216.12,-279.375)">
+			<title>Square.50</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape51-109" v:mID="51" v:groupContext="shape" transform="translate(252.12,-279.375)">
+			<title>Square.51</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape52-111" v:mID="52" v:groupContext="shape" transform="translate(288.12,-279.375)">
+			<title>Square.52</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape53-113" v:mID="53" v:groupContext="shape" transform="translate(324.12,-279.375)">
+			<title>Square.53</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape54-115" v:mID="54" v:groupContext="shape" transform="translate(360.12,-279.375)">
+			<title>Square.54</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape55-117" v:mID="55" v:groupContext="shape" transform="translate(414.12,-423.375)">
+			<title>Square.55</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape56-119" v:mID="56" v:groupContext="shape" transform="translate(450.12,-423.375)">
+			<title>Square.56</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape57-121" v:mID="57" v:groupContext="shape" transform="translate(486.12,-423.375)">
+			<title>Square.57</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape58-123" v:mID="58" v:groupContext="shape" transform="translate(522.12,-423.375)">
+			<title>Square.58</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape59-125" v:mID="59" v:groupContext="shape" transform="translate(558.12,-423.375)">
+			<title>Square.59</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape60-127" v:mID="60" v:groupContext="shape" transform="translate(414.12,-387.375)">
+			<title>Square.60</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape61-129" v:mID="61" v:groupContext="shape" transform="translate(450.12,-387.375)">
+			<title>Square.61</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape62-131" v:mID="62" v:groupContext="shape" transform="translate(486.12,-387.375)">
+			<title>Square.62</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape63-133" v:mID="63" v:groupContext="shape" transform="translate(522.12,-387.375)">
+			<title>Square.63</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape64-135" v:mID="64" v:groupContext="shape" transform="translate(558.12,-387.375)">
+			<title>Square.64</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape65-137" v:mID="65" v:groupContext="shape" transform="translate(414.12,-351.375)">
+			<title>Square.65</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape66-141" v:mID="66" v:groupContext="shape" transform="translate(450.12,-351.375)">
+			<title>Square.66</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape67-145" v:mID="67" v:groupContext="shape" transform="translate(486.12,-351.375)">
+			<title>Square.67</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape68-147" v:mID="68" v:groupContext="shape" transform="translate(522.12,-351.375)">
+			<title>Square.68</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape69-151" v:mID="69" v:groupContext="shape" transform="translate(558.12,-351.375)">
+			<title>Square.69</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape70-155" v:mID="70" v:groupContext="shape" transform="translate(414.12,-315.375)">
+			<title>Square.70</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape71-157" v:mID="71" v:groupContext="shape" transform="translate(450.12,-315.375)">
+			<title>Square.71</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape72-159" v:mID="72" v:groupContext="shape" transform="translate(486.12,-315.375)">
+			<title>Square.72</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape73-161" v:mID="73" v:groupContext="shape" transform="translate(522.12,-315.375)">
+			<title>Square.73</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape74-163" v:mID="74" v:groupContext="shape" transform="translate(558.12,-315.375)">
+			<title>Square.74</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape75-165" v:mID="75" v:groupContext="shape" transform="translate(414.12,-279.375)">
+			<title>Square.75</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape76-167" v:mID="76" v:groupContext="shape" transform="translate(450.12,-279.375)">
+			<title>Square.76</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape77-169" v:mID="77" v:groupContext="shape" transform="translate(486.12,-279.375)">
+			<title>Square.77</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape78-171" v:mID="78" v:groupContext="shape" transform="translate(522.12,-279.375)">
+			<title>Square.78</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape79-173" v:mID="79" v:groupContext="shape" transform="translate(558.12,-279.375)">
+			<title>Square.79</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape80-175" v:mID="80" v:groupContext="shape" transform="translate(612.12,-423.375)">
+			<title>Square.80</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape81-177" v:mID="81" v:groupContext="shape" transform="translate(648.12,-423.375)">
+			<title>Square.81</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape82-179" v:mID="82" v:groupContext="shape" transform="translate(684.12,-423.375)">
+			<title>Square.82</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape83-181" v:mID="83" v:groupContext="shape" transform="translate(720.12,-423.375)">
+			<title>Square.83</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape84-183" v:mID="84" v:groupContext="shape" transform="translate(756.12,-423.375)">
+			<title>Square.84</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape85-185" v:mID="85" v:groupContext="shape" transform="translate(612.12,-387.375)">
+			<title>Square.85</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape86-189" v:mID="86" v:groupContext="shape" transform="translate(648.12,-387.375)">
+			<title>Square.86</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape87-191" v:mID="87" v:groupContext="shape" transform="translate(684.12,-387.375)">
+			<title>Square.87</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape88-193" v:mID="88" v:groupContext="shape" transform="translate(720.12,-387.375)">
+			<title>Square.88</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape89-195" v:mID="89" v:groupContext="shape" transform="translate(756.12,-387.375)">
+			<title>Square.89</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape90-197" v:mID="90" v:groupContext="shape" transform="translate(612.12,-351.375)">
+			<title>Square.90</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape91-199" v:mID="91" v:groupContext="shape" transform="translate(648.12,-351.375)">
+			<title>Square.91</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape92-203" v:mID="92" v:groupContext="shape" transform="translate(684.12,-351.375)">
+			<title>Square.92</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape93-205" v:mID="93" v:groupContext="shape" transform="translate(720.12,-351.375)">
+			<title>Square.93</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape94-209" v:mID="94" v:groupContext="shape" transform="translate(756.12,-351.375)">
+			<title>Square.94</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape95-211" v:mID="95" v:groupContext="shape" transform="translate(612.12,-315.375)">
+			<title>Square.95</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape96-213" v:mID="96" v:groupContext="shape" transform="translate(648.12,-315.375)">
+			<title>Square.96</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape97-215" v:mID="97" v:groupContext="shape" transform="translate(684.12,-315.375)">
+			<title>Square.97</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape98-217" v:mID="98" v:groupContext="shape" transform="translate(720.12,-315.375)">
+			<title>Square.98</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape99-219" v:mID="99" v:groupContext="shape" transform="translate(756.12,-315.375)">
+			<title>Square.99</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape100-223" v:mID="100" v:groupContext="shape" transform="translate(612.12,-279.375)">
+			<title>Square.100</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape101-225" v:mID="101" v:groupContext="shape" transform="translate(648.12,-279.375)">
+			<title>Square.101</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape102-227" v:mID="102" v:groupContext="shape" transform="translate(684.12,-279.375)">
+			<title>Square.102</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape103-229" v:mID="103" v:groupContext="shape" transform="translate(720.12,-279.375)">
+			<title>Square.103</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape104-231" v:mID="104" v:groupContext="shape" transform="translate(756.12,-279.375)">
+			<title>Square.104</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape115-233" v:mID="115" v:groupContext="shape" transform="translate(18.12,-189.375)">
+			<title>Square.115</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape116-237" v:mID="116" v:groupContext="shape" transform="translate(54.12,-189.375)">
+			<title>Square.116</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape117-239" v:mID="117" v:groupContext="shape" transform="translate(90.12,-189.375)">
+			<title>Square.117</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape118-241" v:mID="118" v:groupContext="shape" transform="translate(126.12,-189.375)">
+			<title>Square.118</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape119-243" v:mID="119" v:groupContext="shape" transform="translate(162.12,-189.375)">
+			<title>Square.119</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape120-245" v:mID="120" v:groupContext="shape" transform="translate(18.12,-153.375)">
+			<title>Square.120</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape121-247" v:mID="121" v:groupContext="shape" transform="translate(54.12,-153.375)">
+			<title>Square.121</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape122-251" v:mID="122" v:groupContext="shape" transform="translate(90.12,-153.375)">
+			<title>Square.122</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape123-253" v:mID="123" v:groupContext="shape" transform="translate(126.12,-153.375)">
+			<title>Square.123</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape124-255" v:mID="124" v:groupContext="shape" transform="translate(162.12,-153.375)">
+			<title>Square.124</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape125-257" v:mID="125" v:groupContext="shape" transform="translate(18.12,-117.375)">
+			<title>Square.125</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape126-259" v:mID="126" v:groupContext="shape" transform="translate(54.12,-117.375)">
+			<title>Square.126</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape127-261" v:mID="127" v:groupContext="shape" transform="translate(90.12,-117.375)">
+			<title>Square.127</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape128-263" v:mID="128" v:groupContext="shape" transform="translate(126.12,-117.375)">
+			<title>Square.128</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape129-265" v:mID="129" v:groupContext="shape" transform="translate(162.12,-117.375)">
+			<title>Square.129</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape130-267" v:mID="130" v:groupContext="shape" transform="translate(18.12,-81.375)">
+			<title>Square.130</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape131-269" v:mID="131" v:groupContext="shape" transform="translate(54.12,-81.375)">
+			<title>Square.131</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape132-271" v:mID="132" v:groupContext="shape" transform="translate(90.12,-81.3749)">
+			<title>Square.132</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape133-273" v:mID="133" v:groupContext="shape" transform="translate(126.12,-81.3749)">
+			<title>Square.133</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape134-277" v:mID="134" v:groupContext="shape" transform="translate(162.12,-81.3749)">
+			<title>Square.134</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape135-279" v:mID="135" v:groupContext="shape" transform="translate(18.12,-45.375)">
+			<title>Square.135</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape136-281" v:mID="136" v:groupContext="shape" transform="translate(54.12,-45.375)">
+			<title>Square.136</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape137-283" v:mID="137" v:groupContext="shape" transform="translate(90.12,-45.375)">
+			<title>Square.137</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape138-285" v:mID="138" v:groupContext="shape" transform="translate(126.12,-45.375)">
+			<title>Square.138</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape139-287" v:mID="139" v:groupContext="shape" transform="translate(162.12,-45.375)">
+			<title>Square.139</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape140-291" v:mID="140" v:groupContext="shape" transform="translate(216.12,-189.375)">
+			<title>Square.140</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape141-293" v:mID="141" v:groupContext="shape" transform="translate(252.12,-189.375)">
+			<title>Square.141</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape142-297" v:mID="142" v:groupContext="shape" transform="translate(288.12,-189.375)">
+			<title>Square.142</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape143-299" v:mID="143" v:groupContext="shape" transform="translate(324.12,-189.375)">
+			<title>Square.143</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape144-301" v:mID="144" v:groupContext="shape" transform="translate(360.12,-189.375)">
+			<title>Square.144</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape145-303" v:mID="145" v:groupContext="shape" transform="translate(216.12,-153.375)">
+			<title>Square.145</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape146-305" v:mID="146" v:groupContext="shape" transform="translate(252.12,-153.375)">
+			<title>Square.146</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape147-307" v:mID="147" v:groupContext="shape" transform="translate(288.12,-153.375)">
+			<title>Square.147</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape148-311" v:mID="148" v:groupContext="shape" transform="translate(324.12,-153.375)">
+			<title>Square.148</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape149-313" v:mID="149" v:groupContext="shape" transform="translate(360.12,-153.375)">
+			<title>Square.149</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape150-315" v:mID="150" v:groupContext="shape" transform="translate(216.12,-117.375)">
+			<title>Square.150</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape151-317" v:mID="151" v:groupContext="shape" transform="translate(252.12,-117.375)">
+			<title>Square.151</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape152-319" v:mID="152" v:groupContext="shape" transform="translate(288.12,-117.375)">
+			<title>Square.152</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape153-321" v:mID="153" v:groupContext="shape" transform="translate(324.12,-117.375)">
+			<title>Square.153</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape154-323" v:mID="154" v:groupContext="shape" transform="translate(360.12,-117.375)">
+			<title>Square.154</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape155-325" v:mID="155" v:groupContext="shape" transform="translate(216.12,-81.3749)">
+			<title>Square.155</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape156-327" v:mID="156" v:groupContext="shape" transform="translate(252.12,-81.3749)">
+			<title>Square.156</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape157-329" v:mID="157" v:groupContext="shape" transform="translate(288.12,-81.3749)">
+			<title>Square.157</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape158-333" v:mID="158" v:groupContext="shape" transform="translate(324.12,-81.3749)">
+			<title>Square.158</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape159-335" v:mID="159" v:groupContext="shape" transform="translate(360.12,-81.3749)">
+			<title>Square.159</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape160-337" v:mID="160" v:groupContext="shape" transform="translate(216.12,-45.3749)">
+			<title>Square.160</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape161-339" v:mID="161" v:groupContext="shape" transform="translate(252.12,-45.3749)">
+			<title>Square.161</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape162-341" v:mID="162" v:groupContext="shape" transform="translate(288.12,-45.3749)">
+			<title>Square.162</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape163-343" v:mID="163" v:groupContext="shape" transform="translate(324.12,-45.3749)">
+			<title>Square.163</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape164-347" v:mID="164" v:groupContext="shape" transform="translate(360.12,-45.3749)">
+			<title>Square.164</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape165-349" v:mID="165" v:groupContext="shape" transform="translate(414.12,-189.375)">
+			<title>Square.165</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape166-351" v:mID="166" v:groupContext="shape" transform="translate(450.12,-189.375)">
+			<title>Square.166</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape167-353" v:mID="167" v:groupContext="shape" transform="translate(486.12,-189.375)">
+			<title>Square.167</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape168-357" v:mID="168" v:groupContext="shape" transform="translate(522.12,-189.375)">
+			<title>Square.168</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape169-359" v:mID="169" v:groupContext="shape" transform="translate(558.12,-189.375)">
+			<title>Square.169</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape170-361" v:mID="170" v:groupContext="shape" transform="translate(414.12,-153.375)">
+			<title>Square.170</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape171-363" v:mID="171" v:groupContext="shape" transform="translate(450.12,-153.375)">
+			<title>Square.171</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape172-365" v:mID="172" v:groupContext="shape" transform="translate(486.12,-153.375)">
+			<title>Square.172</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape173-369" v:mID="173" v:groupContext="shape" transform="translate(522.12,-153.375)">
+			<title>Square.173</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape174-371" v:mID="174" v:groupContext="shape" transform="translate(558.12,-153.375)">
+			<title>Square.174</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape175-373" v:mID="175" v:groupContext="shape" transform="translate(414.12,-117.375)">
+			<title>Square.175</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape176-375" v:mID="176" v:groupContext="shape" transform="translate(450.12,-117.375)">
+			<title>Square.176</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape177-377" v:mID="177" v:groupContext="shape" transform="translate(486.12,-117.375)">
+			<title>Square.177</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape178-379" v:mID="178" v:groupContext="shape" transform="translate(522.12,-117.375)">
+			<title>Square.178</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape179-381" v:mID="179" v:groupContext="shape" transform="translate(558.12,-117.375)">
+			<title>Square.179</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape180-383" v:mID="180" v:groupContext="shape" transform="translate(414.12,-81.3749)">
+			<title>Square.180</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape181-385" v:mID="181" v:groupContext="shape" transform="translate(450.12,-81.3749)">
+			<title>Square.181</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape182-387" v:mID="182" v:groupContext="shape" transform="translate(486.12,-81.3749)">
+			<title>Square.182</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape183-391" v:mID="183" v:groupContext="shape" transform="translate(522.12,-81.3749)">
+			<title>Square.183</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape184-393" v:mID="184" v:groupContext="shape" transform="translate(558.12,-81.3749)">
+			<title>Square.184</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape185-395" v:mID="185" v:groupContext="shape" transform="translate(414.12,-45.3749)">
+			<title>Square.185</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape186-397" v:mID="186" v:groupContext="shape" transform="translate(450.12,-45.3749)">
+			<title>Square.186</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape187-399" v:mID="187" v:groupContext="shape" transform="translate(486.12,-45.3749)">
+			<title>Square.187</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape188-403" v:mID="188" v:groupContext="shape" transform="translate(522.12,-45.3749)">
+			<title>Square.188</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape189-405" v:mID="189" v:groupContext="shape" transform="translate(558.12,-45.3749)">
+			<title>Square.189</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape190-407" v:mID="190" v:groupContext="shape" transform="translate(612.12,-189.375)">
+			<title>Square.190</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape191-409" v:mID="191" v:groupContext="shape" transform="translate(648.12,-189.375)">
+			<title>Square.191</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape192-411" v:mID="192" v:groupContext="shape" transform="translate(684.12,-189.375)">
+			<title>Square.192</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape193-413" v:mID="193" v:groupContext="shape" transform="translate(720.12,-189.375)">
+			<title>Square.193</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape194-417" v:mID="194" v:groupContext="shape" transform="translate(756.12,-189.375)">
+			<title>Square.194</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape195-419" v:mID="195" v:groupContext="shape" transform="translate(612.12,-153.375)">
+			<title>Square.195</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape196-421" v:mID="196" v:groupContext="shape" transform="translate(648.12,-153.375)">
+			<title>Square.196</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape197-423" v:mID="197" v:groupContext="shape" transform="translate(684.12,-153.375)">
+			<title>Square.197</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape198-427" v:mID="198" v:groupContext="shape" transform="translate(720.12,-153.375)">
+			<title>Square.198</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape199-429" v:mID="199" v:groupContext="shape" transform="translate(756.12,-153.375)">
+			<title>Square.199</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape200-431" v:mID="200" v:groupContext="shape" transform="translate(612.12,-117.375)">
+			<title>Square.200</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape201-433" v:mID="201" v:groupContext="shape" transform="translate(648.12,-117.375)">
+			<title>Square.201</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape202-435" v:mID="202" v:groupContext="shape" transform="translate(684.12,-117.375)">
+			<title>Square.202</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape203-437" v:mID="203" v:groupContext="shape" transform="translate(720.12,-117.375)">
+			<title>Square.203</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape204-439" v:mID="204" v:groupContext="shape" transform="translate(756.12,-117.375)">
+			<title>Square.204</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape205-441" v:mID="205" v:groupContext="shape" transform="translate(612.12,-81.3749)">
+			<title>Square.205</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape206-443" v:mID="206" v:groupContext="shape" transform="translate(648.12,-81.3749)">
+			<title>Square.206</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape207-445" v:mID="207" v:groupContext="shape" transform="translate(684.12,-81.3749)">
+			<title>Square.207</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape208-449" v:mID="208" v:groupContext="shape" transform="translate(720.12,-81.3749)">
+			<title>Square.208</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape209-451" v:mID="209" v:groupContext="shape" transform="translate(756.12,-81.3749)">
+			<title>Square.209</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape210-453" v:mID="210" v:groupContext="shape" transform="translate(612.12,-45.3749)">
+			<title>Square.210</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape211-455" v:mID="211" v:groupContext="shape" transform="translate(648.12,-45.3749)">
+			<title>Square.211</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape212-459" v:mID="212" v:groupContext="shape" transform="translate(684.12,-45.3749)">
+			<title>Square.212</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape213-461" v:mID="213" v:groupContext="shape" transform="translate(720.12,-45.3749)">
+			<title>Square.213</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape214-463" v:mID="214" v:groupContext="shape" transform="translate(756.12,-45.3749)">
+			<title>Square.214</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape236-465" v:mID="236" v:groupContext="shape" transform="translate(54.12,-252.375)">
+			<title>Sheet.236</title>
+			<desc>d = 0</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 0</tspan></text>		</g>
+		<g id="shape237-470" v:mID="237" v:groupContext="shape" transform="translate(252.12,-252.375)">
+			<title>Sheet.237</title>
+			<desc>d = 1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 1</tspan></text>		</g>
+		<g id="shape238-475" v:mID="238" v:groupContext="shape" transform="translate(450.12,-252.375)">
+			<title>Sheet.238</title>
+			<desc>d = 2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 2</tspan></text>		</g>
+		<g id="shape239-480" v:mID="239" v:groupContext="shape" transform="translate(648.12,-252.375)">
+			<title>Sheet.239</title>
+			<desc>d = 3</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 3</tspan></text>		</g>
+		<g id="shape240-485" v:mID="240" v:groupContext="shape" transform="translate(54.12,-18.375)">
+			<title>Sheet.240</title>
+			<desc>d = 4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 4</tspan></text>		</g>
+		<g id="shape241-490" v:mID="241" v:groupContext="shape" transform="translate(252.12,-18.375)">
+			<title>Sheet.241</title>
+			<desc>d = 5</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 5</tspan></text>		</g>
+		<g id="shape242-495" v:mID="242" v:groupContext="shape" transform="translate(450.12,-18.375)">
+			<title>Sheet.242</title>
+			<desc>d = 6</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 6</tspan></text>		</g>
+		<g id="shape243-500" v:mID="243" v:groupContext="shape" transform="translate(648.12,-18.375)">
+			<title>Sheet.243</title>
+			<desc>d = 7</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 7</tspan></text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/quant_ac.svg b/doc/img/quant_ac.svg
new file mode 100644
index 0000000..3f589c8
--- /dev/null
+++ b/doc/img/quant_ac.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 653.55 392.07"><defs><style>.cls-1,.cls-10,.cls-12,.cls-2,.cls-20,.cls-26,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{fill:none;}.cls-2{stroke:#d9d9d9;}.cls-10,.cls-12,.cls-2,.cls-20,.cls-26,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{stroke-linejoin:round;}.cls-2,.cls-20,.cls-26,.cls-33{stroke-width:0.75px;}.cls-3{clip-path:url(#clip-path);}.cls-20,.cls-4,.cls-6{stroke:#5b9bd5;}.cls-10,.cls-4,.cls-7{stroke-linecap:round;stroke-width:2.25px;}.cls-5{fill:#5b9bd5;}.cls-12,.cls-6,.cls-9{stroke-width:0.72px;}.cls-26,.cls-7,.cls-9{stroke:#ed7d31;}.cls-8{fill:#ed7d31;}.cls-10,.cls-12,.cls-33{stroke:#a5a5a5;}.cls-11{fill:#a5a5a5;}.cls-13{clip-path:url(#clip-path-4);}.cls-14{font-size:9px;font-family:Calibri, Calibri;}.cls-14,.cls-15,.cls-21{fill:#595959;}.cls-15{font-size:15.96px;}.cls-15,.cls-21{font-family:TimesNewRomanPSMT, Times New Roman;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{letter-spacing:0em;}.cls-19{letter-spacing:0em;}.cls-21{font-size:14.04px;}.cls-22{letter-spacing:0em;}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:0.01em;}.cls-25{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:0em;}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:0em;}.cls-32{letter-spacing:-0.01em;}</style><clipPath id="clip-path"><rect class="cls-1" x="53.78" y="8.9" width="587.4" height="355.08"/></clipPath><clipPath id="clip-path-4"><rect class="cls-1" x="0.38" y="0.38" width="652.8" height="391.32"/></clipPath></defs><title>tables3Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><path class="cls-2" d="M53.81,9H640.53M53.81,59.65H640.53M53.81,110.18H640.53M53.81,160.82H640.53M53.81,211.46H640.53M53.81,262.1H640.53M53.81,312.74H640.53"/><path class="cls-2" d="M626.78,9V363.3M512.18,9V363.3M397.57,9V363.3M283,9V363.3M168.38,9V363.3M53.81,9V363.3"/><line class="cls-2" x1="53.81" y1="363.3" x2="640.53" y2="363.3"/><g class="cls-3"><polyline class="cls-4" points="54.95 363.25 57.26 363.25 59.53 363.25 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.13 73.33 363.13 75.61 363.13 77.89 363.13 80.17 363.13 82.45 363.13 84.73 363.13 87.02 363.13 89.3 363.01 91.58 363.01 93.86 363.01 96.25 363.01 98.53 363.01 100.81 363.01 103.09 363.01 105.38 363.01 107.66 363.01 109.94 363.01 112.22 363.01 114.5 363.01 116.78 362.89 119.17 362.89 121.45 362.89 123.73 362.89 126.02 362.89 128.29 362.89 130.57 362.89 132.85 362.89 135.13 362.89 137.41 362.89 139.69 362.89 142.09 362.89 144.38 362.77 146.66 362.77 148.94 362.77 151.22 362.77 153.5 362.77 155.78 362.77 158.06 362.77 160.34 362.77 162.62 362.77 165.01 362.77 167.29 362.77 169.57 362.77 171.85 362.65 174.13 362.65 176.41 362.65 178.69 362.65 180.97 362.65 183.25 362.65 185.53 362.65 187.94 362.65 190.22 362.65 192.5 362.65 194.78 362.65 197.06 362.65 199.34 362.54 201.62 362.54 203.9 362.54 206.18 362.54 208.46 362.54 210.85 362.54 213.13 362.54 215.41 362.54 217.69 362.54 219.97 362.54 222.25 362.54 224.53 362.42 226.81 362.42 229.09 362.42 231.38 362.42 233.78 362.42 236.06 362.42 238.34 362.42 240.62 362.42 242.9 362.42 245.18 362.42 247.46 362.42 249.74 362.42 252.01 362.3 254.29 362.3 256.69 362.3 258.98 362.3 261.25 362.3 263.54 362.3 265.81 362.3 268.1 362.3 270.38 362.3 272.65 362.3 274.94 362.3 277.21 362.18 279.62 362.18 281.89 362.18 284.18 362.18 286.45 362.18 288.74 362.18 291.01 362.06 293.3 362.06 295.57 362.06 297.86 362.06 300.13 362.06 302.42 362.06 304.81 361.94 307.1 361.94 309.38 361.94 311.65 361.94 313.94 361.94 316.21 361.94 318.5 361.81 320.77 361.81 323.06 361.81 325.33 361.81 327.74 361.81 330.01 361.81 332.3 361.69 334.57 361.69 336.86 361.69 339.13 361.57 341.42 361.57 343.69 361.57 345.98 361.57 348.25 361.45 350.65 361.45 352.94 361.45 355.21 361.45 357.5 361.33 359.77 361.33 362.06 361.33 364.33 361.33 366.62 361.21 368.89 361.21 371.18 361.21 373.57 361.21 375.86 361.1 378.13 361.1 380.42 361.1 382.69 360.98 384.98 360.98 387.25 360.98 389.54 360.86 391.81 360.86 394.1 360.74 396.5 360.74 398.77 360.74 401.06 360.62 403.33 360.62 405.62 360.62 407.89 360.5 410.18 360.5 412.45 360.38 414.74 360.38 417.01 360.25 419.42 360.25 421.69 360.25 423.98 360.13 426.25 360.13 428.54 360.01 430.81 360.01 433.1 359.89 435.38 359.89 437.65 359.77 439.94 359.77 442.33 359.65 444.62 359.54 446.89 359.54 449.18 359.42 451.45 359.42 453.74 359.3 456.01 359.18 458.3 359.18 460.57 359.06 462.86 359.06 465.25 358.94 467.54 358.81 469.81 358.81 472.1 358.69 474.38 358.57 476.65 358.45 478.94 358.45 481.21 358.33 483.5 358.21 485.77 358.1 488.06 357.98 490.45 357.98 492.74 357.86 495.01 357.74 497.3 357.62 499.57 357.5 501.86 357.38 504.13 357.25 506.42 357.13 508.69 357.01 510.98 356.89 513.38 356.77 515.65 356.65 517.93 356.54 520.22 356.42 522.5 356.3 524.77 356.18 527.05 356.06 529.34 355.94 531.62 355.81 533.89 355.57 536.29 355.45 538.58 355.33 540.86 355.21 543.13 354.98 545.41 354.86 547.7 354.74 549.98 354.5 552.25 354.38 554.53 354.25 556.82 354.01 559.22 353.89 561.5 353.65 563.77 353.54 566.05 353.3 568.34 353.06 570.62 352.94 572.89 352.69 575.17 352.45 577.46 352.33 579.74 352.1 582.13 351.86 584.41 351.62 586.7 351.38 588.98 351.13 591.25 350.89 593.53 350.65 595.82 350.42 598.1 350.18 600.38 349.94 602.65 349.69 605.05 349.45 607.34 349.21 609.62 348.86 611.89 348.62 614.17 348.38 616.46 348.01 618.74 347.77 621.01 347.42 623.29 347.18 625.58 346.81 627.98 346.45 630.25 346.21 632.53 345.86 634.82 345.5 637.1 345.13 639.38 344.79"/></g><circle class="cls-5" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-6" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-5" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-6" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-5" cx="59.48" cy="363.2" r="1.98"/><circle class="cls-6" cx="59.48" cy="363.2" r="1.98"/><circle class="cls-5" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-6" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-5" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-6" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-5" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-6" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-5" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-6" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-5" cx="71" cy="363.08" r="1.98"/><circle class="cls-6" cx="71" cy="363.08" r="1.98"/><circle class="cls-5" cx="73.28" cy="363.08" r="1.98"/><circle class="cls-6" cx="73.28" cy="363.08" r="1.98"/><circle class="cls-5" cx="75.56" cy="363.08" r="1.98"/><circle class="cls-6" cx="75.56" cy="363.08" r="1.98"/><circle class="cls-5" cx="77.83" cy="363.08" r="1.98"/><circle class="cls-6" cx="77.83" cy="363.08" r="1.98"/><circle class="cls-5" cx="80.12" cy="363.08" r="1.98"/><circle class="cls-6" cx="80.12" cy="363.08" r="1.98"/><circle class="cls-5" cx="82.4" cy="363.08" r="1.98"/><circle class="cls-6" cx="82.4" cy="363.08" r="1.98"/><circle class="cls-5" cx="84.67" cy="363.08" r="1.98"/><circle class="cls-6" cx="84.67" cy="363.08" r="1.98"/><circle class="cls-5" cx="86.95" cy="363.08" r="1.98"/><circle class="cls-6" cx="86.95" cy="363.08" r="1.98"/><circle class="cls-5" cx="89.24" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -280.75, 415.99)"/><circle class="cls-6" cx="89.24" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -280.75, 415.99)"/><circle class="cls-5" cx="91.52" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -278.69, 418.26)"/><circle class="cls-6" cx="91.52" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -278.69, 418.26)"/><circle class="cls-5" cx="93.8" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.64, 420.53)"/><circle class="cls-6" cx="93.8" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.64, 420.53)"/><circle class="cls-5" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-6" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-5" cx="98.47" cy="362.96" r="1.98"/><circle class="cls-6" cx="98.47" cy="362.96" r="1.98"/><circle class="cls-5" cx="100.76" cy="362.96" r="1.98"/><circle class="cls-6" cx="100.76" cy="362.96" r="1.98"/><circle class="cls-5" cx="103.03" cy="362.96" r="1.98"/><circle class="cls-6" cx="103.03" cy="362.96" r="1.98"/><circle class="cls-5" cx="105.31" cy="362.96" r="1.98"/><circle class="cls-6" cx="105.31" cy="362.96" r="1.98"/><circle class="cls-5" cx="107.6" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -264.2, 434.26)"/><circle class="cls-6" cx="107.6" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -264.2, 434.26)"/><circle class="cls-5" cx="109.88" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -262.14, 436.53)"/><circle class="cls-6" cx="109.88" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -262.14, 436.53)"/><circle class="cls-5" cx="112.16" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -260.09, 438.8)"/><circle class="cls-6" cx="112.16" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -260.09, 438.8)"/><circle class="cls-5" cx="114.44" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -258.03, 441.07)"/><circle class="cls-6" cx="114.44" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -258.03, 441.07)"/><circle class="cls-5" cx="116.72" cy="362.84" r="1.98"/><circle class="cls-6" cx="116.72" cy="362.84" r="1.98"/><circle class="cls-5" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-6" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-5" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-6" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-5" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-6" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-5" cx="125.95" cy="362.84" r="1.98"/><circle class="cls-6" cx="125.95" cy="362.84" r="1.98"/><circle class="cls-5" cx="128.24" cy="362.84" r="1.98"/><circle class="cls-6" cx="128.24" cy="362.84" r="1.98"/><circle class="cls-5" cx="130.52" cy="362.84" r="1.98"/><circle class="cls-6" cx="130.52" cy="362.84" r="1.98"/><circle class="cls-5" cx="132.8" cy="362.84" r="1.98"/><circle class="cls-6" cx="132.8" cy="362.84" r="1.98"/><circle class="cls-5" cx="135.08" cy="362.84" r="1.98"/><circle class="cls-6" cx="135.08" cy="362.84" r="1.98"/><circle class="cls-5" cx="137.36" cy="362.84" r="1.98"/><circle class="cls-6" cx="137.36" cy="362.84" r="1.98"/><circle class="cls-5" cx="139.64" cy="362.84" r="1.98"/><circle class="cls-6" cx="139.64" cy="362.84" r="1.98"/><circle class="cls-5" cx="142.03" cy="362.84" r="1.98"/><circle class="cls-6" cx="142.03" cy="362.84" r="1.98"/><circle class="cls-5" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-6" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-5" cx="146.6" cy="362.72" r="1.98"/><circle class="cls-6" cx="146.6" cy="362.72" r="1.98"/><circle class="cls-5" cx="148.88" cy="362.72" r="1.98"/><circle class="cls-6" cx="148.88" cy="362.72" r="1.98"/><circle class="cls-5" cx="151.16" cy="362.72" r="1.98"/><circle class="cls-6" cx="151.16" cy="362.72" r="1.98"/><circle class="cls-5" cx="153.44" cy="362.72" r="1.98"/><circle class="cls-6" cx="153.44" cy="362.72" r="1.98"/><circle class="cls-5" cx="155.72" cy="362.72" r="1.98"/><circle class="cls-6" cx="155.72" cy="362.72" r="1.98"/><circle class="cls-5" cx="158" cy="362.72" r="1.98"/><circle class="cls-6" cx="158" cy="362.72" r="1.98"/><circle class="cls-5" cx="160.28" cy="362.72" r="1.98"/><circle class="cls-6" cx="160.28" cy="362.72" r="1.98"/><circle class="cls-5" cx="162.56" cy="362.72" r="1.98"/><circle class="cls-6" cx="162.56" cy="362.72" r="1.98"/><circle class="cls-5" cx="164.95" cy="362.72" r="1.98"/><circle class="cls-6" cx="164.95" cy="362.72" r="1.98"/><circle class="cls-5" cx="167.24" cy="362.72" r="1.98"/><circle class="cls-6" cx="167.24" cy="362.72" r="1.98"/><circle class="cls-5" cx="169.52" cy="362.72" r="1.98"/><circle class="cls-6" cx="169.52" cy="362.72" r="1.98"/><circle class="cls-5" cx="171.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -205.96, 497.82)"/><circle class="cls-6" cx="171.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -205.96, 497.82)"/><circle class="cls-5" cx="174.08" cy="362.6" r="1.98"/><circle class="cls-6" cx="174.08" cy="362.6" r="1.98"/><circle class="cls-5" cx="176.36" cy="362.6" r="1.98"/><circle class="cls-6" cx="176.36" cy="362.6" r="1.98"/><circle class="cls-5" cx="178.64" cy="362.6" r="1.98"/><circle class="cls-6" cx="178.64" cy="362.6" r="1.98"/><circle class="cls-5" cx="180.92" cy="362.6" r="1.98"/><circle class="cls-6" cx="180.92" cy="362.6" r="1.98"/><circle class="cls-5" cx="183.19" cy="362.6" r="1.98"/><circle class="cls-6" cx="183.19" cy="362.6" r="1.98"/><circle class="cls-5" cx="185.47" cy="362.6" r="1.98"/><circle class="cls-6" cx="185.47" cy="362.6" r="1.98"/><circle class="cls-5" cx="187.88" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -191.47, 513.83)"/><circle class="cls-6" cx="187.88" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -191.47, 513.83)"/><circle class="cls-5" cx="190.16" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -189.41, 516.1)"/><circle class="cls-6" cx="190.16" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -189.41, 516.1)"/><circle class="cls-5" cx="192.44" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -187.36, 518.36)"/><circle class="cls-6" cx="192.44" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -187.36, 518.36)"/><circle class="cls-5" cx="194.72" cy="362.6" r="1.98"/><circle class="cls-6" cx="194.72" cy="362.6" r="1.98"/><circle class="cls-5" cx="197" cy="362.6" r="1.98"/><circle class="cls-6" cx="197" cy="362.6" r="1.98"/><circle class="cls-5" cx="199.28" cy="362.48" r="1.98"/><circle class="cls-6" cx="199.28" cy="362.48" r="1.98"/><circle class="cls-5" cx="201.56" cy="362.48" r="1.98" transform="translate(-34.74 21.62) rotate(-5.65)"/><circle class="cls-6" cx="201.56" cy="362.48" r="1.98" transform="translate(-34.74 21.62) rotate(-5.65)"/><circle class="cls-5" cx="203.83" cy="362.48" r="1.98" transform="translate(-34.73 21.85) rotate(-5.65)"/><circle class="cls-6" cx="203.83" cy="362.48" r="1.98" transform="translate(-34.73 21.85) rotate(-5.65)"/><circle class="cls-5" cx="206.11" cy="362.48" r="1.98" transform="translate(-34.71 22.07) rotate(-5.65)"/><circle class="cls-6" cx="206.11" cy="362.48" r="1.98" transform="translate(-34.71 22.07) rotate(-5.65)"/><path class="cls-5" d="M210.38,362.48a2,2,0,1,1-2-2A2,2,0,0,1,210.38,362.48Z"/><path class="cls-6" d="M210.38,362.48a2,2,0,1,1-2-2A2,2,0,0,1,210.38,362.48Z"/><path class="cls-5" d="M212.78,362.48a2,2,0,1,1-2-2A2,2,0,0,1,212.78,362.48Z"/><path class="cls-6" d="M212.78,362.48a2,2,0,1,1-2-2A2,2,0,0,1,212.78,362.48Z"/><path class="cls-5" d="M215.06,362.48a2,2,0,1,1-2-2A2,2,0,0,1,215.06,362.48Z"/><path class="cls-6" d="M215.06,362.48a2,2,0,1,1-2-2A2,2,0,0,1,215.06,362.48Z"/><path class="cls-5" d="M217.33,362.48a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.48Z"/><path class="cls-6" d="M217.33,362.48a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.48Z"/><path class="cls-5" d="M219.61,362.48a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.48Z"/><path class="cls-6" d="M219.61,362.48a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.48Z"/><path class="cls-5" d="M221.89,362.48a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.48Z"/><path class="cls-6" d="M221.89,362.48a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.48Z"/><path class="cls-5" d="M224.17,362.48a2,2,0,1,1-2-2A2,2,0,0,1,224.17,362.48Z"/><path class="cls-6" d="M224.17,362.48a2,2,0,1,1-2-2A2,2,0,0,1,224.17,362.48Z"/><path class="cls-5" d="M226.45,362.36a2,2,0,1,1-2-2A2,2,0,0,1,226.45,362.36Z"/><path class="cls-6" d="M226.45,362.36a2,2,0,1,1-2-2A2,2,0,0,1,226.45,362.36Z"/><path class="cls-5" d="M228.73,362.36a2,2,0,1,1-2-2A2,2,0,0,1,228.73,362.36Z"/><path class="cls-6" d="M228.73,362.36a2,2,0,1,1-2-2A2,2,0,0,1,228.73,362.36Z"/><path class="cls-5" d="M231,362.36a2,2,0,1,1-2-2A2,2,0,0,1,231,362.36Z"/><path class="cls-6" d="M231,362.36a2,2,0,1,1-2-2A2,2,0,0,1,231,362.36Z"/><path class="cls-5" d="M233.29,362.36a2,2,0,1,1-2-2A2,2,0,0,1,233.29,362.36Z"/><path class="cls-6" d="M233.29,362.36a2,2,0,1,1-2-2A2,2,0,0,1,233.29,362.36Z"/><circle class="cls-5" cx="233.72" cy="362.36" r="1.98" transform="translate(-144.3 569.79) rotate(-85.93)"/><circle class="cls-6" cx="233.72" cy="362.36" r="1.98" transform="translate(-144.3 569.79) rotate(-85.93)"/><circle class="cls-5" cx="236" cy="362.36" r="1.98" transform="translate(-142.18 572.07) rotate(-85.93)"/><circle class="cls-6" cx="236" cy="362.36" r="1.98" transform="translate(-142.18 572.07) rotate(-85.93)"/><circle class="cls-5" cx="238.28" cy="362.36" r="1.98" transform="translate(-140.06 574.34) rotate(-85.93)"/><circle class="cls-6" cx="238.28" cy="362.36" r="1.98" transform="translate(-140.06 574.34) rotate(-85.93)"/><circle class="cls-5" cx="240.56" cy="362.36" r="1.98" transform="translate(-137.94 576.62) rotate(-85.93)"/><circle class="cls-6" cx="240.56" cy="362.36" r="1.98" transform="translate(-137.94 576.62) rotate(-85.93)"/><circle class="cls-5" cx="242.83" cy="362.36" r="1.98" transform="translate(-135.82 578.89) rotate(-85.93)"/><circle class="cls-6" cx="242.83" cy="362.36" r="1.98" transform="translate(-135.82 578.89) rotate(-85.93)"/><circle class="cls-5" cx="245.11" cy="362.36" r="1.98" transform="translate(-133.7 581.17) rotate(-85.93)"/><circle class="cls-6" cx="245.11" cy="362.36" r="1.98" transform="translate(-133.7 581.17) rotate(-85.93)"/><path class="cls-5" d="M249.38,362.36a2,2,0,1,1-2-2A2,2,0,0,1,249.38,362.36Z"/><path class="cls-6" d="M249.38,362.36a2,2,0,1,1-2-2A2,2,0,0,1,249.38,362.36Z"/><path class="cls-5" d="M251.66,362.36a2,2,0,1,1-2-2A2,2,0,0,1,251.66,362.36Z"/><path class="cls-6" d="M251.66,362.36a2,2,0,1,1-2-2A2,2,0,0,1,251.66,362.36Z"/><path class="cls-5" d="M253.94,362.24a2,2,0,1,1-2-2A2,2,0,0,1,253.94,362.24Z"/><path class="cls-6" d="M253.94,362.24a2,2,0,1,1-2-2A2,2,0,0,1,253.94,362.24Z"/><path class="cls-5" d="M256.22,362.24a2,2,0,1,1-2-2A2,2,0,0,1,256.22,362.24Z"/><path class="cls-6" d="M256.22,362.24a2,2,0,1,1-2-2A2,2,0,0,1,256.22,362.24Z"/><path class="cls-5" d="M258.61,362.24a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.24Z"/><path class="cls-6" d="M258.61,362.24a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.24Z"/><path class="cls-5" d="M260.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.24Z"/><path class="cls-6" d="M260.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.24Z"/><path class="cls-5" d="M263.17,362.24a2,2,0,1,1-2-2A2,2,0,0,1,263.17,362.24Z"/><path class="cls-6" d="M263.17,362.24a2,2,0,1,1-2-2A2,2,0,0,1,263.17,362.24Z"/><path class="cls-5" d="M265.45,362.24a2,2,0,1,1-2-2A2,2,0,0,1,265.45,362.24Z"/><path class="cls-6" d="M265.45,362.24a2,2,0,1,1-2-2A2,2,0,0,1,265.45,362.24Z"/><path class="cls-5" d="M267.73,362.24a2,2,0,1,1-2-2A2,2,0,0,1,267.73,362.24Z"/><path class="cls-6" d="M267.73,362.24a2,2,0,1,1-2-2A2,2,0,0,1,267.73,362.24Z"/><path class="cls-5" d="M270,362.24a2,2,0,1,1-2-2A2,2,0,0,1,270,362.24Z"/><path class="cls-6" d="M270,362.24a2,2,0,1,1-2-2A2,2,0,0,1,270,362.24Z"/><path class="cls-5" d="M272.29,362.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,362.24Z"/><path class="cls-6" d="M272.29,362.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,362.24Z"/><path class="cls-5" d="M274.57,362.24a2,2,0,1,1-2-2A2,2,0,0,1,274.57,362.24Z"/><path class="cls-6" d="M274.57,362.24a2,2,0,1,1-2-2A2,2,0,0,1,274.57,362.24Z"/><path class="cls-5" d="M276.85,362.24a2,2,0,1,1-2-2A2,2,0,0,1,276.85,362.24Z"/><path class="cls-6" d="M276.85,362.24a2,2,0,1,1-2-2A2,2,0,0,1,276.85,362.24Z"/><path class="cls-5" d="M279.13,362.12a2,2,0,1,1-2-2A2,2,0,0,1,279.13,362.12Z"/><path class="cls-6" d="M279.13,362.12a2,2,0,1,1-2-2A2,2,0,0,1,279.13,362.12Z"/><circle class="cls-5" cx="279.56" cy="362.12" r="1.98" transform="translate(-34.32 29.31) rotate(-5.65)"/><circle class="cls-6" cx="279.56" cy="362.12" r="1.98" transform="translate(-34.32 29.31) rotate(-5.65)"/><circle class="cls-5" cx="281.83" cy="362.12" r="1.98" transform="translate(-34.31 29.53) rotate(-5.65)"/><circle class="cls-6" cx="281.83" cy="362.12" r="1.98" transform="translate(-34.31 29.53) rotate(-5.65)"/><circle class="cls-5" cx="284.11" cy="362.12" r="1.98" transform="translate(-34.3 29.76) rotate(-5.65)"/><circle class="cls-6" cx="284.11" cy="362.12" r="1.98" transform="translate(-34.3 29.76) rotate(-5.65)"/><path class="cls-5" d="M288.38,362.12a2,2,0,1,1-2-2A2,2,0,0,1,288.38,362.12Z"/><path class="cls-6" d="M288.38,362.12a2,2,0,1,1-2-2A2,2,0,0,1,288.38,362.12Z"/><path class="cls-5" d="M290.66,362.12a2,2,0,1,1-2-2A2,2,0,0,1,290.66,362.12Z"/><path class="cls-6" d="M290.66,362.12a2,2,0,1,1-2-2A2,2,0,0,1,290.66,362.12Z"/><path class="cls-5" d="M292.94,362a2,2,0,1,1-2-2A2,2,0,0,1,292.94,362Z"/><path class="cls-6" d="M292.94,362a2,2,0,1,1-2-2A2,2,0,0,1,292.94,362Z"/><path class="cls-5" d="M295.22,362a2,2,0,1,1-2-2A2,2,0,0,1,295.22,362Z"/><path class="cls-6" d="M295.22,362a2,2,0,1,1-2-2A2,2,0,0,1,295.22,362Z"/><path class="cls-5" d="M297.5,362a2,2,0,1,1-2-2A2,2,0,0,1,297.5,362Z"/><path class="cls-6" d="M297.5,362a2,2,0,1,1-2-2A2,2,0,0,1,297.5,362Z"/><path class="cls-5" d="M299.78,362a2,2,0,1,1-2-2A2,2,0,0,1,299.78,362Z"/><path class="cls-6" d="M299.78,362a2,2,0,1,1-2-2A2,2,0,0,1,299.78,362Z"/><path class="cls-5" d="M302.06,362a2,2,0,1,1-2-2A2,2,0,0,1,302.06,362Z"/><path class="cls-6" d="M302.06,362a2,2,0,1,1-2-2A2,2,0,0,1,302.06,362Z"/><path class="cls-5" d="M304.33,362a2,2,0,1,1-2-2A2,2,0,0,1,304.33,362Z"/><path class="cls-6" d="M304.33,362a2,2,0,1,1-2-2A2,2,0,0,1,304.33,362Z"/><path class="cls-5" d="M306.73,361.88a2,2,0,1,1-2-2A2,2,0,0,1,306.73,361.88Z"/><path class="cls-6" d="M306.73,361.88a2,2,0,1,1-2-2A2,2,0,0,1,306.73,361.88Z"/><path class="cls-5" d="M309,361.88a2,2,0,1,1-2-2A2,2,0,0,1,309,361.88Z"/><path class="cls-6" d="M309,361.88a2,2,0,1,1-2-2A2,2,0,0,1,309,361.88Z"/><path class="cls-5" d="M311.29,361.88a2,2,0,1,1-2-2A2,2,0,0,1,311.29,361.88Z"/><path class="cls-6" d="M311.29,361.88a2,2,0,1,1-2-2A2,2,0,0,1,311.29,361.88Z"/><path class="cls-5" d="M313.57,361.88a2,2,0,1,1-2-2A2,2,0,0,1,313.57,361.88Z"/><path class="cls-6" d="M313.57,361.88a2,2,0,1,1-2-2A2,2,0,0,1,313.57,361.88Z"/><path class="cls-5" d="M315.85,361.88a2,2,0,1,1-2-2A2,2,0,0,1,315.85,361.88Z"/><path class="cls-6" d="M315.85,361.88a2,2,0,1,1-2-2A2,2,0,0,1,315.85,361.88Z"/><path class="cls-5" d="M318.13,361.88a2,2,0,1,1-2-2A2,2,0,0,1,318.13,361.88Z"/><path class="cls-6" d="M318.13,361.88a2,2,0,1,1-2-2A2,2,0,0,1,318.13,361.88Z"/><circle class="cls-5" cx="318.44" cy="361.76" r="1.98" transform="translate(-34.1 33.14) rotate(-5.65)"/><circle class="cls-6" cx="318.44" cy="361.76" r="1.98" transform="translate(-34.1 33.14) rotate(-5.65)"/><circle class="cls-5" cx="320.72" cy="361.76" r="1.98" transform="translate(-34.09 33.36) rotate(-5.65)"/><circle class="cls-6" cx="320.72" cy="361.76" r="1.98" transform="translate(-34.09 33.36) rotate(-5.65)"/><circle class="cls-5" cx="323" cy="361.76" r="1.98" transform="translate(-34.07 33.59) rotate(-5.65)"/><circle class="cls-6" cx="323" cy="361.76" r="1.98" transform="translate(-34.07 33.59) rotate(-5.65)"/><circle class="cls-5" cx="325.28" cy="361.76" r="1.98" transform="translate(-34.06 33.81) rotate(-5.65)"/><circle class="cls-6" cx="325.28" cy="361.76" r="1.98" transform="translate(-34.06 33.81) rotate(-5.65)"/><path class="cls-5" d="M329.66,361.76a2,2,0,1,1-2-2A2,2,0,0,1,329.66,361.76Z"/><path class="cls-6" d="M329.66,361.76a2,2,0,1,1-2-2A2,2,0,0,1,329.66,361.76Z"/><path class="cls-5" d="M331.94,361.76a2,2,0,1,1-2-2A2,2,0,0,1,331.94,361.76Z"/><path class="cls-6" d="M331.94,361.76a2,2,0,1,1-2-2A2,2,0,0,1,331.94,361.76Z"/><path class="cls-5" d="M334.22,361.64a2,2,0,1,1-2-2A2,2,0,0,1,334.22,361.64Z"/><path class="cls-6" d="M334.22,361.64a2,2,0,1,1-2-2A2,2,0,0,1,334.22,361.64Z"/><path class="cls-5" d="M336.5,361.64a2,2,0,1,1-2-2A2,2,0,0,1,336.5,361.64Z"/><path class="cls-6" d="M336.5,361.64a2,2,0,1,1-2-2A2,2,0,0,1,336.5,361.64Z"/><path class="cls-5" d="M338.78,361.64a2,2,0,1,1-2-2A2,2,0,0,1,338.78,361.64Z"/><path class="cls-6" d="M338.78,361.64a2,2,0,1,1-2-2A2,2,0,0,1,338.78,361.64Z"/><path class="cls-5" d="M341.06,361.52a2,2,0,1,1-2-2A2,2,0,0,1,341.06,361.52Z"/><path class="cls-6" d="M341.06,361.52a2,2,0,1,1-2-2A2,2,0,0,1,341.06,361.52Z"/><path class="cls-5" d="M343.33,361.52a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.52Z"/><path class="cls-6" d="M343.33,361.52a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.52Z"/><path class="cls-5" d="M345.61,361.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.52Z"/><path class="cls-6" d="M345.61,361.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.52Z"/><path class="cls-5" d="M347.89,361.52a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.52Z"/><path class="cls-6" d="M347.89,361.52a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.52Z"/><path class="cls-5" d="M350.17,361.4a2,2,0,1,1-2-2A2,2,0,0,1,350.17,361.4Z"/><path class="cls-6" d="M350.17,361.4a2,2,0,1,1-2-2A2,2,0,0,1,350.17,361.4Z"/><path class="cls-5" d="M352.57,361.4a2,2,0,1,1-2-2A2,2,0,0,1,352.57,361.4Z"/><path class="cls-6" d="M352.57,361.4a2,2,0,1,1-2-2A2,2,0,0,1,352.57,361.4Z"/><path class="cls-5" d="M354.85,361.4a2,2,0,1,1-2-2A2,2,0,0,1,354.85,361.4Z"/><path class="cls-6" d="M354.85,361.4a2,2,0,1,1-2-2A2,2,0,0,1,354.85,361.4Z"/><path class="cls-5" d="M357.13,361.4a2,2,0,1,1-2-2A2,2,0,0,1,357.13,361.4Z"/><path class="cls-6" d="M357.13,361.4a2,2,0,1,1-2-2A2,2,0,0,1,357.13,361.4Z"/><circle class="cls-5" cx="357.44" cy="361.28" r="1.98" transform="translate(-28.27 692.2) rotate(-85.93)"/><circle class="cls-6" cx="357.44" cy="361.28" r="1.98" transform="translate(-28.27 692.2) rotate(-85.93)"/><circle class="cls-5" cx="359.72" cy="361.28" r="1.98" transform="translate(-26.15 694.47) rotate(-85.93)"/><circle class="cls-6" cx="359.72" cy="361.28" r="1.98" transform="translate(-26.15 694.47) rotate(-85.93)"/><circle class="cls-5" cx="362" cy="361.28" r="1.98" transform="translate(-24.03 696.75) rotate(-85.93)"/><circle class="cls-6" cx="362" cy="361.28" r="1.98" transform="translate(-24.03 696.75) rotate(-85.93)"/><circle class="cls-5" cx="364.28" cy="361.28" r="1.98" transform="translate(-21.91 699.02) rotate(-85.93)"/><circle class="cls-6" cx="364.28" cy="361.28" r="1.98" transform="translate(-21.91 699.02) rotate(-85.93)"/><circle class="cls-5" cx="366.56" cy="361.16" r="1.98" transform="translate(-33.8 37.88) rotate(-5.65)"/><circle class="cls-6" cx="366.56" cy="361.16" r="1.98" transform="translate(-33.8 37.88) rotate(-5.65)"/><circle class="cls-5" cx="368.83" cy="361.16" r="1.98" transform="translate(-33.79 38.1) rotate(-5.65)"/><circle class="cls-6" cx="368.83" cy="361.16" r="1.98" transform="translate(-33.79 38.1) rotate(-5.65)"/><circle class="cls-5" cx="371.11" cy="361.16" r="1.98" transform="translate(-33.78 38.33) rotate(-5.65)"/><circle class="cls-6" cx="371.11" cy="361.16" r="1.98" transform="translate(-33.78 38.33) rotate(-5.65)"/><path class="cls-5" d="M375.5,361.16a2,2,0,1,1-2-2A2,2,0,0,1,375.5,361.16Z"/><path class="cls-6" d="M375.5,361.16a2,2,0,1,1-2-2A2,2,0,0,1,375.5,361.16Z"/><path class="cls-5" d="M377.78,361a2,2,0,1,1-2-2A2,2,0,0,1,377.78,361Z"/><path class="cls-6" d="M377.78,361a2,2,0,1,1-2-2A2,2,0,0,1,377.78,361Z"/><path class="cls-5" d="M380.06,361a2,2,0,1,1-2-2A2,2,0,0,1,380.06,361Z"/><path class="cls-6" d="M380.06,361a2,2,0,1,1-2-2A2,2,0,0,1,380.06,361Z"/><path class="cls-5" d="M382.33,361a2,2,0,1,1-2-2A2,2,0,0,1,382.33,361Z"/><path class="cls-6" d="M382.33,361a2,2,0,1,1-2-2A2,2,0,0,1,382.33,361Z"/><path class="cls-5" d="M384.61,360.92a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.92Z"/><path class="cls-6" d="M384.61,360.92a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.92Z"/><path class="cls-5" d="M386.89,360.92a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.92Z"/><path class="cls-6" d="M386.89,360.92a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.92Z"/><path class="cls-5" d="M389.17,360.92a2,2,0,1,1-2-2A2,2,0,0,1,389.17,360.92Z"/><path class="cls-6" d="M389.17,360.92a2,2,0,1,1-2-2A2,2,0,0,1,389.17,360.92Z"/><path class="cls-5" d="M391.45,360.8a2,2,0,1,1-2-2A2,2,0,0,1,391.45,360.8Z"/><path class="cls-6" d="M391.45,360.8a2,2,0,1,1-2-2A2,2,0,0,1,391.45,360.8Z"/><path class="cls-5" d="M393.73,360.8a2,2,0,1,1-2-2A2,2,0,0,1,393.73,360.8Z"/><path class="cls-6" d="M393.73,360.8a2,2,0,1,1-2-2A2,2,0,0,1,393.73,360.8Z"/><path class="cls-5" d="M396,360.68a2,2,0,1,1-2-2A2,2,0,0,1,396,360.68Z"/><path class="cls-6" d="M396,360.68a2,2,0,1,1-2-2A2,2,0,0,1,396,360.68Z"/><circle class="cls-5" cx="396.44" cy="360.68" r="1.98" transform="translate(8.56 730.54) rotate(-85.93)"/><circle class="cls-6" cx="396.44" cy="360.68" r="1.98" transform="translate(8.56 730.54) rotate(-85.93)"/><circle class="cls-5" cx="398.72" cy="360.68" r="1.98" transform="translate(10.68 732.82) rotate(-85.93)"/><circle class="cls-6" cx="398.72" cy="360.68" r="1.98" transform="translate(10.68 732.82) rotate(-85.93)"/><circle class="cls-5" cx="401" cy="360.56" r="1.98" transform="translate(12.92 734.98) rotate(-85.93)"/><circle class="cls-6" cx="401" cy="360.56" r="1.98" transform="translate(12.92 734.98) rotate(-85.93)"/><circle class="cls-5" cx="403.28" cy="360.56" r="1.98" transform="translate(15.04 737.26) rotate(-85.93)"/><circle class="cls-6" cx="403.28" cy="360.56" r="1.98" transform="translate(15.04 737.26) rotate(-85.93)"/><circle class="cls-5" cx="405.56" cy="360.56" r="1.98" transform="translate(17.16 739.53) rotate(-85.93)"/><circle class="cls-6" cx="405.56" cy="360.56" r="1.98" transform="translate(17.16 739.53) rotate(-85.93)"/><circle class="cls-5" cx="407.83" cy="360.44" r="1.98" transform="translate(-33.53 41.94) rotate(-5.65)"/><circle class="cls-6" cx="407.83" cy="360.44" r="1.98" transform="translate(-33.53 41.94) rotate(-5.65)"/><circle class="cls-5" cx="410.11" cy="360.44" r="1.98" transform="translate(-33.52 42.17) rotate(-5.65)"/><circle class="cls-6" cx="410.11" cy="360.44" r="1.98" transform="translate(-33.52 42.17) rotate(-5.65)"/><path class="cls-5" d="M414.38,360.32a2,2,0,1,1-2-2A2,2,0,0,1,414.38,360.32Z"/><path class="cls-6" d="M414.38,360.32a2,2,0,1,1-2-2A2,2,0,0,1,414.38,360.32Z"/><path class="cls-5" d="M416.66,360.32a2,2,0,1,1-2-2A2,2,0,0,1,416.66,360.32Z"/><path class="cls-6" d="M416.66,360.32a2,2,0,1,1-2-2A2,2,0,0,1,416.66,360.32Z"/><path class="cls-5" d="M418.94,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.94,360.2Z"/><path class="cls-6" d="M418.94,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.94,360.2Z"/><path class="cls-5" d="M421.33,360.2a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.2Z"/><path class="cls-6" d="M421.33,360.2a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.2Z"/><path class="cls-5" d="M423.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.2Z"/><path class="cls-6" d="M423.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.2Z"/><path class="cls-5" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-6" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-5" d="M428.17,360.08a2,2,0,1,1-2-2A2,2,0,0,1,428.17,360.08Z"/><path class="cls-6" d="M428.17,360.08a2,2,0,1,1-2-2A2,2,0,0,1,428.17,360.08Z"/><path class="cls-5" d="M430.45,360a2,2,0,1,1-2-2A2,2,0,0,1,430.45,360Z"/><path class="cls-6" d="M430.45,360a2,2,0,1,1-2-2A2,2,0,0,1,430.45,360Z"/><path class="cls-5" d="M432.73,360a2,2,0,1,1-2-2A2,2,0,0,1,432.73,360Z"/><path class="cls-6" d="M432.73,360a2,2,0,1,1-2-2A2,2,0,0,1,432.73,360Z"/><path class="cls-5" d="M435,359.84a2,2,0,1,1-2-2A2,2,0,0,1,435,359.84Z"/><path class="cls-6" d="M435,359.84a2,2,0,1,1-2-2A2,2,0,0,1,435,359.84Z"/><path class="cls-5" d="M437.29,359.84a2,2,0,1,1-2-2A2,2,0,0,1,437.29,359.84Z"/><path class="cls-6" d="M437.29,359.84a2,2,0,1,1-2-2A2,2,0,0,1,437.29,359.84Z"/><path class="cls-5" d="M439.57,359.72a2,2,0,1,1-2-2A2,2,0,0,1,439.57,359.72Z"/><path class="cls-6" d="M439.57,359.72a2,2,0,1,1-2-2A2,2,0,0,1,439.57,359.72Z"/><path class="cls-5" d="M441.85,359.72a2,2,0,1,1-2-2A2,2,0,0,1,441.85,359.72Z"/><path class="cls-6" d="M441.85,359.72a2,2,0,1,1-2-2A2,2,0,0,1,441.85,359.72Z"/><circle class="cls-5" cx="442.28" cy="359.6" r="1.98" transform="translate(52.23 775.27) rotate(-85.93)"/><circle class="cls-6" cx="442.28" cy="359.6" r="1.98" transform="translate(52.23 775.27) rotate(-85.93)"/><circle class="cls-5" cx="444.56" cy="359.48" r="1.98" transform="translate(-33.26 45.55) rotate(-5.65)"/><circle class="cls-6" cx="444.56" cy="359.48" r="1.98" transform="translate(-33.26 45.55) rotate(-5.65)"/><circle class="cls-5" cx="446.83" cy="359.48" r="1.98" transform="translate(-33.25 45.78) rotate(-5.65)"/><circle class="cls-6" cx="446.83" cy="359.48" r="1.98" transform="translate(-33.25 45.78) rotate(-5.65)"/><circle class="cls-5" cx="449.11" cy="359.36" r="1.98" transform="translate(58.83 781.87) rotate(-85.93)"/><circle class="cls-6" cx="449.11" cy="359.36" r="1.98" transform="translate(58.83 781.87) rotate(-85.93)"/><path class="cls-5" d="M453.38,359.36a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.36Z"/><path class="cls-6" d="M453.38,359.36a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.36Z"/><path class="cls-5" d="M455.66,359.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,359.24Z"/><path class="cls-6" d="M455.66,359.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,359.24Z"/><circle class="cls-5" cx="455.95" cy="359.12" r="1.98" transform="translate(76.55 798.33) rotate(-87.4)"/><circle class="cls-6" cx="455.95" cy="359.12" r="1.98" transform="translate(76.55 798.33) rotate(-87.4)"/><path class="cls-5" d="M460.22,359.12a2,2,0,1,1-2-2A2,2,0,0,1,460.22,359.12Z"/><path class="cls-6" d="M460.22,359.12a2,2,0,1,1-2-2A2,2,0,0,1,460.22,359.12Z"/><circle class="cls-5" cx="460.51" cy="359" r="1.98" transform="translate(-19.15 26.02) rotate(-3.17)"/><circle class="cls-6" cx="460.51" cy="359" r="1.98" transform="translate(-19.15 26.02) rotate(-3.17)"/><path class="cls-5" d="M464.78,359a2,2,0,1,1-2-2A2,2,0,0,1,464.78,359Z"/><path class="cls-6" d="M464.78,359a2,2,0,1,1-2-2A2,2,0,0,1,464.78,359Z"/><path class="cls-5" d="M467.18,358.88a2,2,0,1,1-2-2A2,2,0,0,1,467.18,358.88Z"/><path class="cls-6" d="M467.18,358.88a2,2,0,1,1-2-2A2,2,0,0,1,467.18,358.88Z"/><path class="cls-5" d="M469.45,358.76a2,2,0,1,1-2-2A2,2,0,0,1,469.45,358.76Z"/><path class="cls-6" d="M469.45,358.76a2,2,0,1,1-2-2A2,2,0,0,1,469.45,358.76Z"/><path class="cls-5" d="M471.74,358.76a2,2,0,1,1-2-2A2,2,0,0,1,471.74,358.76Z"/><path class="cls-6" d="M471.74,358.76a2,2,0,1,1-2-2A2,2,0,0,1,471.74,358.76Z"/><circle class="cls-5" cx="472.04" cy="358.64" r="1.98" transform="translate(-19.11 26.65) rotate(-3.17)"/><circle class="cls-6" cx="472.04" cy="358.64" r="1.98" transform="translate(-19.11 26.65) rotate(-3.17)"/><path class="cls-5" d="M476.3,358.52a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.52Z"/><path class="cls-6" d="M476.3,358.52a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.52Z"/><circle class="cls-5" cx="476.6" cy="358.4" r="1.98" transform="translate(96.97 818.26) rotate(-87.4)"/><circle class="cls-6" cx="476.6" cy="358.4" r="1.98" transform="translate(96.97 818.26) rotate(-87.4)"/><path class="cls-5" d="M480.86,358.4a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.4Z"/><path class="cls-6" d="M480.86,358.4a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.4Z"/><circle class="cls-5" cx="481.16" cy="358.28" r="1.98" transform="translate(-19.08 27.16) rotate(-3.17)"/><circle class="cls-6" cx="481.16" cy="358.28" r="1.98" transform="translate(-19.08 27.16) rotate(-3.17)"/><path class="cls-5" d="M485.42,358.16a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.16Z"/><path class="cls-6" d="M485.42,358.16a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.16Z"/><circle class="cls-5" cx="485.72" cy="358.04" r="1.98" transform="translate(106.04 827.03) rotate(-87.4)"/><circle class="cls-6" cx="485.72" cy="358.04" r="1.98" transform="translate(106.04 827.03) rotate(-87.4)"/><path class="cls-5" d="M490,357.92a2,2,0,1,1-2-2A2,2,0,0,1,490,357.92Z"/><path class="cls-6" d="M490,357.92a2,2,0,1,1-2-2A2,2,0,0,1,490,357.92Z"/><circle class="cls-5" cx="490.39" cy="357.92" r="1.98" transform="translate(-19.04 27.67) rotate(-3.17)"/><circle class="cls-6" cx="490.39" cy="357.92" r="1.98" transform="translate(-19.04 27.67) rotate(-3.17)"/><path class="cls-5" d="M494.66,357.8a2,2,0,1,1-2-2A2,2,0,0,1,494.66,357.8Z"/><path class="cls-6" d="M494.66,357.8a2,2,0,1,1-2-2A2,2,0,0,1,494.66,357.8Z"/><circle class="cls-5" cx="494.95" cy="357.68" r="1.98" transform="translate(-19.02 27.92) rotate(-3.17)"/><circle class="cls-6" cx="494.95" cy="357.68" r="1.98" transform="translate(-19.02 27.92) rotate(-3.17)"/><path class="cls-5" d="M499.22,357.56a2,2,0,1,1-2-2A2,2,0,0,1,499.22,357.56Z"/><path class="cls-6" d="M499.22,357.56a2,2,0,1,1-2-2A2,2,0,0,1,499.22,357.56Z"/><circle class="cls-5" cx="499.51" cy="357.44" r="1.98" transform="translate(119.81 840.24) rotate(-87.4)"/><circle class="cls-6" cx="499.51" cy="357.44" r="1.98" transform="translate(119.81 840.24) rotate(-87.4)"/><path class="cls-5" d="M503.78,357.32a2,2,0,1,1-2-2A2,2,0,0,1,503.78,357.32Z"/><path class="cls-6" d="M503.78,357.32a2,2,0,1,1-2-2A2,2,0,0,1,503.78,357.32Z"/><path class="cls-5" d="M506.06,357.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,357.2Z"/><path class="cls-6" d="M506.06,357.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,357.2Z"/><path class="cls-5" d="M508.33,357.08a2,2,0,1,1-2-2A2,2,0,0,1,508.33,357.08Z"/><path class="cls-6" d="M508.33,357.08a2,2,0,1,1-2-2A2,2,0,0,1,508.33,357.08Z"/><path class="cls-5" d="M510.62,357a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357Z"/><path class="cls-6" d="M510.62,357a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357Z"/><path class="cls-5" d="M512.89,356.84a2,2,0,1,1-2-2A2,2,0,0,1,512.89,356.84Z"/><path class="cls-6" d="M512.89,356.84a2,2,0,1,1-2-2A2,2,0,0,1,512.89,356.84Z"/><path class="cls-5" d="M515.3,356.72a2,2,0,1,1-2-2A2,2,0,0,1,515.3,356.72Z"/><path class="cls-6" d="M515.3,356.72a2,2,0,1,1-2-2A2,2,0,0,1,515.3,356.72Z"/><circle class="cls-5" cx="515.6" cy="356.6" r="1.98" transform="translate(-18.93 29.06) rotate(-3.17)"/><circle class="cls-6" cx="515.6" cy="356.6" r="1.98" transform="translate(-18.93 29.06) rotate(-3.17)"/><path class="cls-5" d="M519.86,356.48a2,2,0,1,1-2-2A2,2,0,0,1,519.86,356.48Z"/><path class="cls-6" d="M519.86,356.48a2,2,0,1,1-2-2A2,2,0,0,1,519.86,356.48Z"/><circle class="cls-5" cx="520.16" cy="356.36" r="1.98" transform="translate(-18.91 29.31) rotate(-3.17)"/><circle class="cls-6" cx="520.16" cy="356.36" r="1.98" transform="translate(-18.91 29.31) rotate(-3.17)"/><path class="cls-5" d="M524.42,356.24a2,2,0,1,1-2-2A2,2,0,0,1,524.42,356.24Z"/><path class="cls-6" d="M524.42,356.24a2,2,0,1,1-2-2A2,2,0,0,1,524.42,356.24Z"/><circle class="cls-5" cx="524.72" cy="356.12" r="1.98" transform="translate(145.19 864.15) rotate(-87.4)"/><circle class="cls-6" cx="524.72" cy="356.12" r="1.98" transform="translate(145.19 864.15) rotate(-87.4)"/><path class="cls-5" d="M529,356a2,2,0,1,1-2-2A2,2,0,0,1,529,356Z"/><path class="cls-6" d="M529,356a2,2,0,1,1-2-2A2,2,0,0,1,529,356Z"/><circle class="cls-5" cx="529.28" cy="355.88" r="1.98"/><circle class="cls-6" cx="529.28" cy="355.88" r="1.98"/><path class="cls-5" d="M533.54,355.76a2,2,0,1,1-2-2A2,2,0,0,1,533.54,355.76Z"/><path class="cls-6" d="M533.54,355.76a2,2,0,1,1-2-2A2,2,0,0,1,533.54,355.76Z"/><circle class="cls-5" cx="533.83" cy="355.52" r="1.98"/><circle class="cls-6" cx="533.83" cy="355.52" r="1.98"/><path class="cls-5" d="M538.22,355.4a2,2,0,1,1-2-2A2,2,0,0,1,538.22,355.4Z"/><path class="cls-6" d="M538.22,355.4a2,2,0,1,1-2-2A2,2,0,0,1,538.22,355.4Z"/><circle class="cls-5" cx="538.51" cy="355.28" r="1.98" transform="translate(-18.82 30.32) rotate(-3.17)"/><circle class="cls-6" cx="538.51" cy="355.28" r="1.98" transform="translate(-18.82 30.32) rotate(-3.17)"/><path class="cls-5" d="M542.78,355.16a2,2,0,1,1-2-2A2,2,0,0,1,542.78,355.16Z"/><path class="cls-6" d="M542.78,355.16a2,2,0,1,1-2-2A2,2,0,0,1,542.78,355.16Z"/><path class="cls-5" d="M545.06,354.92a2,2,0,1,1-2-2A2,2,0,0,1,545.06,354.92Z"/><path class="cls-6" d="M545.06,354.92a2,2,0,1,1-2-2A2,2,0,0,1,545.06,354.92Z"/><path class="cls-5" d="M547.33,354.8a2,2,0,1,1-2-2A2,2,0,0,1,547.33,354.8Z"/><path class="cls-6" d="M547.33,354.8a2,2,0,1,1-2-2A2,2,0,0,1,547.33,354.8Z"/><path class="cls-5" d="M549.62,354.68a2,2,0,1,1-2-2A2,2,0,0,1,549.62,354.68Z"/><path class="cls-6" d="M549.62,354.68a2,2,0,1,1-2-2A2,2,0,0,1,549.62,354.68Z"/><path class="cls-5" d="M551.89,354.44a2,2,0,1,1-2-2A2,2,0,0,1,551.89,354.44Z"/><path class="cls-6" d="M551.89,354.44a2,2,0,1,1-2-2A2,2,0,0,1,551.89,354.44Z"/><path class="cls-5" d="M554.18,354.32a2,2,0,1,1-2-2A2,2,0,0,1,554.18,354.32Z"/><path class="cls-6" d="M554.18,354.32a2,2,0,1,1-2-2A2,2,0,0,1,554.18,354.32Z"/><path class="cls-5" d="M556.45,354.2a2,2,0,1,1-2-2A2,2,0,0,1,556.45,354.2Z"/><path class="cls-6" d="M556.45,354.2a2,2,0,1,1-2-2A2,2,0,0,1,556.45,354.2Z"/><path class="cls-5" d="M558.74,354a2,2,0,1,1-2-2A2,2,0,0,1,558.74,354Z"/><path class="cls-6" d="M558.74,354a2,2,0,1,1-2-2A2,2,0,0,1,558.74,354Z"/><circle class="cls-5" cx="559.16" cy="353.84" r="1.98" transform="translate(180.35 896.38) rotate(-87.4)"/><circle class="cls-6" cx="559.16" cy="353.84" r="1.98" transform="translate(180.35 896.38) rotate(-87.4)"/><path class="cls-5" d="M563.42,353.6a2,2,0,1,1-2-2A2,2,0,0,1,563.42,353.6Z"/><path class="cls-6" d="M563.42,353.6a2,2,0,1,1-2-2A2,2,0,0,1,563.42,353.6Z"/><circle class="cls-5" cx="563.72" cy="353.48" r="1.98" transform="translate(185.06 900.59) rotate(-87.4)"/><circle class="cls-6" cx="563.72" cy="353.48" r="1.98" transform="translate(185.06 900.59) rotate(-87.4)"/><path class="cls-5" d="M568,353.24a2,2,0,1,1-2-2A2,2,0,0,1,568,353.24Z"/><path class="cls-6" d="M568,353.24a2,2,0,1,1-2-2A2,2,0,0,1,568,353.24Z"/><circle class="cls-5" cx="568.28" cy="353" r="1.98" transform="translate(-18.65 31.97) rotate(-3.17)"/><circle class="cls-6" cx="568.28" cy="353" r="1.98" transform="translate(-18.65 31.97) rotate(-3.17)"/><path class="cls-5" d="M572.54,352.88a2,2,0,1,1-2-2A2,2,0,0,1,572.54,352.88Z"/><path class="cls-6" d="M572.54,352.88a2,2,0,1,1-2-2A2,2,0,0,1,572.54,352.88Z"/><circle class="cls-5" cx="572.83" cy="352.64" r="1.98" transform="translate(-18.62 32.22) rotate(-3.17)"/><circle class="cls-6" cx="572.83" cy="352.64" r="1.98" transform="translate(-18.62 32.22) rotate(-3.17)"/><path class="cls-5" d="M577.1,352.4a2,2,0,1,1-2-2A2,2,0,0,1,577.1,352.4Z"/><path class="cls-6" d="M577.1,352.4a2,2,0,1,1-2-2A2,2,0,0,1,577.1,352.4Z"/><circle class="cls-5" cx="577.39" cy="352.28" r="1.98" transform="translate(-18.6 32.47) rotate(-3.17)"/><circle class="cls-6" cx="577.39" cy="352.28" r="1.98" transform="translate(-18.6 32.47) rotate(-3.17)"/><path class="cls-5" d="M581.66,352a2,2,0,1,1-2-2A2,2,0,0,1,581.66,352Z"/><path class="cls-6" d="M581.66,352a2,2,0,1,1-2-2A2,2,0,0,1,581.66,352Z"/><path class="cls-5" d="M584.06,351.8a2,2,0,1,1-2-2A2,2,0,0,1,584.06,351.8Z"/><path class="cls-6" d="M584.06,351.8a2,2,0,1,1-2-2A2,2,0,0,1,584.06,351.8Z"/><path class="cls-5" d="M586.33,351.56a2,2,0,1,1-2-2A2,2,0,0,1,586.33,351.56Z"/><path class="cls-6" d="M586.33,351.56a2,2,0,1,1-2-2A2,2,0,0,1,586.33,351.56Z"/><path class="cls-5" d="M588.62,351.32a2,2,0,1,1-2-2A2,2,0,0,1,588.62,351.32Z"/><path class="cls-6" d="M588.62,351.32a2,2,0,1,1-2-2A2,2,0,0,1,588.62,351.32Z"/><path class="cls-5" d="M590.89,351.08a2,2,0,1,1-2-2A2,2,0,0,1,590.89,351.08Z"/><path class="cls-6" d="M590.89,351.08a2,2,0,1,1-2-2A2,2,0,0,1,590.89,351.08Z"/><path class="cls-5" d="M593.18,350.84a2,2,0,1,1-2-2A2,2,0,0,1,593.18,350.84Z"/><path class="cls-6" d="M593.18,350.84a2,2,0,1,1-2-2A2,2,0,0,1,593.18,350.84Z"/><path class="cls-5" d="M595.45,350.6a2,2,0,1,1-2-2A2,2,0,0,1,595.45,350.6Z"/><path class="cls-6" d="M595.45,350.6a2,2,0,1,1-2-2A2,2,0,0,1,595.45,350.6Z"/><path class="cls-5" d="M597.74,350.36a2,2,0,1,1-2-2A2,2,0,0,1,597.74,350.36Z"/><path class="cls-6" d="M597.74,350.36a2,2,0,1,1-2-2A2,2,0,0,1,597.74,350.36Z"/><circle class="cls-5" cx="598.04" cy="350.12" r="1.98" transform="translate(221.18 931.67) rotate(-87.4)"/><circle class="cls-6" cx="598.04" cy="350.12" r="1.98" transform="translate(221.18 931.67) rotate(-87.4)"/><path class="cls-5" d="M602.3,349.88a2,2,0,1,1-2-2A2,2,0,0,1,602.3,349.88Z"/><path class="cls-6" d="M602.3,349.88a2,2,0,1,1-2-2A2,2,0,0,1,602.3,349.88Z"/><circle class="cls-5" cx="602.6" cy="349.64" r="1.98" transform="translate(-18.41 33.86) rotate(-3.17)"/><circle class="cls-6" cx="602.6" cy="349.64" r="1.98" transform="translate(-18.41 33.86) rotate(-3.17)"/><path class="cls-5" d="M607,349.4a2,2,0,1,1-2-2A2,2,0,0,1,607,349.4Z"/><path class="cls-6" d="M607,349.4a2,2,0,1,1-2-2A2,2,0,0,1,607,349.4Z"/><circle class="cls-5" cx="607.28" cy="349.16" r="1.98" transform="translate(230.96 939.98) rotate(-87.4)"/><circle class="cls-6" cx="607.28" cy="349.16" r="1.98" transform="translate(230.96 939.98) rotate(-87.4)"/><path class="cls-5" d="M611.54,348.8a2,2,0,1,1-2-2A2,2,0,0,1,611.54,348.8Z"/><path class="cls-6" d="M611.54,348.8a2,2,0,1,1-2-2A2,2,0,0,1,611.54,348.8Z"/><circle class="cls-5" cx="611.83" cy="348.56" r="1.98" transform="translate(-18.34 34.37) rotate(-3.17)"/><circle class="cls-6" cx="611.83" cy="348.56" r="1.98" transform="translate(-18.34 34.37) rotate(-3.17)"/><path class="cls-5" d="M616.1,348.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,348.32Z"/><path class="cls-6" d="M616.1,348.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,348.32Z"/><circle class="cls-5" cx="616.39" cy="347.96" r="1.98" transform="translate(-18.3 34.62) rotate(-3.17)"/><circle class="cls-6" cx="616.39" cy="347.96" r="1.98" transform="translate(-18.3 34.62) rotate(-3.17)"/><path class="cls-5" d="M620.66,347.72a2,2,0,1,1-2-2A2,2,0,0,1,620.66,347.72Z"/><path class="cls-6" d="M620.66,347.72a2,2,0,1,1-2-2A2,2,0,0,1,620.66,347.72Z"/><circle class="cls-5" cx="620.95" cy="347.36" r="1.98" transform="translate(-18.26 34.87) rotate(-3.17)"/><circle class="cls-6" cx="620.95" cy="347.36" r="1.98" transform="translate(-18.26 34.87) rotate(-3.17)"/><path class="cls-5" d="M625.22,347.12a2,2,0,1,1-2-2A2,2,0,0,1,625.22,347.12Z"/><path class="cls-6" d="M625.22,347.12a2,2,0,1,1-2-2A2,2,0,0,1,625.22,347.12Z"/><circle class="cls-5" cx="625.51" cy="346.76" r="1.98" transform="translate(250.77 955.91) rotate(-87.4)"/><circle class="cls-6" cx="625.51" cy="346.76" r="1.98" transform="translate(250.77 955.91) rotate(-87.4)"/><path class="cls-5" d="M629.89,346.4a2,2,0,1,1-2-2A2,2,0,0,1,629.89,346.4Z"/><path class="cls-6" d="M629.89,346.4a2,2,0,1,1-2-2A2,2,0,0,1,629.89,346.4Z"/><path class="cls-5" d="M632.18,346.16a2,2,0,1,1-2-2A2,2,0,0,1,632.18,346.16Z"/><path class="cls-6" d="M632.18,346.16a2,2,0,1,1-2-2A2,2,0,0,1,632.18,346.16Z"/><path class="cls-5" d="M634.45,345.8a2,2,0,1,1-2-2A2,2,0,0,1,634.45,345.8Z"/><path class="cls-6" d="M634.45,345.8a2,2,0,1,1-2-2A2,2,0,0,1,634.45,345.8Z"/><path class="cls-5" d="M636.74,345.44a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.44Z"/><path class="cls-6" d="M636.74,345.44a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.44Z"/><circle class="cls-5" cx="637.04" cy="345.08" r="1.98"/><circle class="cls-6" cx="637.04" cy="345.08" r="1.98"/><path class="cls-5" d="M641.3,344.72a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.72Z"/><path class="cls-6" d="M641.3,344.72a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.72Z"/><g class="cls-3"><polyline class="cls-7" points="54.95 363.25 57.26 363.25 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.01 73.33 363.01 75.61 363.01 77.89 363.01 80.17 362.89 82.45 362.89 84.73 362.89 87.02 362.77 89.3 362.77 91.58 362.77 93.86 362.65 96.25 362.65 98.53 362.65 100.81 362.54 103.09 362.54 105.38 362.54 107.66 362.42 109.94 362.42 112.22 362.42 114.5 362.3 116.78 362.3 119.17 362.18 121.45 362.18 123.73 362.18 126.02 362.06 128.29 362.06 130.57 362.06 132.85 361.94 135.13 361.94 137.41 361.94 139.69 361.81 142.09 361.81 144.38 361.69 146.66 361.69 148.94 361.69 151.22 361.57 153.5 361.57 155.78 361.45 158.06 361.45 160.34 361.45 162.62 361.33 165.01 361.33 167.29 361.33 169.57 361.21 171.85 361.21 174.13 361.1 176.41 361.1 178.69 361.1 180.97 360.98 183.25 360.98 185.53 360.98 187.94 360.86 190.22 360.86 192.5 360.74 194.78 360.74 197.06 360.74 199.34 360.62 201.62 360.62 203.9 360.5 206.18 360.5 208.46 360.5 210.85 360.38 213.13 360.38 215.41 360.38 217.69 360.25 219.97 360.25 222.25 360.25 224.53 360.13 226.81 360.13 229.09 360.01 231.38 360.01 233.78 360.01 236.06 359.89 238.34 359.89 240.62 359.89 242.9 359.77 245.18 359.77 247.46 359.65 249.74 359.65 252.01 359.65 254.29 359.54 256.69 359.54 258.98 359.54 261.25 359.42 263.54 359.42 265.81 359.42 268.1 359.3 270.38 359.3 272.65 359.18 274.94 359.18 277.21 359.06 279.62 358.94 281.89 358.94 284.18 358.81 286.45 358.69 288.74 358.69 291.01 358.57 293.3 358.45 295.57 358.45 297.86 358.33 300.13 358.21 302.42 358.21 304.81 358.1 307.1 357.98 309.38 357.98 311.65 357.86 313.94 357.74 316.21 357.74 318.5 357.62 320.77 357.5 323.06 357.38 325.33 357.38 327.74 357.25 330.01 357.13 332.3 357.01 334.57 356.89 336.86 356.77 339.13 356.65 341.42 356.54 343.69 356.42 345.98 356.3 348.25 356.18 350.65 356.06 352.94 355.94 355.21 355.81 357.5 355.69 359.77 355.57 362.06 355.45 364.33 355.33 366.62 355.21 368.89 355.1 371.18 354.98 373.57 354.74 375.86 354.62 378.13 354.5 380.42 354.25 382.69 354.13 384.98 354.01 387.25 353.77 389.54 353.65 391.81 353.54 394.1 353.3 396.5 353.18 398.77 352.94 401.06 352.81 403.33 352.57 405.62 352.33 407.89 352.21 410.18 351.98 412.45 351.74 414.74 351.62 417.01 351.38 419.42 351.13 421.69 351.01 423.98 350.77 426.25 350.54 428.54 350.3 430.81 349.94 433.1 349.69 435.38 349.45 437.65 349.21 439.94 348.98 442.33 348.74 444.62 348.5 446.89 348.25 449.18 347.89 451.45 347.65 453.74 347.3 456.01 347.06 458.3 346.81 460.57 346.45 462.86 346.1 465.25 345.86 467.54 345.5 469.81 345.13 472.1 344.89 474.38 344.54 476.65 344.06 478.94 343.69 481.21 343.33 483.5 342.98 485.77 342.62 488.06 342.25 490.45 341.89 492.74 341.42 495.01 341.06 497.3 340.57 499.57 340.21 501.86 339.74 504.13 339.25 506.42 338.89 508.69 338.42 510.98 337.94 513.38 337.45 515.65 336.98 517.93 336.5 520.22 335.89 522.5 335.42 524.77 334.94 527.05 334.33 529.34 333.74 531.62 333.25 533.89 332.65 536.29 332.06 538.58 331.45 540.86 330.86 543.13 330.25 545.41 329.65 547.7 328.94 549.98 328.33 552.25 327.62 554.53 326.89 556.82 326.3 559.22 325.57 561.5 324.86 563.77 324.01 566.05 323.3 568.34 322.57 570.62 321.74 572.89 320.89 575.17 320.18 577.46 319.33 579.74 318.38 582.13 317.54 584.41 316.69 586.7 315.86 588.98 314.89 591.25 313.94 593.53 312.98 595.82 312.01 598.1 310.94 600.38 309.98 602.65 308.89 605.05 307.81 607.34 306.74 609.62 305.65 611.89 304.57 614.17 303.38 616.46 302.18 618.74 301.1 621.01 299.89 623.29 298.57 625.58 297.38 627.98 296.06 630.25 294.74 632.53 293.42 634.82 292.1 637.1 290.65 639.38 289.27"/></g><circle class="cls-8" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-9" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-8" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-9" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-8" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-9" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-8" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-9" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-8" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-9" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-8" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-9" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-8" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-9" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-8" cx="71" cy="362.96" r="1.98"/><circle class="cls-9" cx="71" cy="362.96" r="1.98"/><circle class="cls-8" cx="73.28" cy="362.96" r="1.98"/><circle class="cls-9" cx="73.28" cy="362.96" r="1.98"/><circle class="cls-8" cx="75.56" cy="362.96" r="1.98"/><circle class="cls-9" cx="75.56" cy="362.96" r="1.98"/><circle class="cls-8" cx="77.83" cy="362.96" r="1.98"/><circle class="cls-9" cx="77.83" cy="362.96" r="1.98"/><circle class="cls-8" cx="80.12" cy="362.84" r="1.98"/><circle class="cls-9" cx="80.12" cy="362.84" r="1.98"/><circle class="cls-8" cx="82.4" cy="362.84" r="1.98"/><circle class="cls-9" cx="82.4" cy="362.84" r="1.98"/><circle class="cls-8" cx="84.67" cy="362.84" r="1.98"/><circle class="cls-9" cx="84.67" cy="362.84" r="1.98"/><circle class="cls-8" cx="86.95" cy="362.72" r="1.98"/><circle class="cls-9" cx="86.95" cy="362.72" r="1.98"/><circle class="cls-8" cx="89.24" cy="362.72" r="1.98"/><circle class="cls-9" cx="89.24" cy="362.72" r="1.98"/><circle class="cls-8" cx="91.52" cy="362.72" r="1.98"/><circle class="cls-9" cx="91.52" cy="362.72" r="1.98"/><circle class="cls-8" cx="93.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.28, 420.2)"/><circle class="cls-9" cx="93.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.28, 420.2)"/><circle class="cls-8" cx="96.19" cy="362.6" r="1.98"/><circle class="cls-9" cx="96.19" cy="362.6" r="1.98"/><circle class="cls-8" cx="98.47" cy="362.6" r="1.98"/><circle class="cls-9" cx="98.47" cy="362.6" r="1.98"/><circle class="cls-8" cx="100.76" cy="362.48" r="1.98"/><circle class="cls-9" cx="100.76" cy="362.48" r="1.98"/><circle class="cls-8" cx="103.03" cy="362.48" r="1.98"/><circle class="cls-9" cx="103.03" cy="362.48" r="1.98"/><circle class="cls-8" cx="105.31" cy="362.48" r="1.98"/><circle class="cls-9" cx="105.31" cy="362.48" r="1.98"/><circle class="cls-8" cx="107.6" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -263.6, 433.72)"/><circle class="cls-9" cx="107.6" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -263.6, 433.72)"/><circle class="cls-8" cx="109.88" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -261.54, 435.99)"/><circle class="cls-9" cx="109.88" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -261.54, 435.99)"/><circle class="cls-8" cx="112.16" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -259.49, 438.26)"/><circle class="cls-9" cx="112.16" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -259.49, 438.26)"/><circle class="cls-8" cx="114.44" cy="362.24" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -257.31, 440.42)"/><circle class="cls-9" cx="114.44" cy="362.24" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -257.31, 440.42)"/><circle class="cls-8" cx="116.72" cy="362.24" r="1.98"/><circle class="cls-9" cx="116.72" cy="362.24" r="1.98"/><circle class="cls-8" cx="119.12" cy="362.12" r="1.98"/><circle class="cls-9" cx="119.12" cy="362.12" r="1.98"/><circle class="cls-8" cx="121.4" cy="362.12" r="1.98"/><circle class="cls-9" cx="121.4" cy="362.12" r="1.98"/><circle class="cls-8" cx="123.67" cy="362.12" r="1.98"/><circle class="cls-9" cx="123.67" cy="362.12" r="1.98"/><circle class="cls-8" cx="125.95" cy="362" r="1.98"/><circle class="cls-9" cx="125.95" cy="362" r="1.98"/><circle class="cls-8" cx="128.24" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -244.63, 453.94)"/><circle class="cls-9" cx="128.24" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -244.63, 453.94)"/><circle class="cls-8" cx="130.52" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -242.58, 456.2)"/><circle class="cls-9" cx="130.52" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -242.58, 456.2)"/><circle class="cls-8" cx="132.8" cy="361.88" r="1.98"/><circle class="cls-9" cx="132.8" cy="361.88" r="1.98"/><circle class="cls-8" cx="135.08" cy="361.88" r="1.98"/><circle class="cls-9" cx="135.08" cy="361.88" r="1.98"/><circle class="cls-8" cx="137.36" cy="361.88" r="1.98"/><circle class="cls-9" cx="137.36" cy="361.88" r="1.98"/><circle class="cls-8" cx="139.64" cy="361.76" r="1.98"/><circle class="cls-9" cx="139.64" cy="361.76" r="1.98"/><circle class="cls-8" cx="142.03" cy="361.76" r="1.98"/><circle class="cls-9" cx="142.03" cy="361.76" r="1.98"/><circle class="cls-8" cx="144.31" cy="361.64" r="1.98"/><circle class="cls-9" cx="144.31" cy="361.64" r="1.98"/><circle class="cls-8" cx="146.6" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -227.73, 471.88)"/><circle class="cls-9" cx="146.6" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -227.73, 471.88)"/><circle class="cls-8" cx="148.88" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -225.67, 474.15)"/><circle class="cls-9" cx="148.88" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -225.67, 474.15)"/><circle class="cls-8" cx="151.16" cy="361.52" r="1.98"/><circle class="cls-9" cx="151.16" cy="361.52" r="1.98"/><circle class="cls-8" cx="153.44" cy="361.52" r="1.98"/><circle class="cls-9" cx="153.44" cy="361.52" r="1.98"/><circle class="cls-8" cx="155.72" cy="361.4" r="1.98"/><circle class="cls-9" cx="155.72" cy="361.4" r="1.98"/><circle class="cls-8" cx="158" cy="361.4" r="1.98"/><circle class="cls-9" cx="158" cy="361.4" r="1.98"/><circle class="cls-8" cx="160.28" cy="361.4" r="1.98"/><circle class="cls-9" cx="160.28" cy="361.4" r="1.98"/><circle class="cls-8" cx="162.56" cy="361.28" r="1.98"/><circle class="cls-9" cx="162.56" cy="361.28" r="1.98"/><circle class="cls-8" cx="164.95" cy="361.28" r="1.98"/><circle class="cls-9" cx="164.95" cy="361.28" r="1.98"/><circle class="cls-8" cx="167.24" cy="361.28" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -208.76, 492.1)"/><circle class="cls-9" cx="167.24" cy="361.28" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -208.76, 492.1)"/><circle class="cls-8" cx="169.52" cy="361.16" r="1.98"/><circle class="cls-9" cx="169.52" cy="361.16" r="1.98"/><circle class="cls-8" cx="171.8" cy="361.16" r="1.98"/><circle class="cls-9" cx="171.8" cy="361.16" r="1.98"/><circle class="cls-8" cx="174.08" cy="361.04" r="1.98"/><circle class="cls-9" cx="174.08" cy="361.04" r="1.98"/><circle class="cls-8" cx="176.36" cy="361.04" r="1.98"/><circle class="cls-9" cx="176.36" cy="361.04" r="1.98"/><circle class="cls-8" cx="178.64" cy="361.04" r="1.98"/><circle class="cls-9" cx="178.64" cy="361.04" r="1.98"/><circle class="cls-8" cx="180.92" cy="360.92" r="1.98"/><circle class="cls-9" cx="180.92" cy="360.92" r="1.98"/><circle class="cls-8" cx="183.19" cy="360.92" r="1.98"/><circle class="cls-9" cx="183.19" cy="360.92" r="1.98"/><circle class="cls-8" cx="185.47" cy="360.92" r="1.98"/><circle class="cls-9" cx="185.47" cy="360.92" r="1.98"/><circle class="cls-8" cx="187.88" cy="360.8" r="1.98"/><circle class="cls-9" cx="187.88" cy="360.8" r="1.98"/><circle class="cls-8" cx="190.16" cy="360.8" r="1.98"/><circle class="cls-9" cx="190.16" cy="360.8" r="1.98"/><circle class="cls-8" cx="192.44" cy="360.68" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -185.45, 516.63)"/><circle class="cls-9" cx="192.44" cy="360.68" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -185.45, 516.63)"/><circle class="cls-8" cx="194.72" cy="360.68" r="1.98"/><circle class="cls-9" cx="194.72" cy="360.68" r="1.98"/><circle class="cls-8" cx="197" cy="360.68" r="1.98"/><circle class="cls-9" cx="197" cy="360.68" r="1.98"/><circle class="cls-8" cx="199.28" cy="360.56" r="1.98"/><circle class="cls-9" cx="199.28" cy="360.56" r="1.98"/><circle class="cls-8" cx="201.56" cy="360.56" r="1.98" transform="translate(-172.38 536.04) rotate(-85.93)"/><circle class="cls-9" cx="201.56" cy="360.56" r="1.98" transform="translate(-172.38 536.04) rotate(-85.93)"/><circle class="cls-8" cx="203.83" cy="360.44" r="1.98" transform="translate(-34.52 21.84) rotate(-5.65)"/><circle class="cls-9" cx="203.83" cy="360.44" r="1.98" transform="translate(-34.52 21.84) rotate(-5.65)"/><circle class="cls-8" cx="206.11" cy="360.44" r="1.98" transform="translate(-34.51 22.06) rotate(-5.65)"/><circle class="cls-9" cx="206.11" cy="360.44" r="1.98" transform="translate(-34.51 22.06) rotate(-5.65)"/><path class="cls-8" d="M210.38,360.44a2,2,0,1,1-2-2A2,2,0,0,1,210.38,360.44Z"/><path class="cls-9" d="M210.38,360.44a2,2,0,1,1-2-2A2,2,0,0,1,210.38,360.44Z"/><path class="cls-8" d="M212.78,360.32a2,2,0,1,1-2-2A2,2,0,0,1,212.78,360.32Z"/><path class="cls-9" d="M212.78,360.32a2,2,0,1,1-2-2A2,2,0,0,1,212.78,360.32Z"/><path class="cls-8" d="M215.06,360.32a2,2,0,1,1-2-2A2,2,0,0,1,215.06,360.32Z"/><path class="cls-9" d="M215.06,360.32a2,2,0,1,1-2-2A2,2,0,0,1,215.06,360.32Z"/><path class="cls-8" d="M217.33,360.32a2,2,0,1,1-2-2A2,2,0,0,1,217.33,360.32Z"/><path class="cls-9" d="M217.33,360.32a2,2,0,1,1-2-2A2,2,0,0,1,217.33,360.32Z"/><path class="cls-8" d="M219.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,219.61,360.2Z"/><path class="cls-9" d="M219.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,219.61,360.2Z"/><path class="cls-8" d="M221.89,360.2a2,2,0,1,1-2-2A2,2,0,0,1,221.89,360.2Z"/><path class="cls-9" d="M221.89,360.2a2,2,0,1,1-2-2A2,2,0,0,1,221.89,360.2Z"/><path class="cls-8" d="M224.17,360.2a2,2,0,1,1-2-2A2,2,0,0,1,224.17,360.2Z"/><path class="cls-9" d="M224.17,360.2a2,2,0,1,1-2-2A2,2,0,0,1,224.17,360.2Z"/><path class="cls-8" d="M226.45,360.08a2,2,0,1,1-2-2A2,2,0,0,1,226.45,360.08Z"/><path class="cls-9" d="M226.45,360.08a2,2,0,1,1-2-2A2,2,0,0,1,226.45,360.08Z"/><path class="cls-8" d="M228.73,360.08a2,2,0,1,1-2-2A2,2,0,0,1,228.73,360.08Z"/><path class="cls-9" d="M228.73,360.08a2,2,0,1,1-2-2A2,2,0,0,1,228.73,360.08Z"/><path class="cls-8" d="M231,360a2,2,0,1,1-2-2A2,2,0,0,1,231,360Z"/><path class="cls-9" d="M231,360a2,2,0,1,1-2-2A2,2,0,0,1,231,360Z"/><path class="cls-8" d="M233.29,360a2,2,0,1,1-2-2A2,2,0,0,1,233.29,360Z"/><path class="cls-9" d="M233.29,360a2,2,0,1,1-2-2A2,2,0,0,1,233.29,360Z"/><circle class="cls-8" cx="233.72" cy="359.96" r="1.98" transform="translate(-141.9 567.57) rotate(-85.93)"/><circle class="cls-9" cx="233.72" cy="359.96" r="1.98" transform="translate(-141.9 567.57) rotate(-85.93)"/><circle class="cls-8" cx="236" cy="359.84" r="1.98" transform="translate(-34.31 25.01) rotate(-5.65)"/><circle class="cls-9" cx="236" cy="359.84" r="1.98" transform="translate(-34.31 25.01) rotate(-5.65)"/><circle class="cls-8" cx="238.28" cy="359.84" r="1.98" transform="translate(-34.3 25.23) rotate(-5.65)"/><circle class="cls-9" cx="238.28" cy="359.84" r="1.98" transform="translate(-34.3 25.23) rotate(-5.65)"/><circle class="cls-8" cx="240.56" cy="359.84" r="1.98" transform="translate(-34.29 25.45) rotate(-5.65)"/><circle class="cls-9" cx="240.56" cy="359.84" r="1.98" transform="translate(-34.29 25.45) rotate(-5.65)"/><circle class="cls-8" cx="242.83" cy="359.72" r="1.98" transform="translate(-34.26 25.68) rotate(-5.65)"/><circle class="cls-9" cx="242.83" cy="359.72" r="1.98" transform="translate(-34.26 25.68) rotate(-5.65)"/><circle class="cls-8" cx="245.11" cy="359.72" r="1.98" transform="translate(-34.25 25.9) rotate(-5.65)"/><circle class="cls-9" cx="245.11" cy="359.72" r="1.98" transform="translate(-34.25 25.9) rotate(-5.65)"/><path class="cls-8" d="M249.38,359.6a2,2,0,1,1-2-2A2,2,0,0,1,249.38,359.6Z"/><path class="cls-9" d="M249.38,359.6a2,2,0,1,1-2-2A2,2,0,0,1,249.38,359.6Z"/><path class="cls-8" d="M251.66,359.6a2,2,0,1,1-2-2A2,2,0,0,1,251.66,359.6Z"/><path class="cls-9" d="M251.66,359.6a2,2,0,1,1-2-2A2,2,0,0,1,251.66,359.6Z"/><path class="cls-8" d="M253.94,359.6a2,2,0,1,1-2-2A2,2,0,0,1,253.94,359.6Z"/><path class="cls-9" d="M253.94,359.6a2,2,0,1,1-2-2A2,2,0,0,1,253.94,359.6Z"/><path class="cls-8" d="M256.22,359.48a2,2,0,1,1-2-2A2,2,0,0,1,256.22,359.48Z"/><path class="cls-9" d="M256.22,359.48a2,2,0,1,1-2-2A2,2,0,0,1,256.22,359.48Z"/><path class="cls-8" d="M258.61,359.48a2,2,0,1,1-2-2A2,2,0,0,1,258.61,359.48Z"/><path class="cls-9" d="M258.61,359.48a2,2,0,1,1-2-2A2,2,0,0,1,258.61,359.48Z"/><path class="cls-8" d="M260.89,359.48a2,2,0,1,1-2-2A2,2,0,0,1,260.89,359.48Z"/><path class="cls-9" d="M260.89,359.48a2,2,0,1,1-2-2A2,2,0,0,1,260.89,359.48Z"/><path class="cls-8" d="M263.17,359.36a2,2,0,1,1-2-2A2,2,0,0,1,263.17,359.36Z"/><path class="cls-9" d="M263.17,359.36a2,2,0,1,1-2-2A2,2,0,0,1,263.17,359.36Z"/><path class="cls-8" d="M265.45,359.36a2,2,0,1,1-2-2A2,2,0,0,1,265.45,359.36Z"/><path class="cls-9" d="M265.45,359.36a2,2,0,1,1-2-2A2,2,0,0,1,265.45,359.36Z"/><path class="cls-8" d="M267.73,359.36a2,2,0,1,1-2-2A2,2,0,0,1,267.73,359.36Z"/><path class="cls-9" d="M267.73,359.36a2,2,0,1,1-2-2A2,2,0,0,1,267.73,359.36Z"/><path class="cls-8" d="M270,359.24a2,2,0,1,1-2-2A2,2,0,0,1,270,359.24Z"/><path class="cls-9" d="M270,359.24a2,2,0,1,1-2-2A2,2,0,0,1,270,359.24Z"/><path class="cls-8" d="M272.29,359.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,359.24Z"/><path class="cls-9" d="M272.29,359.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,359.24Z"/><path class="cls-8" d="M274.57,359.12a2,2,0,1,1-2-2A2,2,0,0,1,274.57,359.12Z"/><path class="cls-9" d="M274.57,359.12a2,2,0,1,1-2-2A2,2,0,0,1,274.57,359.12Z"/><path class="cls-8" d="M276.85,359.12a2,2,0,1,1-2-2A2,2,0,0,1,276.85,359.12Z"/><path class="cls-9" d="M276.85,359.12a2,2,0,1,1-2-2A2,2,0,0,1,276.85,359.12Z"/><path class="cls-8" d="M279.13,359a2,2,0,1,1-2-2A2,2,0,0,1,279.13,359Z"/><path class="cls-9" d="M279.13,359a2,2,0,1,1-2-2A2,2,0,0,1,279.13,359Z"/><circle class="cls-8" cx="279.56" cy="358.88" r="1.98"/><circle class="cls-9" cx="279.56" cy="358.88" r="1.98"/><circle class="cls-8" cx="281.83" cy="358.88" r="1.98"/><circle class="cls-9" cx="281.83" cy="358.88" r="1.98"/><circle class="cls-8" cx="284.11" cy="358.76" r="1.98" transform="translate(-33.97 29.74) rotate(-5.65)"/><circle class="cls-9" cx="284.11" cy="358.76" r="1.98" transform="translate(-33.97 29.74) rotate(-5.65)"/><path class="cls-8" d="M288.38,358.64a2,2,0,1,1-2-2A2,2,0,0,1,288.38,358.64Z"/><path class="cls-9" d="M288.38,358.64a2,2,0,1,1-2-2A2,2,0,0,1,288.38,358.64Z"/><path class="cls-8" d="M290.66,358.64a2,2,0,1,1-2-2A2,2,0,0,1,290.66,358.64Z"/><path class="cls-9" d="M290.66,358.64a2,2,0,1,1-2-2A2,2,0,0,1,290.66,358.64Z"/><path class="cls-8" d="M292.94,358.52a2,2,0,1,1-2-2A2,2,0,0,1,292.94,358.52Z"/><path class="cls-9" d="M292.94,358.52a2,2,0,1,1-2-2A2,2,0,0,1,292.94,358.52Z"/><path class="cls-8" d="M295.22,358.4a2,2,0,1,1-2-2A2,2,0,0,1,295.22,358.4Z"/><path class="cls-9" d="M295.22,358.4a2,2,0,1,1-2-2A2,2,0,0,1,295.22,358.4Z"/><path class="cls-8" d="M297.5,358.4a2,2,0,1,1-2-2A2,2,0,0,1,297.5,358.4Z"/><path class="cls-9" d="M297.5,358.4a2,2,0,1,1-2-2A2,2,0,0,1,297.5,358.4Z"/><path class="cls-8" d="M299.78,358.28a2,2,0,1,1-2-2A2,2,0,0,1,299.78,358.28Z"/><path class="cls-9" d="M299.78,358.28a2,2,0,1,1-2-2A2,2,0,0,1,299.78,358.28Z"/><path class="cls-8" d="M302.06,358.16a2,2,0,1,1-2-2A2,2,0,0,1,302.06,358.16Z"/><path class="cls-9" d="M302.06,358.16a2,2,0,1,1-2-2A2,2,0,0,1,302.06,358.16Z"/><path class="cls-8" d="M304.33,358.16a2,2,0,1,1-2-2A2,2,0,0,1,304.33,358.16Z"/><path class="cls-9" d="M304.33,358.16a2,2,0,1,1-2-2A2,2,0,0,1,304.33,358.16Z"/><path class="cls-8" d="M306.73,358a2,2,0,1,1-2-2A2,2,0,0,1,306.73,358Z"/><path class="cls-9" d="M306.73,358a2,2,0,1,1-2-2A2,2,0,0,1,306.73,358Z"/><path class="cls-8" d="M309,357.92a2,2,0,1,1-2-2A2,2,0,0,1,309,357.92Z"/><path class="cls-9" d="M309,357.92a2,2,0,1,1-2-2A2,2,0,0,1,309,357.92Z"/><path class="cls-8" d="M311.29,357.92a2,2,0,1,1-2-2A2,2,0,0,1,311.29,357.92Z"/><path class="cls-9" d="M311.29,357.92a2,2,0,1,1-2-2A2,2,0,0,1,311.29,357.92Z"/><path class="cls-8" d="M313.57,357.8a2,2,0,1,1-2-2A2,2,0,0,1,313.57,357.8Z"/><path class="cls-9" d="M313.57,357.8a2,2,0,1,1-2-2A2,2,0,0,1,313.57,357.8Z"/><path class="cls-8" d="M315.85,357.68a2,2,0,1,1-2-2A2,2,0,0,1,315.85,357.68Z"/><path class="cls-9" d="M315.85,357.68a2,2,0,1,1-2-2A2,2,0,0,1,315.85,357.68Z"/><path class="cls-8" d="M318.13,357.68a2,2,0,1,1-2-2A2,2,0,0,1,318.13,357.68Z"/><path class="cls-9" d="M318.13,357.68a2,2,0,1,1-2-2A2,2,0,0,1,318.13,357.68Z"/><circle class="cls-8" cx="318.44" cy="357.56" r="1.98" transform="translate(-60.79 649.84) rotate(-85.93)"/><circle class="cls-9" cx="318.44" cy="357.56" r="1.98" transform="translate(-60.79 649.84) rotate(-85.93)"/><circle class="cls-8" cx="320.72" cy="357.44" r="1.98" transform="translate(-33.66 33.34) rotate(-5.65)"/><circle class="cls-9" cx="320.72" cy="357.44" r="1.98" transform="translate(-33.66 33.34) rotate(-5.65)"/><circle class="cls-8" cx="323" cy="357.32" r="1.98" transform="translate(-56.32 654.17) rotate(-85.93)"/><circle class="cls-9" cx="323" cy="357.32" r="1.98" transform="translate(-56.32 654.17) rotate(-85.93)"/><circle class="cls-8" cx="325.28" cy="357.32" r="1.98" transform="translate(-54.2 656.44) rotate(-85.93)"/><circle class="cls-9" cx="325.28" cy="357.32" r="1.98" transform="translate(-54.2 656.44) rotate(-85.93)"/><path class="cls-8" d="M329.66,357.2a2,2,0,1,1-2-2A2,2,0,0,1,329.66,357.2Z"/><path class="cls-9" d="M329.66,357.2a2,2,0,1,1-2-2A2,2,0,0,1,329.66,357.2Z"/><path class="cls-8" d="M331.94,357.08a2,2,0,1,1-2-2A2,2,0,0,1,331.94,357.08Z"/><path class="cls-9" d="M331.94,357.08a2,2,0,1,1-2-2A2,2,0,0,1,331.94,357.08Z"/><path class="cls-8" d="M334.22,357a2,2,0,1,1-2-2A2,2,0,0,1,334.22,357Z"/><path class="cls-9" d="M334.22,357a2,2,0,1,1-2-2A2,2,0,0,1,334.22,357Z"/><path class="cls-8" d="M336.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,336.5,356.84Z"/><path class="cls-9" d="M336.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,336.5,356.84Z"/><path class="cls-8" d="M338.78,356.72a2,2,0,1,1-2-2A2,2,0,0,1,338.78,356.72Z"/><path class="cls-9" d="M338.78,356.72a2,2,0,1,1-2-2A2,2,0,0,1,338.78,356.72Z"/><path class="cls-8" d="M341.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,341.06,356.6Z"/><path class="cls-9" d="M341.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,341.06,356.6Z"/><path class="cls-8" d="M343.33,356.48a2,2,0,1,1-2-2A2,2,0,0,1,343.33,356.48Z"/><path class="cls-9" d="M343.33,356.48a2,2,0,1,1-2-2A2,2,0,0,1,343.33,356.48Z"/><path class="cls-8" d="M345.61,356.36a2,2,0,1,1-2-2A2,2,0,0,1,345.61,356.36Z"/><path class="cls-9" d="M345.61,356.36a2,2,0,1,1-2-2A2,2,0,0,1,345.61,356.36Z"/><path class="cls-8" d="M347.89,356.24a2,2,0,1,1-2-2A2,2,0,0,1,347.89,356.24Z"/><path class="cls-9" d="M347.89,356.24a2,2,0,1,1-2-2A2,2,0,0,1,347.89,356.24Z"/><path class="cls-8" d="M350.17,356.12a2,2,0,1,1-2-2A2,2,0,0,1,350.17,356.12Z"/><path class="cls-9" d="M350.17,356.12a2,2,0,1,1-2-2A2,2,0,0,1,350.17,356.12Z"/><path class="cls-8" d="M352.57,356a2,2,0,1,1-2-2A2,2,0,0,1,352.57,356Z"/><path class="cls-9" d="M352.57,356a2,2,0,1,1-2-2A2,2,0,0,1,352.57,356Z"/><path class="cls-8" d="M354.85,355.88a2,2,0,1,1-2-2A2,2,0,0,1,354.85,355.88Z"/><path class="cls-9" d="M354.85,355.88a2,2,0,1,1-2-2A2,2,0,0,1,354.85,355.88Z"/><path class="cls-8" d="M357.13,355.76a2,2,0,1,1-2-2A2,2,0,0,1,357.13,355.76Z"/><path class="cls-9" d="M357.13,355.76a2,2,0,1,1-2-2A2,2,0,0,1,357.13,355.76Z"/><circle class="cls-8" cx="357.44" cy="355.64" r="1.98" transform="translate(-22.64 686.96) rotate(-85.93)"/><circle class="cls-9" cx="357.44" cy="355.64" r="1.98" transform="translate(-22.64 686.96) rotate(-85.93)"/><circle class="cls-8" cx="359.72" cy="355.52" r="1.98"/><circle class="cls-9" cx="359.72" cy="355.52" r="1.98"/><circle class="cls-8" cx="362" cy="355.4" r="1.98" transform="translate(-33.26 37.4) rotate(-5.65)"/><circle class="cls-9" cx="362" cy="355.4" r="1.98" transform="translate(-33.26 37.4) rotate(-5.65)"/><circle class="cls-8" cx="364.28" cy="355.28" r="1.98" transform="translate(-15.93 693.45) rotate(-85.93)"/><circle class="cls-9" cx="364.28" cy="355.28" r="1.98" transform="translate(-15.93 693.45) rotate(-85.93)"/><circle class="cls-8" cx="366.56" cy="355.16" r="1.98" transform="translate(-33.21 37.85) rotate(-5.65)"/><circle class="cls-9" cx="366.56" cy="355.16" r="1.98" transform="translate(-33.21 37.85) rotate(-5.65)"/><circle class="cls-8" cx="368.83" cy="355.04" r="1.98" transform="translate(-33.19 38.07) rotate(-5.65)"/><circle class="cls-9" cx="368.83" cy="355.04" r="1.98" transform="translate(-33.19 38.07) rotate(-5.65)"/><circle class="cls-8" cx="371.11" cy="354.92" r="1.98" transform="translate(-9.22 699.94) rotate(-85.93)"/><circle class="cls-9" cx="371.11" cy="354.92" r="1.98" transform="translate(-9.22 699.94) rotate(-85.93)"/><path class="cls-8" d="M375.5,354.68a2,2,0,1,1-2-2A2,2,0,0,1,375.5,354.68Z"/><path class="cls-9" d="M375.5,354.68a2,2,0,1,1-2-2A2,2,0,0,1,375.5,354.68Z"/><path class="cls-8" d="M377.78,354.56a2,2,0,1,1-2-2A2,2,0,0,1,377.78,354.56Z"/><path class="cls-9" d="M377.78,354.56a2,2,0,1,1-2-2A2,2,0,0,1,377.78,354.56Z"/><path class="cls-8" d="M380.06,354.44a2,2,0,1,1-2-2A2,2,0,0,1,380.06,354.44Z"/><path class="cls-9" d="M380.06,354.44a2,2,0,1,1-2-2A2,2,0,0,1,380.06,354.44Z"/><path class="cls-8" d="M382.33,354.2a2,2,0,1,1-2-2A2,2,0,0,1,382.33,354.2Z"/><path class="cls-9" d="M382.33,354.2a2,2,0,1,1-2-2A2,2,0,0,1,382.33,354.2Z"/><path class="cls-8" d="M384.61,354.08a2,2,0,1,1-2-2A2,2,0,0,1,384.61,354.08Z"/><path class="cls-9" d="M384.61,354.08a2,2,0,1,1-2-2A2,2,0,0,1,384.61,354.08Z"/><path class="cls-8" d="M386.89,354a2,2,0,1,1-2-2A2,2,0,0,1,386.89,354Z"/><path class="cls-9" d="M386.89,354a2,2,0,1,1-2-2A2,2,0,0,1,386.89,354Z"/><path class="cls-8" d="M389.17,353.72a2,2,0,1,1-2-2A2,2,0,0,1,389.17,353.72Z"/><path class="cls-9" d="M389.17,353.72a2,2,0,1,1-2-2A2,2,0,0,1,389.17,353.72Z"/><path class="cls-8" d="M391.45,353.6a2,2,0,1,1-2-2A2,2,0,0,1,391.45,353.6Z"/><path class="cls-9" d="M391.45,353.6a2,2,0,1,1-2-2A2,2,0,0,1,391.45,353.6Z"/><path class="cls-8" d="M393.73,353.48a2,2,0,1,1-2-2A2,2,0,0,1,393.73,353.48Z"/><path class="cls-9" d="M393.73,353.48a2,2,0,1,1-2-2A2,2,0,0,1,393.73,353.48Z"/><path class="cls-8" d="M396,353.24a2,2,0,1,1-2-2A2,2,0,0,1,396,353.24Z"/><path class="cls-9" d="M396,353.24a2,2,0,1,1-2-2A2,2,0,0,1,396,353.24Z"/><circle class="cls-8" cx="396.44" cy="353.12" r="1.98" transform="translate(-32.87 40.78) rotate(-5.65)"/><circle class="cls-9" cx="396.44" cy="353.12" r="1.98" transform="translate(-32.87 40.78) rotate(-5.65)"/><circle class="cls-8" cx="398.72" cy="352.88" r="1.98"/><circle class="cls-9" cx="398.72" cy="352.88" r="1.98"/><circle class="cls-8" cx="401" cy="352.76" r="1.98" transform="translate(-32.81 41.23) rotate(-5.65)"/><circle class="cls-9" cx="401" cy="352.76" r="1.98" transform="translate(-32.81 41.23) rotate(-5.65)"/><circle class="cls-8" cx="403.28" cy="352.52" r="1.98"/><circle class="cls-9" cx="403.28" cy="352.52" r="1.98"/><circle class="cls-8" cx="405.56" cy="352.28" r="1.98" transform="translate(25.42 731.84) rotate(-85.93)"/><circle class="cls-9" cx="405.56" cy="352.28" r="1.98" transform="translate(25.42 731.84) rotate(-85.93)"/><circle class="cls-8" cx="407.83" cy="352.16" r="1.98" transform="translate(-32.72 41.9) rotate(-5.65)"/><circle class="cls-9" cx="407.83" cy="352.16" r="1.98" transform="translate(-32.72 41.9) rotate(-5.65)"/><circle class="cls-8" cx="410.11" cy="351.92" r="1.98" transform="translate(30.01 736.05) rotate(-85.93)"/><circle class="cls-9" cx="410.11" cy="351.92" r="1.98" transform="translate(30.01 736.05) rotate(-85.93)"/><path class="cls-8" d="M414.38,351.68a2,2,0,1,1-2-2A2,2,0,0,1,414.38,351.68Z"/><path class="cls-9" d="M414.38,351.68a2,2,0,1,1-2-2A2,2,0,0,1,414.38,351.68Z"/><path class="cls-8" d="M416.66,351.56a2,2,0,1,1-2-2A2,2,0,0,1,416.66,351.56Z"/><path class="cls-9" d="M416.66,351.56a2,2,0,1,1-2-2A2,2,0,0,1,416.66,351.56Z"/><path class="cls-8" d="M418.94,351.32a2,2,0,1,1-2-2A2,2,0,0,1,418.94,351.32Z"/><path class="cls-9" d="M418.94,351.32a2,2,0,1,1-2-2A2,2,0,0,1,418.94,351.32Z"/><path class="cls-8" d="M421.33,351.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,351.08Z"/><path class="cls-9" d="M421.33,351.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,351.08Z"/><path class="cls-8" d="M423.61,351a2,2,0,1,1-2-2A2,2,0,0,1,423.61,351Z"/><path class="cls-9" d="M423.61,351a2,2,0,1,1-2-2A2,2,0,0,1,423.61,351Z"/><path class="cls-8" d="M425.89,350.72a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.72Z"/><path class="cls-9" d="M425.89,350.72a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.72Z"/><path class="cls-8" d="M428.17,350.48a2,2,0,1,1-2-2A2,2,0,0,1,428.17,350.48Z"/><path class="cls-9" d="M428.17,350.48a2,2,0,1,1-2-2A2,2,0,0,1,428.17,350.48Z"/><path class="cls-8" d="M430.45,350.24a2,2,0,1,1-2-2A2,2,0,0,1,430.45,350.24Z"/><path class="cls-9" d="M430.45,350.24a2,2,0,1,1-2-2A2,2,0,0,1,430.45,350.24Z"/><path class="cls-8" d="M432.73,349.88a2,2,0,1,1-2-2A2,2,0,0,1,432.73,349.88Z"/><path class="cls-9" d="M432.73,349.88a2,2,0,1,1-2-2A2,2,0,0,1,432.73,349.88Z"/><path class="cls-8" d="M435,349.64a2,2,0,1,1-2-2A2,2,0,0,1,435,349.64Z"/><path class="cls-9" d="M435,349.64a2,2,0,1,1-2-2A2,2,0,0,1,435,349.64Z"/><path class="cls-8" d="M437.29,349.4a2,2,0,1,1-2-2A2,2,0,0,1,437.29,349.4Z"/><path class="cls-9" d="M437.29,349.4a2,2,0,1,1-2-2A2,2,0,0,1,437.29,349.4Z"/><path class="cls-8" d="M439.57,349.16a2,2,0,1,1-2-2A2,2,0,0,1,439.57,349.16Z"/><path class="cls-9" d="M439.57,349.16a2,2,0,1,1-2-2A2,2,0,0,1,439.57,349.16Z"/><path class="cls-8" d="M441.85,348.92a2,2,0,1,1-2-2A2,2,0,0,1,441.85,348.92Z"/><path class="cls-9" d="M441.85,348.92a2,2,0,1,1-2-2A2,2,0,0,1,441.85,348.92Z"/><circle class="cls-8" cx="442.28" cy="348.68" r="1.98" transform="translate(63.12 765.12) rotate(-85.93)"/><circle class="cls-9" cx="442.28" cy="348.68" r="1.98" transform="translate(63.12 765.12) rotate(-85.93)"/><circle class="cls-8" cx="444.56" cy="348.44" r="1.98" transform="translate(-32.17 45.5) rotate(-5.65)"/><circle class="cls-9" cx="444.56" cy="348.44" r="1.98" transform="translate(-32.17 45.5) rotate(-5.65)"/><circle class="cls-8" cx="446.83" cy="348.2" r="1.98"/><circle class="cls-9" cx="446.83" cy="348.2" r="1.98"/><circle class="cls-8" cx="449.11" cy="347.84" r="1.98" transform="translate(-32.09 45.95) rotate(-5.65)"/><circle class="cls-9" cx="449.11" cy="347.84" r="1.98" transform="translate(-32.09 45.95) rotate(-5.65)"/><path class="cls-8" d="M453.38,347.6a2,2,0,1,1-2-2A2,2,0,0,1,453.38,347.6Z"/><path class="cls-9" d="M453.38,347.6a2,2,0,1,1-2-2A2,2,0,0,1,453.38,347.6Z"/><path class="cls-8" d="M455.66,347.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,347.24Z"/><path class="cls-9" d="M455.66,347.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,347.24Z"/><circle class="cls-8" cx="455.95" cy="347" r="1.98" transform="translate(-18.49 25.75) rotate(-3.17)"/><circle class="cls-9" cx="455.95" cy="347" r="1.98" transform="translate(-18.49 25.75) rotate(-3.17)"/><path class="cls-8" d="M460.22,346.76a2,2,0,1,1-2-2A2,2,0,0,1,460.22,346.76Z"/><path class="cls-9" d="M460.22,346.76a2,2,0,1,1-2-2A2,2,0,0,1,460.22,346.76Z"/><circle class="cls-8" cx="460.51" cy="346.4" r="1.98" transform="translate(93.61 790.74) rotate(-87.4)"/><circle class="cls-9" cx="460.51" cy="346.4" r="1.98" transform="translate(93.61 790.74) rotate(-87.4)"/><path class="cls-8" d="M464.78,346a2,2,0,1,1-2-2A2,2,0,0,1,464.78,346Z"/><path class="cls-9" d="M464.78,346a2,2,0,1,1-2-2A2,2,0,0,1,464.78,346Z"/><path class="cls-8" d="M467.18,345.8a2,2,0,1,1-2-2A2,2,0,0,1,467.18,345.8Z"/><path class="cls-9" d="M467.18,345.8a2,2,0,1,1-2-2A2,2,0,0,1,467.18,345.8Z"/><path class="cls-8" d="M469.45,345.44a2,2,0,1,1-2-2A2,2,0,0,1,469.45,345.44Z"/><path class="cls-9" d="M469.45,345.44a2,2,0,1,1-2-2A2,2,0,0,1,469.45,345.44Z"/><path class="cls-8" d="M471.74,345.08a2,2,0,1,1-2-2A2,2,0,0,1,471.74,345.08Z"/><path class="cls-9" d="M471.74,345.08a2,2,0,1,1-2-2A2,2,0,0,1,471.74,345.08Z"/><circle class="cls-8" cx="472.04" cy="344.84" r="1.98" transform="translate(106.16 800.76) rotate(-87.4)"/><circle class="cls-9" cx="472.04" cy="344.84" r="1.98" transform="translate(106.16 800.76) rotate(-87.4)"/><path class="cls-8" d="M476.3,344.48a2,2,0,1,1-2-2A2,2,0,0,1,476.3,344.48Z"/><path class="cls-9" d="M476.3,344.48a2,2,0,1,1-2-2A2,2,0,0,1,476.3,344.48Z"/><circle class="cls-8" cx="476.6" cy="344" r="1.98"/><circle class="cls-9" cx="476.6" cy="344" r="1.98"/><path class="cls-8" d="M480.86,343.64a2,2,0,1,1-2-2A2,2,0,0,1,480.86,343.64Z"/><path class="cls-9" d="M480.86,343.64a2,2,0,1,1-2-2A2,2,0,0,1,480.86,343.64Z"/><circle class="cls-8" cx="481.16" cy="343.28" r="1.98"/><circle class="cls-9" cx="481.16" cy="343.28" r="1.98"/><path class="cls-8" d="M485.42,342.92a2,2,0,1,1-2-2A2,2,0,0,1,485.42,342.92Z"/><path class="cls-9" d="M485.42,342.92a2,2,0,1,1-2-2A2,2,0,0,1,485.42,342.92Z"/><circle class="cls-8" cx="485.72" cy="342.56" r="1.98"/><circle class="cls-9" cx="485.72" cy="342.56" r="1.98"/><path class="cls-8" d="M490,342.2a2,2,0,1,1-2-2A2,2,0,0,1,490,342.2Z"/><path class="cls-9" d="M490,342.2a2,2,0,1,1-2-2A2,2,0,0,1,490,342.2Z"/><circle class="cls-8" cx="490.39" cy="341.84" r="1.98"/><circle class="cls-9" cx="490.39" cy="341.84" r="1.98"/><path class="cls-8" d="M494.66,341.36a2,2,0,1,1-2-2A2,2,0,0,1,494.66,341.36Z"/><path class="cls-9" d="M494.66,341.36a2,2,0,1,1-2-2A2,2,0,0,1,494.66,341.36Z"/><circle class="cls-8" cx="494.95" cy="341" r="1.98"/><circle class="cls-9" cx="494.95" cy="341" r="1.98"/><path class="cls-8" d="M499.22,340.52a2,2,0,1,1-2-2A2,2,0,0,1,499.22,340.52Z"/><path class="cls-9" d="M499.22,340.52a2,2,0,1,1-2-2A2,2,0,0,1,499.22,340.52Z"/><circle class="cls-8" cx="499.51" cy="340.16" r="1.98" transform="translate(-31.09 50.88) rotate(-5.65)"/><circle class="cls-9" cx="499.51" cy="340.16" r="1.98" transform="translate(-31.09 50.88) rotate(-5.65)"/><path class="cls-8" d="M503.78,339.68a2,2,0,1,1-2-2A2,2,0,0,1,503.78,339.68Z"/><path class="cls-9" d="M503.78,339.68a2,2,0,1,1-2-2A2,2,0,0,1,503.78,339.68Z"/><path class="cls-8" d="M506.06,339.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,339.2Z"/><path class="cls-9" d="M506.06,339.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,339.2Z"/><path class="cls-8" d="M508.33,338.84a2,2,0,1,1-2-2A2,2,0,0,1,508.33,338.84Z"/><path class="cls-9" d="M508.33,338.84a2,2,0,1,1-2-2A2,2,0,0,1,508.33,338.84Z"/><path class="cls-8" d="M510.62,338.36a2,2,0,1,1-2-2A2,2,0,0,1,510.62,338.36Z"/><path class="cls-9" d="M510.62,338.36a2,2,0,1,1-2-2A2,2,0,0,1,510.62,338.36Z"/><path class="cls-8" d="M512.89,337.88a2,2,0,1,1-2-2A2,2,0,0,1,512.89,337.88Z"/><path class="cls-9" d="M512.89,337.88a2,2,0,1,1-2-2A2,2,0,0,1,512.89,337.88Z"/><path class="cls-8" d="M515.3,337.4a2,2,0,1,1-2-2A2,2,0,0,1,515.3,337.4Z"/><path class="cls-9" d="M515.3,337.4a2,2,0,1,1-2-2A2,2,0,0,1,515.3,337.4Z"/><circle class="cls-8" cx="515.6" cy="336.92" r="1.98"/><circle class="cls-9" cx="515.6" cy="336.92" r="1.98"/><path class="cls-8" d="M519.86,336.44a2,2,0,1,1-2-2A2,2,0,0,1,519.86,336.44Z"/><path class="cls-9" d="M519.86,336.44a2,2,0,1,1-2-2A2,2,0,0,1,519.86,336.44Z"/><circle class="cls-8" cx="520.16" cy="335.84" r="1.98"/><circle class="cls-9" cx="520.16" cy="335.84" r="1.98"/><path class="cls-8" d="M524.42,335.36a2,2,0,1,1-2-2A2,2,0,0,1,524.42,335.36Z"/><path class="cls-9" d="M524.42,335.36a2,2,0,1,1-2-2A2,2,0,0,1,524.42,335.36Z"/><circle class="cls-8" cx="524.72" cy="334.88" r="1.98" transform="translate(153.49 834.53) rotate(-85.93)"/><circle class="cls-9" cx="524.72" cy="334.88" r="1.98" transform="translate(153.49 834.53) rotate(-85.93)"/><path class="cls-8" d="M529,334.28a2,2,0,1,1-2-2A2,2,0,0,1,529,334.28Z"/><path class="cls-9" d="M529,334.28a2,2,0,1,1-2-2A2,2,0,0,1,529,334.28Z"/><circle class="cls-8" cx="529.28" cy="333.68" r="1.98" transform="translate(158.92 837.96) rotate(-85.93)"/><circle class="cls-9" cx="529.28" cy="333.68" r="1.98" transform="translate(158.92 837.96) rotate(-85.93)"/><path class="cls-8" d="M533.54,333.2a2,2,0,1,1-2-2A2,2,0,0,1,533.54,333.2Z"/><path class="cls-9" d="M533.54,333.2a2,2,0,1,1-2-2A2,2,0,0,1,533.54,333.2Z"/><circle class="cls-8" cx="533.83" cy="332.6" r="1.98" transform="translate(164.23 841.51) rotate(-85.93)"/><circle class="cls-9" cx="533.83" cy="332.6" r="1.98" transform="translate(164.23 841.51) rotate(-85.93)"/><path class="cls-8" d="M538.22,332a2,2,0,1,1-2-2A2,2,0,0,1,538.22,332Z"/><path class="cls-9" d="M538.22,332a2,2,0,1,1-2-2A2,2,0,0,1,538.22,332Z"/><circle class="cls-8" cx="538.51" cy="331.4" r="1.98"/><circle class="cls-9" cx="538.51" cy="331.4" r="1.98"/><path class="cls-8" d="M542.78,330.8a2,2,0,1,1-2-2A2,2,0,0,1,542.78,330.8Z"/><path class="cls-9" d="M542.78,330.8a2,2,0,1,1-2-2A2,2,0,0,1,542.78,330.8Z"/><path class="cls-8" d="M545.06,330.2a2,2,0,1,1-2-2A2,2,0,0,1,545.06,330.2Z"/><path class="cls-9" d="M545.06,330.2a2,2,0,1,1-2-2A2,2,0,0,1,545.06,330.2Z"/><path class="cls-8" d="M547.33,329.6a2,2,0,1,1-2-2A2,2,0,0,1,547.33,329.6Z"/><path class="cls-9" d="M547.33,329.6a2,2,0,1,1-2-2A2,2,0,0,1,547.33,329.6Z"/><path class="cls-8" d="M549.62,328.88a2,2,0,1,1-2-2A2,2,0,0,1,549.62,328.88Z"/><path class="cls-9" d="M549.62,328.88a2,2,0,1,1-2-2A2,2,0,0,1,549.62,328.88Z"/><path class="cls-8" d="M551.89,328.28a2,2,0,1,1-2-2A2,2,0,0,1,551.89,328.28Z"/><path class="cls-9" d="M551.89,328.28a2,2,0,1,1-2-2A2,2,0,0,1,551.89,328.28Z"/><path class="cls-8" d="M554.18,327.56a2,2,0,1,1-2-2A2,2,0,0,1,554.18,327.56Z"/><path class="cls-9" d="M554.18,327.56a2,2,0,1,1-2-2A2,2,0,0,1,554.18,327.56Z"/><path class="cls-8" d="M556.45,326.84a2,2,0,1,1-2-2A2,2,0,0,1,556.45,326.84Z"/><path class="cls-9" d="M556.45,326.84a2,2,0,1,1-2-2A2,2,0,0,1,556.45,326.84Z"/><path class="cls-8" d="M558.74,326.24a2,2,0,1,1-2-2A2,2,0,0,1,558.74,326.24Z"/><path class="cls-9" d="M558.74,326.24a2,2,0,1,1-2-2A2,2,0,0,1,558.74,326.24Z"/><circle class="cls-8" cx="559.16" cy="325.52" r="1.98" transform="translate(-29.35 56.68) rotate(-5.65)"/><circle class="cls-9" cx="559.16" cy="325.52" r="1.98" transform="translate(-29.35 56.68) rotate(-5.65)"/><path class="cls-8" d="M563.42,324.8a2,2,0,1,1-2-2A2,2,0,0,1,563.42,324.8Z"/><path class="cls-9" d="M563.42,324.8a2,2,0,1,1-2-2A2,2,0,0,1,563.42,324.8Z"/><circle class="cls-8" cx="563.72" cy="323.96" r="1.98" transform="translate(200.61 863.29) rotate(-85.93)"/><circle class="cls-9" cx="563.72" cy="323.96" r="1.98" transform="translate(200.61 863.29) rotate(-85.93)"/><path class="cls-8" d="M568,323.24a2,2,0,1,1-2-2A2,2,0,0,1,568,323.24Z"/><path class="cls-9" d="M568,323.24a2,2,0,1,1-2-2A2,2,0,0,1,568,323.24Z"/><circle class="cls-8" cx="568.28" cy="322.52" r="1.98" transform="translate(-29.01 57.57) rotate(-5.65)"/><circle class="cls-9" cx="568.28" cy="322.52" r="1.98" transform="translate(-29.01 57.57) rotate(-5.65)"/><path class="cls-8" d="M572.54,321.68a2,2,0,1,1-2-2A2,2,0,0,1,572.54,321.68Z"/><path class="cls-9" d="M572.54,321.68a2,2,0,1,1-2-2A2,2,0,0,1,572.54,321.68Z"/><circle class="cls-8" cx="572.83" cy="320.84" r="1.98"/><circle class="cls-9" cx="572.83" cy="320.84" r="1.98"/><path class="cls-8" d="M577.1,320.12a2,2,0,1,1-2-2A2,2,0,0,1,577.1,320.12Z"/><path class="cls-9" d="M577.1,320.12a2,2,0,1,1-2-2A2,2,0,0,1,577.1,320.12Z"/><circle class="cls-8" cx="577.39" cy="319.28" r="1.98"/><circle class="cls-9" cx="577.39" cy="319.28" r="1.98"/><path class="cls-8" d="M581.66,318.32a2,2,0,1,1-2-2A2,2,0,0,1,581.66,318.32Z"/><path class="cls-9" d="M581.66,318.32a2,2,0,1,1-2-2A2,2,0,0,1,581.66,318.32Z"/><path class="cls-8" d="M584.06,317.48a2,2,0,1,1-2-2A2,2,0,0,1,584.06,317.48Z"/><path class="cls-9" d="M584.06,317.48a2,2,0,1,1-2-2A2,2,0,0,1,584.06,317.48Z"/><path class="cls-8" d="M586.33,316.64a2,2,0,1,1-2-2A2,2,0,0,1,586.33,316.64Z"/><path class="cls-9" d="M586.33,316.64a2,2,0,1,1-2-2A2,2,0,0,1,586.33,316.64Z"/><path class="cls-8" d="M588.62,315.8a2,2,0,1,1-2-2A2,2,0,0,1,588.62,315.8Z"/><path class="cls-9" d="M588.62,315.8a2,2,0,1,1-2-2A2,2,0,0,1,588.62,315.8Z"/><path class="cls-8" d="M590.89,314.84a2,2,0,1,1-2-2A2,2,0,0,1,590.89,314.84Z"/><path class="cls-9" d="M590.89,314.84a2,2,0,1,1-2-2A2,2,0,0,1,590.89,314.84Z"/><path class="cls-8" d="M593.18,313.88a2,2,0,1,1-2-2A2,2,0,0,1,593.18,313.88Z"/><path class="cls-9" d="M593.18,313.88a2,2,0,1,1-2-2A2,2,0,0,1,593.18,313.88Z"/><path class="cls-8" d="M595.45,312.92a2,2,0,1,1-2-2A2,2,0,0,1,595.45,312.92Z"/><path class="cls-9" d="M595.45,312.92a2,2,0,1,1-2-2A2,2,0,0,1,595.45,312.92Z"/><path class="cls-8" d="M597.74,312a2,2,0,1,1-2-2A2,2,0,0,1,597.74,312Z"/><path class="cls-9" d="M597.74,312a2,2,0,1,1-2-2A2,2,0,0,1,597.74,312Z"/><circle class="cls-8" cx="598.04" cy="310.88" r="1.98" transform="translate(245.55 885.37) rotate(-85.93)"/><circle class="cls-9" cx="598.04" cy="310.88" r="1.98" transform="translate(245.55 885.37) rotate(-85.93)"/><path class="cls-8" d="M602.3,309.92a2,2,0,1,1-2-2A2,2,0,0,1,602.3,309.92Z"/><path class="cls-9" d="M602.3,309.92a2,2,0,1,1-2-2A2,2,0,0,1,602.3,309.92Z"/><circle class="cls-8" cx="602.6" cy="308.84" r="1.98"/><circle class="cls-9" cx="602.6" cy="308.84" r="1.98"/><path class="cls-8" d="M607,307.76a2,2,0,1,1-2-2A2,2,0,0,1,607,307.76Z"/><path class="cls-9" d="M607,307.76a2,2,0,1,1-2-2A2,2,0,0,1,607,307.76Z"/><circle class="cls-8" cx="607.28" cy="306.68" r="1.98" transform="translate(258.32 890.68) rotate(-85.93)"/><circle class="cls-9" cx="607.28" cy="306.68" r="1.98" transform="translate(258.32 890.68) rotate(-85.93)"/><path class="cls-8" d="M611.54,305.6a2,2,0,1,1-2-2A2,2,0,0,1,611.54,305.6Z"/><path class="cls-9" d="M611.54,305.6a2,2,0,1,1-2-2A2,2,0,0,1,611.54,305.6Z"/><circle class="cls-8" cx="611.83" cy="304.52" r="1.98" transform="translate(-27.03 61.77) rotate(-5.65)"/><circle class="cls-9" cx="611.83" cy="304.52" r="1.98" transform="translate(-27.03 61.77) rotate(-5.65)"/><path class="cls-8" d="M616.1,303.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,303.32Z"/><path class="cls-9" d="M616.1,303.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,303.32Z"/><circle class="cls-8" cx="616.39" cy="302.12" r="1.98"/><circle class="cls-9" cx="616.39" cy="302.12" r="1.98"/><path class="cls-8" d="M620.66,301a2,2,0,1,1-2-2A2,2,0,0,1,620.66,301Z"/><path class="cls-9" d="M620.66,301a2,2,0,1,1-2-2A2,2,0,0,1,620.66,301Z"/><circle class="cls-8" cx="620.95" cy="299.84" r="1.98"/><circle class="cls-9" cx="620.95" cy="299.84" r="1.98"/><path class="cls-8" d="M625.22,298.52a2,2,0,1,1-2-2A2,2,0,0,1,625.22,298.52Z"/><path class="cls-9" d="M625.22,298.52a2,2,0,1,1-2-2A2,2,0,0,1,625.22,298.52Z"/><circle class="cls-8" cx="625.51" cy="297.32" r="1.98" transform="translate(284.61 900.18) rotate(-85.93)"/><circle class="cls-9" cx="625.51" cy="297.32" r="1.98" transform="translate(284.61 900.18) rotate(-85.93)"/><path class="cls-8" d="M629.89,296a2,2,0,1,1-2-2A2,2,0,0,1,629.89,296Z"/><path class="cls-9" d="M629.89,296a2,2,0,1,1-2-2A2,2,0,0,1,629.89,296Z"/><path class="cls-8" d="M632.18,294.68a2,2,0,1,1-2-2A2,2,0,0,1,632.18,294.68Z"/><path class="cls-9" d="M632.18,294.68a2,2,0,1,1-2-2A2,2,0,0,1,632.18,294.68Z"/><path class="cls-8" d="M634.45,293.36a2,2,0,1,1-2-2A2,2,0,0,1,634.45,293.36Z"/><path class="cls-9" d="M634.45,293.36a2,2,0,1,1-2-2A2,2,0,0,1,634.45,293.36Z"/><path class="cls-8" d="M636.74,292a2,2,0,1,1-2-2A2,2,0,0,1,636.74,292Z"/><path class="cls-9" d="M636.74,292a2,2,0,1,1-2-2A2,2,0,0,1,636.74,292Z"/><circle class="cls-8" cx="637.04" cy="290.6" r="1.98" transform="translate(302.01 905.43) rotate(-85.93)"/><circle class="cls-9" cx="637.04" cy="290.6" r="1.98" transform="translate(302.01 905.43) rotate(-85.93)"/><path class="cls-8" d="M641.3,289.28a2,2,0,1,1-2-2A2,2,0,0,1,641.3,289.28Z"/><path class="cls-9" d="M641.3,289.28a2,2,0,1,1-2-2A2,2,0,0,1,641.3,289.28Z"/><g class="cls-3"><polyline class="cls-10" points="54.95 363.25 57.26 363.13 59.53 363.13 61.81 363.01 64.09 362.89 66.38 362.89 68.66 362.77 71.06 362.65 73.33 362.54 75.61 362.42 77.89 362.3 80.17 362.18 82.45 362.06 84.73 361.94 87.02 361.69 89.3 361.57 91.58 361.45 93.86 361.33 96.25 361.1 98.53 360.98 100.81 360.74 103.09 360.62 105.38 360.5 107.66 360.25 109.94 360.13 112.22 359.89 114.5 359.77 116.78 359.54 119.17 359.42 121.45 359.18 123.73 359.06 126.02 358.81 128.29 358.69 130.57 358.45 132.85 358.33 135.13 358.1 137.41 357.98 139.69 357.74 142.09 357.5 144.38 357.38 146.66 357.13 148.94 357.01 151.22 356.77 153.5 356.65 155.78 356.42 158.06 356.18 160.34 356.06 162.62 355.81 165.01 355.69 167.29 355.45 169.57 355.33 171.85 355.1 174.13 354.86 176.41 354.74 178.69 354.5 180.97 354.38 183.25 354.13 185.53 354.01 187.94 353.77 190.22 353.65 192.5 353.42 194.78 353.18 197.06 353.06 199.34 352.81 201.62 352.69 203.9 352.45 206.18 352.33 208.46 352.1 210.85 351.98 213.13 351.74 215.41 351.62 217.69 351.38 219.97 351.25 222.25 351.01 224.53 350.89 226.81 350.65 229.09 350.54 231.38 350.3 233.78 350.18 236.06 349.94 238.34 349.81 240.62 349.57 242.9 349.45 245.18 349.21 247.46 349.1 249.74 348.86 252.01 348.74 254.29 348.5 256.69 348.38 258.98 348.25 261.25 348.01 263.54 347.89 265.81 347.65 268.1 347.54 270.38 347.3 272.65 347.18 274.94 346.81 277.21 346.45 279.62 346.21 281.89 345.86 284.18 345.5 286.45 345.13 288.74 344.77 291.01 344.54 293.3 344.18 295.57 343.81 297.86 343.45 300.13 343.21 302.42 342.86 304.81 342.5 307.1 342.13 309.38 341.89 311.65 341.54 313.94 341.18 316.21 340.81 318.5 340.57 320.77 340.21 323.06 339.86 325.33 339.5 327.74 339.25 330.01 338.89 332.3 338.42 334.57 337.94 336.86 337.45 339.13 336.98 341.42 336.5 343.69 335.89 345.98 335.42 348.25 334.94 350.65 334.45 352.94 333.98 355.21 333.5 357.5 333.01 359.77 332.54 362.06 332.06 364.33 331.57 366.62 331.1 368.89 330.62 371.18 329.89 373.57 329.3 375.86 328.57 378.13 327.98 380.42 327.25 382.69 326.65 384.98 326.06 387.25 325.33 389.54 324.74 391.81 324.01 394.1 323.42 396.5 322.69 398.77 322.1 401.06 321.25 403.33 320.54 405.62 319.69 407.89 318.86 410.18 318.01 412.45 317.3 414.74 316.45 417.01 315.62 419.42 314.77 421.69 313.94 423.98 312.98 426.25 312.01 428.54 311.06 430.81 310.1 433.1 309.13 435.38 308.18 437.65 307.21 439.94 306.13 442.33 305.18 444.62 304.1 446.89 302.89 449.18 301.81 451.45 300.74 453.74 299.54 456.01 298.45 458.3 297.25 460.57 295.94 462.86 294.74 465.25 293.42 467.54 292.1 469.81 290.77 472.1 289.45 474.38 288.01 476.65 286.57 478.94 285.13 481.21 283.69 483.5 282.25 485.77 280.69 488.06 279.13 490.45 277.45 492.74 275.89 495.01 274.21 497.3 272.65 499.57 270.86 501.86 269.06 504.13 267.25 506.42 265.45 508.69 263.77 510.98 261.74 513.38 259.81 515.65 257.89 517.93 255.97 520.22 253.81 522.5 251.78 524.77 249.62 527.05 247.57 529.34 245.29 531.62 243.01 533.89 240.74 536.29 238.46 538.58 236.06 540.86 233.53 543.13 231.13 545.41 228.62 547.7 225.97 549.98 223.34 552.25 220.69 554.53 217.94 556.82 215.18 559.22 212.18 561.5 209.29 563.77 206.41 566.05 203.29 568.34 200.18 570.62 197.18 572.89 193.94 575.17 190.69 577.46 187.22 579.74 183.85 582.13 180.5 584.41 176.9 586.7 173.29 588.98 169.57 591.25 165.85 593.53 162.01 595.82 158.06 598.1 154.09 600.38 150.01 602.65 145.81 605.05 141.62 607.34 137.18 609.62 132.85 611.89 128.29 614.17 123.73 616.46 119.06 618.74 114.38 621.01 109.58 623.29 104.66 625.58 99.61 627.98 94.45 630.25 89.3 632.53 83.89 634.82 78.38 637.1 72.86 639.38 67.25"/></g><circle class="cls-11" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-12" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-11" cx="57.26" cy="363.14" r="2.52"/><circle class="cls-12" cx="57.26" cy="363.14" r="2.52"/><circle class="cls-11" cx="59.54" cy="363.14" r="2.52"/><circle class="cls-12" cx="59.54" cy="363.14" r="2.52"/><circle class="cls-11" cx="61.82" cy="363.02" r="2.52"/><circle class="cls-12" cx="61.82" cy="363.02" r="2.52"/><circle class="cls-11" cx="64.1" cy="362.9" r="2.52"/><circle class="cls-12" cx="64.1" cy="362.9" r="2.52"/><circle class="cls-11" cx="66.38" cy="362.9" r="2.52"/><circle class="cls-12" cx="66.38" cy="362.9" r="2.52"/><circle class="cls-11" cx="68.65" cy="362.78" r="2.52"/><circle class="cls-12" cx="68.65" cy="362.78" r="2.52"/><circle class="cls-11" cx="71.06" cy="362.66" r="2.52"/><circle class="cls-12" cx="71.06" cy="362.66" r="2.52"/><circle class="cls-11" cx="73.33" cy="362.54" r="2.52"/><circle class="cls-12" cx="73.33" cy="362.54" r="2.52"/><circle class="cls-11" cx="75.62" cy="362.42" r="2.52"/><circle class="cls-12" cx="75.62" cy="362.42" r="2.52"/><circle class="cls-11" cx="77.9" cy="362.3" r="2.52" transform="translate(-290.31 404.11) rotate(-84.35)"/><circle class="cls-12" cx="77.9" cy="362.3" r="2.52" transform="translate(-290.31 404.11) rotate(-84.35)"/><circle class="cls-11" cx="80.17" cy="362.18" r="2.52"/><circle class="cls-12" cx="80.17" cy="362.18" r="2.52"/><circle class="cls-11" cx="82.45" cy="362.06" r="2.52"/><circle class="cls-12" cx="82.45" cy="362.06" r="2.52"/><circle class="cls-11" cx="84.74" cy="361.94" r="2.52"/><circle class="cls-12" cx="84.74" cy="361.94" r="2.52"/><circle class="cls-11" cx="87.02" cy="361.7" r="2.52"/><circle class="cls-12" cx="87.02" cy="361.7" r="2.52"/><circle class="cls-11" cx="89.3" cy="361.58" r="2.52"/><circle class="cls-12" cx="89.3" cy="361.58" r="2.52"/><circle class="cls-11" cx="91.58" cy="361.46" r="2.52"/><circle class="cls-12" cx="91.58" cy="361.46" r="2.52"/><circle class="cls-11" cx="93.86" cy="361.34" r="2.52"/><circle class="cls-12" cx="93.86" cy="361.34" r="2.52"/><circle class="cls-11" cx="96.26" cy="361.1" r="2.52"/><circle class="cls-12" cx="96.26" cy="361.1" r="2.52"/><circle class="cls-11" cx="98.53" cy="360.98" r="2.52" transform="translate(-270.39 423.46) rotate(-84.35)"/><circle class="cls-12" cx="98.53" cy="360.98" r="2.52" transform="translate(-270.39 423.46) rotate(-84.35)"/><circle class="cls-11" cx="100.81" cy="360.74" r="2.52"/><circle class="cls-12" cx="100.81" cy="360.74" r="2.52"/><circle class="cls-11" cx="103.1" cy="360.62" r="2.52"/><circle class="cls-12" cx="103.1" cy="360.62" r="2.52"/><circle class="cls-11" cx="105.38" cy="360.5" r="2.52"/><circle class="cls-12" cx="105.38" cy="360.5" r="2.52"/><circle class="cls-11" cx="107.66" cy="360.26" r="2.52"/><circle class="cls-12" cx="107.66" cy="360.26" r="2.52"/><circle class="cls-11" cx="109.94" cy="360.14" r="2.52"/><circle class="cls-12" cx="109.94" cy="360.14" r="2.52"/><circle class="cls-11" cx="112.22" cy="359.9" r="2.52"/><circle class="cls-12" cx="112.22" cy="359.9" r="2.52"/><circle class="cls-11" cx="114.5" cy="359.78" r="2.52"/><circle class="cls-12" cx="114.5" cy="359.78" r="2.52"/><circle class="cls-11" cx="116.78" cy="359.54" r="2.52"/><circle class="cls-12" cx="116.78" cy="359.54" r="2.52"/><circle class="cls-11" cx="119.17" cy="359.42" r="2.52"/><circle class="cls-12" cx="119.17" cy="359.42" r="2.52"/><circle class="cls-11" cx="121.45" cy="359.18" r="2.52"/><circle class="cls-12" cx="121.45" cy="359.18" r="2.52"/><circle class="cls-11" cx="123.74" cy="359.06" r="2.52"/><circle class="cls-12" cx="123.74" cy="359.06" r="2.52"/><circle class="cls-11" cx="126.02" cy="358.82" r="2.52"/><circle class="cls-12" cx="126.02" cy="358.82" r="2.52"/><circle class="cls-11" cx="128.3" cy="358.7" r="2.52"/><circle class="cls-12" cx="128.3" cy="358.7" r="2.52"/><circle class="cls-11" cx="130.58" cy="358.46" r="2.52"/><circle class="cls-12" cx="130.58" cy="358.46" r="2.52"/><circle class="cls-11" cx="132.86" cy="358.34" r="2.52"/><circle class="cls-12" cx="132.86" cy="358.34" r="2.52"/><circle class="cls-11" cx="135.14" cy="358.1" r="2.52"/><circle class="cls-12" cx="135.14" cy="358.1" r="2.52"/><circle class="cls-11" cx="137.42" cy="357.98" r="2.52"/><circle class="cls-12" cx="137.42" cy="357.98" r="2.52"/><circle class="cls-11" cx="139.69" cy="357.74" r="2.52"/><circle class="cls-12" cx="139.69" cy="357.74" r="2.52"/><circle class="cls-11" cx="142.1" cy="357.5" r="2.52"/><circle class="cls-12" cx="142.1" cy="357.5" r="2.52"/><circle class="cls-11" cx="144.38" cy="357.38" r="2.52"/><circle class="cls-12" cx="144.38" cy="357.38" r="2.52"/><circle class="cls-11" cx="146.66" cy="357.14" r="2.52"/><circle class="cls-12" cx="146.66" cy="357.14" r="2.52"/><circle class="cls-11" cx="148.94" cy="357.02" r="2.52"/><circle class="cls-12" cx="148.94" cy="357.02" r="2.52"/><circle class="cls-11" cx="151.22" cy="356.78" r="2.52"/><circle class="cls-12" cx="151.22" cy="356.78" r="2.52"/><circle class="cls-11" cx="153.5" cy="356.66" r="2.52"/><circle class="cls-12" cx="153.5" cy="356.66" r="2.52"/><circle class="cls-11" cx="155.78" cy="356.42" r="2.52"/><circle class="cls-12" cx="155.78" cy="356.42" r="2.52"/><circle class="cls-11" cx="158.06" cy="356.18" r="2.52"/><circle class="cls-12" cx="158.06" cy="356.18" r="2.52"/><circle class="cls-11" cx="160.33" cy="356.06" r="2.52"/><circle class="cls-12" cx="160.33" cy="356.06" r="2.52"/><circle class="cls-11" cx="162.62" cy="355.82" r="2.52"/><circle class="cls-12" cx="162.62" cy="355.82" r="2.52"/><circle class="cls-11" cx="165.02" cy="355.7" r="2.52"/><circle class="cls-12" cx="165.02" cy="355.7" r="2.52"/><circle class="cls-11" cx="167.3" cy="355.46" r="2.52"/><circle class="cls-12" cx="167.3" cy="355.46" r="2.52"/><circle class="cls-11" cx="169.58" cy="355.34" r="2.52"/><circle class="cls-12" cx="169.58" cy="355.34" r="2.52"/><circle class="cls-11" cx="171.86" cy="355.1" r="2.52"/><circle class="cls-12" cx="171.86" cy="355.1" r="2.52"/><circle class="cls-11" cx="174.14" cy="354.86" r="2.52"/><circle class="cls-12" cx="174.14" cy="354.86" r="2.52"/><circle class="cls-11" cx="176.42" cy="354.74" r="2.52"/><circle class="cls-12" cx="176.42" cy="354.74" r="2.52"/><circle class="cls-11" cx="178.69" cy="354.5" r="2.52"/><circle class="cls-12" cx="178.69" cy="354.5" r="2.52"/><circle class="cls-11" cx="180.97" cy="354.38" r="2.52"/><circle class="cls-12" cx="180.97" cy="354.38" r="2.52"/><circle class="cls-11" cx="183.26" cy="354.14" r="2.52"/><circle class="cls-12" cx="183.26" cy="354.14" r="2.52"/><circle class="cls-11" cx="185.53" cy="354.02" r="2.52"/><circle class="cls-12" cx="185.53" cy="354.02" r="2.52"/><circle class="cls-11" cx="187.94" cy="353.78" r="2.52"/><circle class="cls-12" cx="187.94" cy="353.78" r="2.52"/><circle class="cls-11" cx="190.22" cy="353.66" r="2.52"/><circle class="cls-12" cx="190.22" cy="353.66" r="2.52"/><circle class="cls-11" cx="192.5" cy="353.42" r="2.52"/><circle class="cls-12" cx="192.5" cy="353.42" r="2.52"/><circle class="cls-11" cx="194.78" cy="353.18" r="2.52"/><circle class="cls-12" cx="194.78" cy="353.18" r="2.52"/><circle class="cls-11" cx="197.05" cy="353.06" r="2.52"/><circle class="cls-12" cx="197.05" cy="353.06" r="2.52"/><circle class="cls-11" cx="199.33" cy="352.82" r="2.52"/><circle class="cls-12" cx="199.33" cy="352.82" r="2.52"/><circle class="cls-11" cx="201.61" cy="352.7" r="2.52"/><circle class="cls-12" cx="201.61" cy="352.7" r="2.52"/><circle class="cls-11" cx="203.9" cy="352.46" r="2.52"/><circle class="cls-12" cx="203.9" cy="352.46" r="2.52"/><circle class="cls-11" cx="206.18" cy="352.34" r="2.52"/><circle class="cls-12" cx="206.18" cy="352.34" r="2.52"/><circle class="cls-11" cx="208.46" cy="352.1" r="2.52"/><circle class="cls-12" cx="208.46" cy="352.1" r="2.52"/><circle class="cls-11" cx="210.85" cy="351.98" r="2.52"/><circle class="cls-12" cx="210.85" cy="351.98" r="2.52"/><circle class="cls-11" cx="213.13" cy="351.74" r="2.52"/><circle class="cls-12" cx="213.13" cy="351.74" r="2.52"/><circle class="cls-11" cx="215.42" cy="351.62" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.6, 22.94)"/><circle class="cls-12" cx="215.42" cy="351.62" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.6, 22.94)"/><circle class="cls-11" cx="217.7" cy="351.38" r="2.52"/><circle class="cls-12" cx="217.7" cy="351.38" r="2.52"/><circle class="cls-11" cx="219.98" cy="351.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.54, 23.39)"/><circle class="cls-12" cx="219.98" cy="351.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.54, 23.39)"/><circle class="cls-11" cx="222.26" cy="351.02" r="2.52"/><circle class="cls-12" cx="222.26" cy="351.02" r="2.52"/><circle class="cls-11" cx="224.54" cy="350.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.48, 23.83)"/><circle class="cls-12" cx="224.54" cy="350.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.48, 23.83)"/><circle class="cls-11" cx="226.82" cy="350.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.45, 24.06)"/><circle class="cls-12" cx="226.82" cy="350.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.45, 24.06)"/><circle class="cls-11" cx="229.1" cy="350.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.43, 24.28)"/><circle class="cls-12" cx="229.1" cy="350.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.43, 24.28)"/><circle class="cls-11" cx="231.38" cy="350.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.39, 24.5)"/><circle class="cls-12" cx="231.38" cy="350.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.39, 24.5)"/><circle class="cls-11" cx="233.78" cy="350.18" r="2.52" transform="translate(-33.37 24.74) rotate(-5.65)"/><circle class="cls-12" cx="233.78" cy="350.18" r="2.52" transform="translate(-33.37 24.74) rotate(-5.65)"/><circle class="cls-11" cx="236.06" cy="349.94" r="2.52" transform="translate(-129.73 560.59) rotate(-85.93)"/><circle class="cls-12" cx="236.06" cy="349.94" r="2.52" transform="translate(-129.73 560.59) rotate(-85.93)"/><circle class="cls-11" cx="238.33" cy="349.82" r="2.52" transform="translate(-33.31 25.19) rotate(-5.65)"/><circle class="cls-12" cx="238.33" cy="349.82" r="2.52" transform="translate(-33.31 25.19) rotate(-5.65)"/><circle class="cls-11" cx="240.61" cy="349.58" r="2.52" transform="translate(-125.14 564.8) rotate(-85.93)"/><circle class="cls-12" cx="240.61" cy="349.58" r="2.52" transform="translate(-125.14 564.8) rotate(-85.93)"/><circle class="cls-11" cx="242.9" cy="349.46" r="2.52"/><circle class="cls-12" cx="242.9" cy="349.46" r="2.52"/><circle class="cls-11" cx="245.18" cy="349.22" r="2.52"/><circle class="cls-12" cx="245.18" cy="349.22" r="2.52"/><circle class="cls-11" cx="247.46" cy="349.1" r="2.52"/><circle class="cls-12" cx="247.46" cy="349.1" r="2.52"/><circle class="cls-11" cx="249.74" cy="348.86" r="2.52"/><circle class="cls-12" cx="249.74" cy="348.86" r="2.52"/><circle class="cls-11" cx="252.01" cy="348.74" r="2.52"/><circle class="cls-12" cx="252.01" cy="348.74" r="2.52"/><circle class="cls-11" cx="254.29" cy="348.5" r="2.52"/><circle class="cls-12" cx="254.29" cy="348.5" r="2.52"/><circle class="cls-11" cx="256.7" cy="348.38" r="2.52"/><circle class="cls-12" cx="256.7" cy="348.38" r="2.52"/><circle class="cls-11" cx="258.98" cy="348.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.06, 27.21)"/><circle class="cls-12" cx="258.98" cy="348.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.06, 27.21)"/><circle class="cls-11" cx="261.26" cy="348.02" r="2.52"/><circle class="cls-12" cx="261.26" cy="348.02" r="2.52"/><circle class="cls-11" cx="263.54" cy="347.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33, 27.66)"/><circle class="cls-12" cx="263.54" cy="347.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33, 27.66)"/><circle class="cls-11" cx="265.82" cy="347.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.96, 27.88)"/><circle class="cls-12" cx="265.82" cy="347.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.96, 27.88)"/><circle class="cls-11" cx="268.1" cy="347.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.94, 28.11)"/><circle class="cls-12" cx="268.1" cy="347.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.94, 28.11)"/><circle class="cls-11" cx="270.38" cy="347.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.91, 28.33)"/><circle class="cls-12" cx="270.38" cy="347.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.91, 28.33)"/><circle class="cls-11" cx="272.65" cy="347.18" r="2.52"/><circle class="cls-12" cx="272.65" cy="347.18" r="2.52"/><circle class="cls-11" cx="274.94" cy="346.82" r="2.52"/><circle class="cls-12" cx="274.94" cy="346.82" r="2.52"/><circle class="cls-11" cx="277.22" cy="346.46" r="2.52"/><circle class="cls-12" cx="277.22" cy="346.46" r="2.52"/><circle class="cls-11" cx="279.61" cy="346.22" r="2.52" transform="translate(-85.55 600.58) rotate(-85.93)"/><circle class="cls-12" cx="279.61" cy="346.22" r="2.52" transform="translate(-85.55 600.58) rotate(-85.93)"/><circle class="cls-11" cx="281.9" cy="345.86" r="2.52"/><circle class="cls-12" cx="281.9" cy="345.86" r="2.52"/><circle class="cls-11" cx="284.18" cy="345.5" r="2.52"/><circle class="cls-12" cx="284.18" cy="345.5" r="2.52"/><circle class="cls-11" cx="286.46" cy="345.14" r="2.52"/><circle class="cls-12" cx="286.46" cy="345.14" r="2.52"/><circle class="cls-11" cx="288.74" cy="344.78" r="2.52"/><circle class="cls-12" cx="288.74" cy="344.78" r="2.52"/><circle class="cls-11" cx="291.01" cy="344.54" r="2.52"/><circle class="cls-12" cx="291.01" cy="344.54" r="2.52"/><circle class="cls-11" cx="293.29" cy="344.18" r="2.52"/><circle class="cls-12" cx="293.29" cy="344.18" r="2.52"/><circle class="cls-11" cx="295.57" cy="343.82" r="2.52"/><circle class="cls-12" cx="295.57" cy="343.82" r="2.52"/><circle class="cls-11" cx="297.85" cy="343.46" r="2.52"/><circle class="cls-12" cx="297.85" cy="343.46" r="2.52"/><circle class="cls-11" cx="300.13" cy="343.22" r="2.52"/><circle class="cls-12" cx="300.13" cy="343.22" r="2.52"/><circle class="cls-11" cx="302.42" cy="342.86" r="2.52" transform="translate(-61.02 620.2) rotate(-85.93)"/><circle class="cls-12" cx="302.42" cy="342.86" r="2.52" transform="translate(-61.02 620.2) rotate(-85.93)"/><circle class="cls-11" cx="304.82" cy="342.5" r="2.52" transform="translate(-58.43 622.26) rotate(-85.93)"/><circle class="cls-12" cx="304.82" cy="342.5" r="2.52" transform="translate(-58.43 622.26) rotate(-85.93)"/><circle class="cls-11" cx="307.1" cy="342.14" r="2.52"/><circle class="cls-12" cx="307.1" cy="342.14" r="2.52"/><circle class="cls-11" cx="309.38" cy="341.9" r="2.52"/><circle class="cls-12" cx="309.38" cy="341.9" r="2.52"/><circle class="cls-11" cx="311.65" cy="341.54" r="2.52" transform="translate(-50.68 54.33) rotate(-9.22)"/><circle class="cls-12" cx="311.65" cy="341.54" r="2.52" transform="translate(-50.68 54.33) rotate(-9.22)"/><circle class="cls-11" cx="313.94" cy="341.18" r="2.52" transform="translate(-73.12 596.41) rotate(-80.78)"/><circle class="cls-12" cx="313.94" cy="341.18" r="2.52" transform="translate(-73.12 596.41) rotate(-80.78)"/><circle class="cls-11" cx="316.22" cy="340.82" r="2.52" transform="translate(-70.85 598.35) rotate(-80.78)"/><circle class="cls-12" cx="316.22" cy="340.82" r="2.52" transform="translate(-70.85 598.35) rotate(-80.78)"/><circle class="cls-11" cx="318.5" cy="340.58" r="2.52"/><circle class="cls-12" cx="318.5" cy="340.58" r="2.52"/><circle class="cls-11" cx="320.78" cy="340.22" r="2.52"/><circle class="cls-12" cx="320.78" cy="340.22" r="2.52"/><circle class="cls-11" cx="323.06" cy="339.86" r="2.52"/><circle class="cls-12" cx="323.06" cy="339.86" r="2.52"/><circle class="cls-11" cx="325.33" cy="339.5" r="2.52"/><circle class="cls-12" cx="325.33" cy="339.5" r="2.52"/><circle class="cls-11" cx="327.74" cy="339.26" r="2.52"/><circle class="cls-12" cx="327.74" cy="339.26" r="2.52"/><circle class="cls-11" cx="330.01" cy="338.9" r="2.52"/><circle class="cls-12" cx="330.01" cy="338.9" r="2.52"/><circle class="cls-11" cx="332.29" cy="338.42" r="2.52"/><circle class="cls-12" cx="332.29" cy="338.42" r="2.52"/><circle class="cls-11" cx="334.57" cy="337.94" r="2.52"/><circle class="cls-12" cx="334.57" cy="337.94" r="2.52"/><circle class="cls-11" cx="336.85" cy="337.46" r="2.52"/><circle class="cls-12" cx="336.85" cy="337.46" r="2.52"/><circle class="cls-11" cx="339.13" cy="336.98" r="2.52"/><circle class="cls-12" cx="339.13" cy="336.98" r="2.52"/><circle class="cls-11" cx="341.42" cy="336.5" r="2.52"/><circle class="cls-12" cx="341.42" cy="336.5" r="2.52"/><circle class="cls-11" cx="343.7" cy="335.9" r="2.52"/><circle class="cls-12" cx="343.7" cy="335.9" r="2.52"/><circle class="cls-11" cx="345.98" cy="335.42" r="2.52"/><circle class="cls-12" cx="345.98" cy="335.42" r="2.52"/><circle class="cls-11" cx="348.26" cy="334.94" r="2.52" transform="translate(-19.37 648.49) rotate(-84.34)"/><circle class="cls-12" cx="348.26" cy="334.94" r="2.52" transform="translate(-19.37 648.49) rotate(-84.34)"/><circle class="cls-11" cx="350.65" cy="334.46" r="2.52" transform="translate(-49.05 60.49) rotate(-9.22)"/><circle class="cls-12" cx="350.65" cy="334.46" r="2.52" transform="translate(-49.05 60.49) rotate(-9.22)"/><circle class="cls-11" cx="352.94" cy="333.98" r="2.52"/><circle class="cls-12" cx="352.94" cy="333.98" r="2.52"/><circle class="cls-11" cx="355.22" cy="333.5" r="2.52"/><circle class="cls-12" cx="355.22" cy="333.5" r="2.52"/><circle class="cls-11" cx="357.5" cy="333.02" r="2.52"/><circle class="cls-12" cx="357.5" cy="333.02" r="2.52"/><circle class="cls-11" cx="359.78" cy="332.54" r="2.52" transform="translate(-26.1 634.4) rotate(-80.78)"/><circle class="cls-12" cx="359.78" cy="332.54" r="2.52" transform="translate(-26.1 634.4) rotate(-80.78)"/><circle class="cls-11" cx="362.06" cy="332.06" r="2.52"/><circle class="cls-12" cx="362.06" cy="332.06" r="2.52"/><circle class="cls-11" cx="364.33" cy="331.58" r="2.52"/><circle class="cls-12" cx="364.33" cy="331.58" r="2.52"/><circle class="cls-11" cx="366.61" cy="331.1" r="2.52" transform="translate(-18.93 639.94) rotate(-80.78)"/><circle class="cls-12" cx="366.61" cy="331.1" r="2.52" transform="translate(-18.93 639.94) rotate(-80.78)"/><circle class="cls-11" cx="368.9" cy="330.62" r="2.52"/><circle class="cls-12" cx="368.9" cy="330.62" r="2.52"/><circle class="cls-11" cx="371.18" cy="329.9" r="2.52"/><circle class="cls-12" cx="371.18" cy="329.9" r="2.52"/><circle class="cls-11" cx="373.57" cy="329.3" r="2.52"/><circle class="cls-12" cx="373.57" cy="329.3" r="2.52"/><circle class="cls-11" cx="375.85" cy="328.58" r="2.52"/><circle class="cls-12" cx="375.85" cy="328.58" r="2.52"/><circle class="cls-11" cx="378.13" cy="327.98" r="2.52"/><circle class="cls-12" cx="378.13" cy="327.98" r="2.52"/><circle class="cls-11" cx="380.42" cy="327.26" r="2.52"/><circle class="cls-12" cx="380.42" cy="327.26" r="2.52"/><circle class="cls-11" cx="382.7" cy="326.66" r="2.52" transform="translate(19.92 675.3) rotate(-84.34)"/><circle class="cls-12" cx="382.7" cy="326.66" r="2.52" transform="translate(19.92 675.3) rotate(-84.34)"/><circle class="cls-11" cx="384.98" cy="326.06" r="2.52"/><circle class="cls-12" cx="384.98" cy="326.06" r="2.52"/><circle class="cls-11" cx="387.26" cy="325.34" r="2.52"/><circle class="cls-12" cx="387.26" cy="325.34" r="2.52"/><circle class="cls-11" cx="389.54" cy="324.74" r="2.52" transform="translate(-46.99 66.59) rotate(-9.22)"/><circle class="cls-12" cx="389.54" cy="324.74" r="2.52" transform="translate(-46.99 66.59) rotate(-9.22)"/><circle class="cls-11" cx="391.82" cy="324.02" r="2.52" transform="translate(30.77 682) rotate(-84.34)"/><circle class="cls-12" cx="391.82" cy="324.02" r="2.52" transform="translate(30.77 682) rotate(-84.34)"/><circle class="cls-11" cx="394.1" cy="323.42" r="2.52"/><circle class="cls-12" cx="394.1" cy="323.42" r="2.52"/><circle class="cls-11" cx="396.5" cy="322.7" r="2.52"/><circle class="cls-12" cx="396.5" cy="322.7" r="2.52"/><circle class="cls-11" cx="398.78" cy="322.1" r="2.52" transform="translate(16.96 664.13) rotate(-80.78)"/><circle class="cls-12" cx="398.78" cy="322.1" r="2.52" transform="translate(16.96 664.13) rotate(-80.78)"/><circle class="cls-11" cx="401.06" cy="321.26" r="2.52"/><circle class="cls-12" cx="401.06" cy="321.26" r="2.52"/><circle class="cls-11" cx="403.33" cy="320.54" r="2.52" transform="translate(22.33 667.32) rotate(-80.78)"/><circle class="cls-12" cx="403.33" cy="320.54" r="2.52" transform="translate(22.33 667.32) rotate(-80.78)"/><circle class="cls-11" cx="405.61" cy="319.7" r="2.52"/><circle class="cls-12" cx="405.61" cy="319.7" r="2.52"/><circle class="cls-11" cx="407.9" cy="318.86" r="2.52"/><circle class="cls-12" cx="407.9" cy="318.86" r="2.52"/><circle class="cls-11" cx="410.18" cy="318.02" r="2.52"/><circle class="cls-12" cx="410.18" cy="318.02" r="2.52"/><circle class="cls-11" cx="412.46" cy="317.3" r="2.52"/><circle class="cls-12" cx="412.46" cy="317.3" r="2.52"/><circle class="cls-11" cx="414.74" cy="316.46" r="2.52"/><circle class="cls-12" cx="414.74" cy="316.46" r="2.52"/><circle class="cls-11" cx="417.01" cy="315.62" r="2.52"/><circle class="cls-12" cx="417.01" cy="315.62" r="2.52"/><circle class="cls-11" cx="419.42" cy="314.78" r="2.52"/><circle class="cls-12" cx="419.42" cy="314.78" r="2.52"/><circle class="cls-11" cx="421.7" cy="313.94" r="2.52" transform="translate(67.73 702.64) rotate(-84.34)"/><circle class="cls-12" cx="421.7" cy="313.94" r="2.52" transform="translate(67.73 702.64) rotate(-84.34)"/><circle class="cls-11" cx="423.98" cy="312.98" r="2.52"/><circle class="cls-12" cx="423.98" cy="312.98" r="2.52"/><circle class="cls-11" cx="426.26" cy="312.02" r="2.52" transform="translate(73.76 705.45) rotate(-84.34)"/><circle class="cls-12" cx="426.26" cy="312.02" r="2.52" transform="translate(73.76 705.45) rotate(-84.34)"/><circle class="cls-11" cx="428.54" cy="311.06" r="2.52"/><circle class="cls-12" cx="428.54" cy="311.06" r="2.52"/><circle class="cls-11" cx="430.82" cy="310.1" r="2.52" transform="translate(-44.11 73.01) rotate(-9.22)"/><circle class="cls-12" cx="430.82" cy="310.1" r="2.52" transform="translate(-44.11 73.01) rotate(-9.22)"/><circle class="cls-11" cx="433.1" cy="309.14" r="2.52"/><circle class="cls-12" cx="433.1" cy="309.14" r="2.52"/><circle class="cls-11" cx="435.38" cy="308.18" r="2.52" transform="translate(-43.74 73.72) rotate(-9.22)"/><circle class="cls-12" cx="435.38" cy="308.18" r="2.52" transform="translate(-43.74 73.72) rotate(-9.22)"/><circle class="cls-11" cx="437.65" cy="307.22" r="2.52" transform="translate(88.81 712.47) rotate(-84.34)"/><circle class="cls-12" cx="437.65" cy="307.22" r="2.52" transform="translate(88.81 712.47) rotate(-84.34)"/><circle class="cls-11" cx="439.94" cy="306.14" r="2.52"/><circle class="cls-12" cx="439.94" cy="306.14" r="2.52"/><circle class="cls-11" cx="442.33" cy="305.18" r="2.52" transform="translate(70.25 692.91) rotate(-80.78)"/><circle class="cls-12" cx="442.33" cy="305.18" r="2.52" transform="translate(70.25 692.91) rotate(-80.78)"/><circle class="cls-11" cx="444.61" cy="304.1" r="2.52" transform="translate(73.23 694.26) rotate(-80.78)"/><circle class="cls-12" cx="444.61" cy="304.1" r="2.52" transform="translate(73.23 694.26) rotate(-80.78)"/><circle class="cls-11" cx="446.9" cy="302.9" r="2.52"/><circle class="cls-12" cx="446.9" cy="302.9" r="2.52"/><circle class="cls-11" cx="449.18" cy="301.82" r="2.52"/><circle class="cls-12" cx="449.18" cy="301.82" r="2.52"/><circle class="cls-11" cx="451.46" cy="300.74" r="2.52"/><circle class="cls-12" cx="451.46" cy="300.74" r="2.52"/><circle class="cls-11" cx="453.73" cy="299.54" r="2.52" transform="translate(85.39 699.43) rotate(-80.78)"/><circle class="cls-12" cx="453.73" cy="299.54" r="2.52" transform="translate(85.39 699.43) rotate(-80.78)"/><circle class="cls-11" cx="456.01" cy="298.46" r="2.52"/><circle class="cls-12" cx="456.01" cy="298.46" r="2.52"/><path class="cls-11" d="M460.81,297.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,460.81,297.26Z"/><path class="cls-12" d="M460.81,297.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,460.81,297.26Z"/><path class="cls-11" d="M463.1,295.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,463.1,295.94Z"/><path class="cls-12" d="M463.1,295.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,463.1,295.94Z"/><path class="cls-11" d="M465.37,294.74a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,465.37,294.74Z"/><path class="cls-12" d="M465.37,294.74a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,465.37,294.74Z"/><path class="cls-11" d="M467.78,293.42a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,467.78,293.42Z"/><path class="cls-12" d="M467.78,293.42a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,467.78,293.42Z"/><circle class="cls-11" cx="467.54" cy="292.1" r="2.52" transform="translate(-26.51 47.49) rotate(-5.65)"/><circle class="cls-12" cx="467.54" cy="292.1" r="2.52" transform="translate(-26.51 47.49) rotate(-5.65)"/><path class="cls-11" d="M472.33,290.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,472.33,290.78Z"/><path class="cls-12" d="M472.33,290.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,472.33,290.78Z"/><circle class="cls-11" cx="472.1" cy="289.46" r="2.52" transform="translate(-26.22 47.93) rotate(-5.65)"/><circle class="cls-12" cx="472.1" cy="289.46" r="2.52" transform="translate(-26.22 47.93) rotate(-5.65)"/><path class="cls-11" d="M476.89,288a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,476.89,288Z"/><path class="cls-12" d="M476.89,288a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,476.89,288Z"/><circle class="cls-11" cx="476.66" cy="286.58" r="2.52" transform="translate(157.01 741.72) rotate(-85.93)"/><circle class="cls-12" cx="476.66" cy="286.58" r="2.52" transform="translate(157.01 741.72) rotate(-85.93)"/><path class="cls-11" d="M481.45,285.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,481.45,285.14Z"/><path class="cls-12" d="M481.45,285.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,481.45,285.14Z"/><circle class="cls-11" cx="481.22" cy="283.7" r="2.52"/><circle class="cls-12" cx="481.22" cy="283.7" r="2.52"/><path class="cls-11" d="M486,282.26a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,486,282.26Z"/><path class="cls-12" d="M486,282.26a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,486,282.26Z"/><circle class="cls-11" cx="485.78" cy="280.7" r="2.52"/><circle class="cls-12" cx="485.78" cy="280.7" r="2.52"/><path class="cls-11" d="M490.57,279.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,279.14Z"/><path class="cls-12" d="M490.57,279.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,279.14Z"/><circle class="cls-11" cx="490.45" cy="277.46" r="2.52" transform="translate(-24.95 49.68) rotate(-5.65)"/><circle class="cls-12" cx="490.45" cy="277.46" r="2.52" transform="translate(-24.95 49.68) rotate(-5.65)"/><path class="cls-11" d="M495.25,275.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,495.25,275.9Z"/><path class="cls-12" d="M495.25,275.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,495.25,275.9Z"/><circle class="cls-11" cx="495.01" cy="274.22" r="2.52" transform="translate(186.4 748.55) rotate(-85.93)"/><circle class="cls-12" cx="495.01" cy="274.22" r="2.52" transform="translate(186.4 748.55) rotate(-85.93)"/><path class="cls-11" d="M499.81,272.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,499.81,272.66Z"/><path class="cls-12" d="M499.81,272.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,499.81,272.66Z"/><path class="cls-11" d="M502.1,270.86a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,502.1,270.86Z"/><path class="cls-12" d="M502.1,270.86a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,502.1,270.86Z"/><path class="cls-11" d="M504.37,269.06a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,504.37,269.06Z"/><path class="cls-12" d="M504.37,269.06a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,504.37,269.06Z"/><path class="cls-11" d="M506.66,267.26a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,506.66,267.26Z"/><path class="cls-12" d="M506.66,267.26a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,506.66,267.26Z"/><path class="cls-11" d="M508.93,265.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,508.93,265.46Z"/><path class="cls-12" d="M508.93,265.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,508.93,265.46Z"/><path class="cls-11" d="M511.22,263.78a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,511.22,263.78Z"/><path class="cls-12" d="M511.22,263.78a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,511.22,263.78Z"/><path class="cls-11" d="M513.49,261.74a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,513.49,261.74Z"/><path class="cls-12" d="M513.49,261.74a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,513.49,261.74Z"/><path class="cls-11" d="M515.89,259.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,515.89,259.82Z"/><path class="cls-12" d="M515.89,259.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,515.89,259.82Z"/><circle class="cls-11" cx="515.66" cy="257.9" r="2.52"/><circle class="cls-12" cx="515.66" cy="257.9" r="2.52"/><path class="cls-11" d="M520.45,256a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,520.45,256Z"/><path class="cls-12" d="M520.45,256a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,520.45,256Z"/><circle class="cls-11" cx="520.22" cy="253.82" r="2.52" transform="translate(-22.48 52.5) rotate(-5.65)"/><circle class="cls-12" cx="520.22" cy="253.82" r="2.52" transform="translate(-22.48 52.5) rotate(-5.65)"/><path class="cls-11" d="M525,251.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,525,251.78Z"/><path class="cls-12" d="M525,251.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,525,251.78Z"/><circle class="cls-11" cx="524.78" cy="249.62" r="2.52"/><circle class="cls-12" cx="524.78" cy="249.62" r="2.52"/><path class="cls-11" d="M529.57,247.58a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,247.58Z"/><path class="cls-12" d="M529.57,247.58a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,247.58Z"/><circle class="cls-11" cx="529.33" cy="245.3" r="2.52" transform="translate(247.13 755.91) rotate(-85.93)"/><circle class="cls-12" cx="529.33" cy="245.3" r="2.52" transform="translate(247.13 755.91) rotate(-85.93)"/><path class="cls-11" d="M534.13,243a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,243Z"/><path class="cls-12" d="M534.13,243a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,243Z"/><circle class="cls-11" cx="533.89" cy="240.74" r="2.52" transform="translate(-21.12 53.78) rotate(-5.65)"/><circle class="cls-12" cx="533.89" cy="240.74" r="2.52" transform="translate(-21.12 53.78) rotate(-5.65)"/><path class="cls-11" d="M538.81,238.46a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,538.81,238.46Z"/><path class="cls-12" d="M538.81,238.46a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,538.81,238.46Z"/><path class="cls-11" d="M541.1,236.06a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,541.1,236.06Z"/><path class="cls-12" d="M541.1,236.06a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,541.1,236.06Z"/><path class="cls-11" d="M543.37,233.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,543.37,233.54Z"/><path class="cls-12" d="M543.37,233.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,543.37,233.54Z"/><path class="cls-11" d="M545.66,231.14a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,545.66,231.14Z"/><path class="cls-12" d="M545.66,231.14a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,545.66,231.14Z"/><path class="cls-11" d="M547.93,228.62a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,547.93,228.62Z"/><path class="cls-12" d="M547.93,228.62a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,547.93,228.62Z"/><path class="cls-11" d="M550.22,226a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,550.22,226Z"/><path class="cls-12" d="M550.22,226a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,550.22,226Z"/><path class="cls-11" d="M552.49,223.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,552.49,223.34Z"/><path class="cls-12" d="M552.49,223.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,552.49,223.34Z"/><path class="cls-11" d="M554.78,220.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,554.78,220.7Z"/><path class="cls-12" d="M554.78,220.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,554.78,220.7Z"/><circle class="cls-11" cx="554.54" cy="217.94" r="2.52" transform="translate(297.84 755.63) rotate(-85.93)"/><circle class="cls-12" cx="554.54" cy="217.94" r="2.52" transform="translate(297.84 755.63) rotate(-85.93)"/><path class="cls-11" d="M559.33,215.18a2.52,2.52,0,1,1-2.52-2.53A2.52,2.52,0,0,1,559.33,215.18Z"/><path class="cls-12" d="M559.33,215.18a2.52,2.52,0,1,1-2.52-2.53A2.52,2.52,0,0,1,559.33,215.18Z"/><circle class="cls-11" cx="559.22" cy="212.18" r="2.52" transform="translate(260.2 730.18) rotate(-80.78)"/><circle class="cls-12" cx="559.22" cy="212.18" r="2.52" transform="translate(260.2 730.18) rotate(-80.78)"/><path class="cls-11" d="M564,209.3a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,564,209.3Z"/><path class="cls-12" d="M564,209.3a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,564,209.3Z"/><circle class="cls-11" cx="563.78" cy="206.42" r="2.52"/><circle class="cls-12" cx="563.78" cy="206.42" r="2.52"/><path class="cls-11" d="M568.57,203.3a2.52,2.52,0,1,1-2.51-2.53A2.53,2.53,0,0,1,568.57,203.3Z"/><path class="cls-12" d="M568.57,203.3a2.52,2.52,0,1,1-2.51-2.53A2.53,2.53,0,0,1,568.57,203.3Z"/><circle class="cls-11" cx="568.33" cy="200.18" r="2.52" transform="translate(279.71 729.11) rotate(-80.78)"/><circle class="cls-12" cx="568.33" cy="200.18" r="2.52" transform="translate(279.71 729.11) rotate(-80.78)"/><path class="cls-11" d="M573.13,197.18a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,573.13,197.18Z"/><path class="cls-12" d="M573.13,197.18a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,573.13,197.18Z"/><circle class="cls-11" cx="572.89" cy="193.94" r="2.52" transform="translate(-30.61 234) rotate(-22.5)"/><circle class="cls-12" cx="572.89" cy="193.94" r="2.52" transform="translate(-30.61 234) rotate(-22.5)"/><path class="cls-11" d="M577.69,190.7a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,190.7Z"/><path class="cls-12" d="M577.69,190.7a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,190.7Z"/><circle class="cls-11" cx="577.45" cy="187.22" r="2.52" transform="translate(-27.69 235.23) rotate(-22.5)"/><circle class="cls-12" cx="577.45" cy="187.22" r="2.52" transform="translate(-27.69 235.23) rotate(-22.5)"/><path class="cls-11" d="M582.25,183.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,582.25,183.86Z"/><path class="cls-12" d="M582.25,183.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,582.25,183.86Z"/><path class="cls-11" d="M584.66,180.49a2.52,2.52,0,1,1-2.53-2.51A2.53,2.53,0,0,1,584.66,180.49Z"/><path class="cls-12" d="M584.66,180.49a2.52,2.52,0,1,1-2.53-2.51A2.53,2.53,0,0,1,584.66,180.49Z"/><path class="cls-11" d="M586.93,176.9a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,586.93,176.9Z"/><path class="cls-12" d="M586.93,176.9a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,586.93,176.9Z"/><path class="cls-11" d="M589.22,173.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,589.22,173.3Z"/><path class="cls-12" d="M589.22,173.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,589.22,173.3Z"/><path class="cls-11" d="M591.49,169.58a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,591.49,169.58Z"/><path class="cls-12" d="M591.49,169.58a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,591.49,169.58Z"/><path class="cls-11" d="M593.78,165.86a2.52,2.52,0,1,1-2.53-2.52A2.54,2.54,0,0,1,593.78,165.86Z"/><path class="cls-12" d="M593.78,165.86a2.52,2.52,0,1,1-2.53-2.52A2.54,2.54,0,0,1,593.78,165.86Z"/><circle class="cls-11" cx="593.54" cy="162.02" r="2.52" transform="translate(338.54 721.93) rotate(-80.78)"/><circle class="cls-12" cx="593.54" cy="162.02" r="2.52" transform="translate(338.54 721.93) rotate(-80.78)"/><path class="cls-11" d="M598.33,158.05a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,598.33,158.05Z"/><path class="cls-12" d="M598.33,158.05a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,598.33,158.05Z"/><circle class="cls-11" cx="598.1" cy="154.1" r="2.52" transform="translate(-13.44 240.61) rotate(-22.5)"/><circle class="cls-12" cx="598.1" cy="154.1" r="2.52" transform="translate(-13.44 240.61) rotate(-22.5)"/><path class="cls-11" d="M602.89,150a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,602.89,150Z"/><path class="cls-12" d="M602.89,150a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,602.89,150Z"/><circle class="cls-11" cx="602.66" cy="145.82" r="2.52" transform="translate(-9.93 241.73) rotate(-22.5)"/><circle class="cls-12" cx="602.66" cy="145.82" r="2.52" transform="translate(-9.93 241.73) rotate(-22.5)"/><path class="cls-11" d="M607.57,141.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,141.62Z"/><path class="cls-12" d="M607.57,141.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,141.62Z"/><circle class="cls-11" cx="607.33" cy="137.18" r="2.52" transform="translate(374.65 714.69) rotate(-80.78)"/><circle class="cls-12" cx="607.33" cy="137.18" r="2.52" transform="translate(374.65 714.69) rotate(-80.78)"/><path class="cls-11" d="M612.13,132.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,612.13,132.86Z"/><path class="cls-12" d="M612.13,132.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,612.13,132.86Z"/><circle class="cls-11" cx="611.89" cy="128.3" r="2.52" transform="translate(387.24 711.74) rotate(-80.78)"/><circle class="cls-12" cx="611.89" cy="128.3" r="2.52" transform="translate(387.24 711.74) rotate(-80.78)"/><path class="cls-11" d="M616.69,123.74a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,616.69,123.74Z"/><path class="cls-12" d="M616.69,123.74a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,616.69,123.74Z"/><circle class="cls-11" cx="616.45" cy="119.06" r="2.52" transform="translate(1.36 244.97) rotate(-22.5)"/><circle class="cls-12" cx="616.45" cy="119.06" r="2.52" transform="translate(1.36 244.97) rotate(-22.5)"/><path class="cls-11" d="M621.25,114.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,621.25,114.38Z"/><path class="cls-12" d="M621.25,114.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,621.25,114.38Z"/><circle class="cls-11" cx="621.01" cy="109.58" r="2.52" transform="translate(413.38 705.02) rotate(-80.78)"/><circle class="cls-12" cx="621.01" cy="109.58" r="2.52" transform="translate(413.38 705.02) rotate(-80.78)"/><path class="cls-11" d="M625.81,104.65a2.52,2.52,0,1,1-2.51-2.51A2.52,2.52,0,0,1,625.81,104.65Z"/><path class="cls-12" d="M625.81,104.65a2.52,2.52,0,1,1-2.51-2.51A2.52,2.52,0,0,1,625.81,104.65Z"/><path class="cls-11" d="M628.1,99.62a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,628.1,99.62Z"/><path class="cls-12" d="M628.1,99.62a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,628.1,99.62Z"/><path class="cls-11" d="M630.49,94.46A2.52,2.52,0,1,1,628,91.93,2.51,2.51,0,0,1,630.49,94.46Z"/><path class="cls-12" d="M630.49,94.46A2.52,2.52,0,1,1,628,91.93,2.51,2.51,0,0,1,630.49,94.46Z"/><path class="cls-11" d="M632.78,89.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,632.78,89.3Z"/><path class="cls-12" d="M632.78,89.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,632.78,89.3Z"/><circle class="cls-11" cx="632.54" cy="83.9" r="2.52" transform="translate(448.4 694.82) rotate(-80.78)"/><circle class="cls-12" cx="632.54" cy="83.9" r="2.52" transform="translate(448.4 694.82) rotate(-80.78)"/><path class="cls-11" d="M637.33,78.38a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,637.33,78.38Z"/><path class="cls-12" d="M637.33,78.38a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,637.33,78.38Z"/><circle class="cls-11" cx="637.1" cy="72.86" r="2.52"/><circle class="cls-12" cx="637.1" cy="72.86" r="2.52"/><path class="cls-11" d="M641.89,67.21a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,641.89,67.21Z"/><path class="cls-12" d="M641.89,67.21a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,641.89,67.21Z"/><g class="cls-13"><text class="cls-14" transform="translate(40.94 365.91)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(27.23 315.31)">5000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 264.69)">10000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 214.04)">15000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 163.44)">20000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 112.82)">25000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 62.17)">30000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 11.55)">35000</text></g><g class="cls-13"><text class="cls-14" transform="translate(52.69 377.63)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(165.03 377.63)">50</text></g><g class="cls-13"><text class="cls-14" transform="translate(277.36 377.63)">100</text></g><g class="cls-13"><text class="cls-14" transform="translate(391.98 377.63)">150</text></g><g class="cls-13"><text class="cls-14" transform="translate(506.58 377.63)">200</text></g><g class="cls-13"><text class="cls-14" transform="translate(621.19 377.63)">250</text></g><g class="cls-13"><text class="cls-15" transform="translate(17.56 199.36) rotate(-90)">Qstep</text></g><g class="cls-13"><text class="cls-15" transform="translate(325.65 386.9)">Q<tspan class="cls-16" x="11.53" y="0">_</tspan><tspan class="cls-17" x="19.54" y="0">i</tspan><tspan class="cls-18" x="23.97" y="0">n</tspan><tspan class="cls-19" x="31.98" y="0">d</tspan><tspan x="40.01" y="0">ex</tspan></text></g><line class="cls-4" x1="481.68" y1="70.09" x2="500.88" y2="70.09"/><path class="cls-5" d="M493.1,69.92a2,2,0,1,1-2-2A2,2,0,0,1,493.1,69.92Z"/><path class="cls-20" d="M493.1,69.92a2,2,0,1,1-2-2A2,2,0,0,1,493.1,69.92Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 74.49)"><tspan class="cls-22">8</tspan><tspan x="6.98" y="0">-</tspan><tspan class="cls-23" x="11.65" y="0">b</tspan><tspan class="cls-22" x="18.73" y="0">it</tspan><tspan class="cls-24" x="26.45" y="0"> </tspan><tspan class="cls-25" x="30.03" y="0">A</tspan><tspan x="40.11" y="0">C</tspan></text></g><line class="cls-7" x1="481.68" y1="90.76" x2="500.88" y2="90.76"/><path class="cls-8" d="M493.1,90.68a2,2,0,1,1-2-2A2,2,0,0,1,493.1,90.68Z"/><path class="cls-26" d="M493.1,90.68a2,2,0,1,1-2-2A2,2,0,0,1,493.1,90.68Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 95.15)"><tspan class="cls-22">1</tspan><tspan class="cls-27" x="6.98" y="0">0</tspan><tspan class="cls-28" x="14.03" y="0">-</tspan><tspan class="cls-29" x="18.7" y="0">bi</tspan><tspan class="cls-30" x="29.54" y="0">t</tspan><tspan class="cls-31" x="33.48" y="0"> </tspan><tspan class="cls-32" x="36.99" y="0">A</tspan><tspan x="47.05" y="0">C</tspan></text></g><line class="cls-10" x1="481.68" y1="111.43" x2="500.88" y2="111.43"/><path class="cls-11" d="M493.69,111.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,493.69,111.38Z"/><path class="cls-33" d="M493.69,111.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,493.69,111.38Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 115.82)"><tspan class="cls-22">1</tspan><tspan class="cls-27" x="6.98" y="0">2</tspan><tspan class="cls-28" x="14.03" y="0">-</tspan><tspan class="cls-29" x="18.7" y="0">bi</tspan><tspan class="cls-30" x="29.54" y="0">t</tspan><tspan class="cls-31" x="33.48" y="0"> </tspan><tspan class="cls-32" x="36.99" y="0">A</tspan><tspan x="47.05" y="0">C</tspan></text></g><rect class="cls-2" x="0.38" y="0.38" width="652.8" height="391.32"/></g></g></svg>
\ No newline at end of file
diff --git a/doc/img/quant_dc.svg b/doc/img/quant_dc.svg
new file mode 100644
index 0000000..4fda108
--- /dev/null
+++ b/doc/img/quant_dc.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 653.55 392.07"><defs><style>.cls-1,.cls-10,.cls-12,.cls-18,.cls-2,.cls-24,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{fill:none;}.cls-2{stroke:#d9d9d9;}.cls-10,.cls-12,.cls-18,.cls-2,.cls-24,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{stroke-linejoin:round;}.cls-18,.cls-2,.cls-24,.cls-33{stroke-width:0.75px;}.cls-3{clip-path:url(#clip-path);}.cls-18,.cls-4,.cls-6{stroke:#5b9bd5;}.cls-10,.cls-4,.cls-7{stroke-linecap:round;stroke-width:2.25px;}.cls-5{fill:#5b9bd5;}.cls-12,.cls-6,.cls-9{stroke-width:0.72px;}.cls-24,.cls-7,.cls-9{stroke:#ed7d31;}.cls-8{fill:#ed7d31;}.cls-10,.cls-12,.cls-33{stroke:#a5a5a5;}.cls-11{fill:#a5a5a5;}.cls-13{clip-path:url(#clip-path-4);}.cls-14{font-size:9px;font-family:Calibri, Calibri;}.cls-14,.cls-15,.cls-19,.cls-25{fill:#595959;}.cls-15{font-size:15.96px;}.cls-15,.cls-19,.cls-25{font-family:TimesNewRomanPSMT, Times New Roman;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-19{font-size:14.04px;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0.01em;}.cls-23{letter-spacing:0em;}.cls-25{font-size:14.06px;}.cls-26{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:-0.01em;}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:0em;}.cls-32{letter-spacing:-0.01em;}.cls-34{letter-spacing:0em;}.cls-35{letter-spacing:0em;}.cls-36{letter-spacing:0em;}.cls-37{letter-spacing:0em;}.cls-38{letter-spacing:0em;}.cls-39{letter-spacing:-0.01em;}</style><clipPath id="clip-path"><rect class="cls-1" x="53.77" y="8.9" width="587.4" height="355.08"/></clipPath><clipPath id="clip-path-4"><rect class="cls-1" x="0.38" y="0.38" width="652.8" height="391.32"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><path class="cls-2" d="M53.8,9H640.52M53.8,79.82H640.52M53.8,150.74H640.52M53.8,221.54H640.52M53.8,292.46H640.52"/><path class="cls-2" d="M626.77,9V363.3M512.18,9V363.3M397.58,9V363.3M283,9V363.3M168.37,9V363.3M53.8,9V363.3"/><line class="cls-2" x1="53.8" y1="363.3" x2="640.52" y2="363.3"/><g class="cls-3"><polyline class="cls-4" points="54.95 363.24 57.26 363.13 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.13 73.33 363.13 75.61 363.13 77.89 363.13 80.17 363.01 82.45 363.01 84.73 363.01 87.02 363.01 89.3 363.01 91.58 363.01 93.86 363.01 96.25 363.01 98.53 363.01 100.81 362.89 103.09 362.89 105.38 362.89 107.66 362.89 109.94 362.89 112.22 362.89 114.5 362.89 116.78 362.89 119.17 362.89 121.45 362.89 123.73 362.89 126.02 362.77 128.29 362.77 130.57 362.77 132.85 362.77 135.13 362.77 137.41 362.77 139.69 362.77 142.09 362.77 144.38 362.77 146.66 362.65 148.94 362.65 151.22 362.65 153.5 362.65 155.78 362.65 158.06 362.65 160.34 362.65 162.62 362.65 165.01 362.65 167.29 362.65 169.57 362.65 171.85 362.54 174.13 362.54 176.41 362.54 178.69 362.54 180.97 362.54 183.25 362.54 185.53 362.54 187.94 362.54 190.22 362.54 192.5 362.54 194.78 362.42 197.06 362.42 199.34 362.42 201.62 362.42 203.9 362.42 206.18 362.42 208.46 362.42 210.85 362.42 213.13 362.42 215.41 362.42 217.69 362.42 219.97 362.3 222.25 362.3 224.53 362.3 226.81 362.3 229.09 362.3 231.38 362.3 233.78 362.3 236.06 362.3 238.34 362.3 240.62 362.3 242.9 362.18 245.18 362.18 247.46 362.18 249.74 362.18 252.01 362.18 254.29 362.18 256.69 362.18 258.98 362.18 261.25 362.18 263.54 362.18 265.81 362.18 268.1 362.06 270.38 362.06 272.65 362.06 274.94 362.06 277.21 362.06 279.62 362.06 281.89 361.94 284.18 361.94 286.45 361.94 288.74 361.94 291.01 361.94 293.3 361.94 295.57 361.81 297.86 361.81 300.13 361.81 302.42 361.81 304.81 361.81 307.1 361.81 309.38 361.69 311.65 361.69 313.94 361.69 316.21 361.69 318.5 361.69 320.77 361.69 323.06 361.57 325.33 361.57 327.74 361.57 330.01 361.57 332.3 361.57 334.57 361.45 336.86 361.45 339.13 361.45 341.42 361.33 343.69 361.33 345.98 361.33 348.25 361.33 350.65 361.33 352.94 361.21 355.21 361.21 357.5 361.21 359.77 361.21 362.06 361.1 364.33 361.1 366.62 361.1 368.89 361.1 371.18 360.98 373.57 360.98 375.86 360.98 378.13 360.86 380.42 360.86 382.69 360.86 384.98 360.74 387.25 360.74 389.54 360.74 391.81 360.62 394.1 360.62 396.5 360.62 398.77 360.62 401.06 360.5 403.33 360.5 405.62 360.38 407.89 360.38 410.18 360.38 412.45 360.25 414.74 360.25 417.01 360.25 419.42 360.13 421.69 360.13 423.98 360.13 426.25 360.01 428.54 360.01 430.81 359.89 433.1 359.89 435.38 359.89 437.65 359.77 439.94 359.77 442.33 359.65 444.62 359.65 446.89 359.65 449.18 359.54 451.45 359.54 453.74 359.42 456.01 359.42 458.3 359.3 460.57 359.3 462.86 359.18 465.25 359.18 467.54 359.06 469.81 359.06 472.1 358.94 474.38 358.94 476.65 358.81 478.94 358.81 481.21 358.69 483.5 358.69 485.77 358.57 488.06 358.57 490.45 358.45 492.74 358.45 495.01 358.33 497.3 358.33 499.57 358.21 501.86 358.1 504.13 358.1 506.42 357.98 508.69 357.98 510.98 357.86 513.38 357.74 515.65 357.74 517.93 357.62 520.22 357.5 522.5 357.5 524.77 357.38 527.05 357.25 529.34 357.25 531.62 357.13 533.89 357.01 536.29 357.01 538.58 356.89 540.86 356.77 543.13 356.65 545.41 356.54 547.7 356.42 549.98 356.42 552.25 356.3 554.53 356.18 556.82 356.06 559.22 355.94 561.5 355.81 563.77 355.69 566.05 355.45 568.34 355.33 570.62 355.21 572.89 355.1 575.17 354.98 577.46 354.74 579.74 354.62 582.13 354.38 584.41 354.25 586.7 354.01 588.98 353.77 591.25 353.65 593.53 353.42 595.82 353.18 598.1 352.81 600.38 352.57 602.65 352.33 605.05 351.98 607.34 351.74 609.62 351.38 611.89 351.01 614.17 350.65 616.46 350.18 618.74 349.81 621.01 349.33 623.29 348.86 625.58 348.25 627.98 347.77 630.25 347.18 632.53 346.45 634.82 345.86 637.1 345.13 639.38 344.37"/></g><circle class="cls-5" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-6" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-5" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-6" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-5" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-6" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-5" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-6" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-5" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-6" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-5" cx="66.31" cy="363.08" r="1.98"/><circle class="cls-6" cx="66.31" cy="363.08" r="1.98"/><circle class="cls-5" cx="68.59" cy="363.08" r="1.98"/><circle class="cls-6" cx="68.59" cy="363.08" r="1.98"/><circle class="cls-5" cx="71" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -297.31, 397.95)"/><circle class="cls-6" cx="71" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -297.31, 397.95)"/><circle class="cls-5" cx="73.28" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -295.25, 400.22)"/><circle class="cls-6" cx="73.28" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -295.25, 400.22)"/><circle class="cls-5" cx="75.56" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -293.2, 402.49)"/><circle class="cls-6" cx="75.56" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -293.2, 402.49)"/><circle class="cls-5" cx="77.84" cy="363.08" r="1.98"/><circle class="cls-6" cx="77.84" cy="363.08" r="1.98"/><circle class="cls-5" cx="80.12" cy="362.96" r="1.98"/><circle class="cls-6" cx="80.12" cy="362.96" r="1.98"/><circle class="cls-5" cx="82.4" cy="362.96" r="1.98"/><circle class="cls-6" cx="82.4" cy="362.96" r="1.98"/><circle class="cls-5" cx="84.67" cy="362.96" r="1.98"/><circle class="cls-6" cx="84.67" cy="362.96" r="1.98"/><circle class="cls-5" cx="86.95" cy="362.96" r="1.98"/><circle class="cls-6" cx="86.95" cy="362.96" r="1.98"/><circle class="cls-5" cx="89.23" cy="362.96" r="1.98"/><circle class="cls-6" cx="89.23" cy="362.96" r="1.98"/><circle class="cls-5" cx="91.51" cy="362.96" r="1.98"/><circle class="cls-6" cx="91.51" cy="362.96" r="1.98"/><circle class="cls-5" cx="93.79" cy="362.96" r="1.98"/><circle class="cls-6" cx="93.79" cy="362.96" r="1.98"/><circle class="cls-5" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-6" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-5" cx="98.48" cy="362.96" r="1.98"/><circle class="cls-6" cx="98.48" cy="362.96" r="1.98"/><circle class="cls-5" cx="100.76" cy="362.84" r="1.98"/><circle class="cls-6" cx="100.76" cy="362.84" r="1.98"/><circle class="cls-5" cx="103.03" cy="362.84" r="1.98"/><circle class="cls-6" cx="103.03" cy="362.84" r="1.98"/><circle class="cls-5" cx="105.31" cy="362.84" r="1.98"/><circle class="cls-6" cx="105.31" cy="362.84" r="1.98"/><circle class="cls-5" cx="107.59" cy="362.84" r="1.98"/><circle class="cls-6" cx="107.59" cy="362.84" r="1.98"/><circle class="cls-5" cx="109.88" cy="362.84" r="1.98"/><circle class="cls-6" cx="109.88" cy="362.84" r="1.98"/><circle class="cls-5" cx="112.15" cy="362.84" r="1.98"/><circle class="cls-6" cx="112.15" cy="362.84" r="1.98"/><circle class="cls-5" cx="114.43" cy="362.84" r="1.98"/><circle class="cls-6" cx="114.43" cy="362.84" r="1.98"/><circle class="cls-5" cx="116.71" cy="362.84" r="1.98"/><circle class="cls-6" cx="116.71" cy="362.84" r="1.98"/><circle class="cls-5" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-6" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-5" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-6" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-5" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-6" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-5" cx="125.95" cy="362.72" r="1.98"/><circle class="cls-6" cx="125.95" cy="362.72" r="1.98"/><circle class="cls-5" cx="128.23" cy="362.72" r="1.98"/><circle class="cls-6" cx="128.23" cy="362.72" r="1.98"/><circle class="cls-5" cx="130.51" cy="362.72" r="1.98"/><circle class="cls-6" cx="130.51" cy="362.72" r="1.98"/><circle class="cls-5" cx="132.79" cy="362.72" r="1.98"/><circle class="cls-6" cx="132.79" cy="362.72" r="1.98"/><circle class="cls-5" cx="135.07" cy="362.72" r="1.98"/><circle class="cls-6" cx="135.07" cy="362.72" r="1.98"/><circle class="cls-5" cx="137.36" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -237.13, 463.66)"/><circle class="cls-6" cx="137.36" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -237.13, 463.66)"/><circle class="cls-5" cx="139.64" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -235.07, 465.93)"/><circle class="cls-6" cx="139.64" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -235.07, 465.93)"/><circle class="cls-5" cx="142.03" cy="362.72" r="1.98"/><circle class="cls-6" cx="142.03" cy="362.72" r="1.98"/><circle class="cls-5" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-6" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-5" cx="146.59" cy="362.6" r="1.98"/><circle class="cls-6" cx="146.59" cy="362.6" r="1.98"/><circle class="cls-5" cx="148.88" cy="362.6" r="1.98"/><circle class="cls-6" cx="148.88" cy="362.6" r="1.98"/><circle class="cls-5" cx="151.15" cy="362.6" r="1.98"/><circle class="cls-6" cx="151.15" cy="362.6" r="1.98"/><circle class="cls-5" cx="153.43" cy="362.6" r="1.98"/><circle class="cls-6" cx="153.43" cy="362.6" r="1.98"/><circle class="cls-5" cx="155.71" cy="362.6" r="1.98"/><circle class="cls-6" cx="155.71" cy="362.6" r="1.98"/><circle class="cls-5" cx="158" cy="362.6" r="1.98"/><circle class="cls-6" cx="158" cy="362.6" r="1.98"/><circle class="cls-5" cx="160.28" cy="362.6" r="1.98"/><circle class="cls-6" cx="160.28" cy="362.6" r="1.98"/><circle class="cls-5" cx="162.56" cy="362.6" r="1.98"/><circle class="cls-6" cx="162.56" cy="362.6" r="1.98"/><circle class="cls-5" cx="164.95" cy="362.6" r="1.98"/><circle class="cls-6" cx="164.95" cy="362.6" r="1.98"/><circle class="cls-5" cx="167.23" cy="362.6" r="1.98"/><circle class="cls-6" cx="167.23" cy="362.6" r="1.98"/><circle class="cls-5" cx="169.51" cy="362.6" r="1.98"/><circle class="cls-6" cx="169.51" cy="362.6" r="1.98"/><circle class="cls-5" cx="171.79" cy="362.48" r="1.98"/><circle class="cls-6" cx="171.79" cy="362.48" r="1.98"/><circle class="cls-5" cx="174.07" cy="362.48" r="1.98"/><circle class="cls-6" cx="174.07" cy="362.48" r="1.98"/><circle class="cls-5" cx="176.36" cy="362.48" r="1.98"/><circle class="cls-6" cx="176.36" cy="362.48" r="1.98"/><circle class="cls-5" cx="178.64" cy="362.48" r="1.98"/><circle class="cls-6" cx="178.64" cy="362.48" r="1.98"/><circle class="cls-5" cx="180.92" cy="362.48" r="1.98"/><circle class="cls-6" cx="180.92" cy="362.48" r="1.98"/><circle class="cls-5" cx="183.19" cy="362.48" r="1.98"/><circle class="cls-6" cx="183.19" cy="362.48" r="1.98"/><circle class="cls-5" cx="185.48" cy="362.48" r="1.98"/><circle class="cls-6" cx="185.48" cy="362.48" r="1.98"/><circle class="cls-5" cx="187.88" cy="362.48" r="1.98"/><circle class="cls-6" cx="187.88" cy="362.48" r="1.98"/><circle class="cls-5" cx="190.15" cy="362.48" r="1.98"/><circle class="cls-6" cx="190.15" cy="362.48" r="1.98"/><circle class="cls-5" cx="192.43" cy="362.48" r="1.98"/><circle class="cls-6" cx="192.43" cy="362.48" r="1.98"/><circle class="cls-5" cx="194.71" cy="362.36" r="1.98"/><circle class="cls-6" cx="194.71" cy="362.36" r="1.98"/><circle class="cls-5" cx="196.99" cy="362.36" r="1.98"/><circle class="cls-6" cx="196.99" cy="362.36" r="1.98"/><path class="cls-5" d="M201.26,362.36a2,2,0,1,1-2-2A2,2,0,0,1,201.26,362.36Z"/><path class="cls-6" d="M201.26,362.36a2,2,0,1,1-2-2A2,2,0,0,1,201.26,362.36Z"/><path class="cls-5" d="M203.53,362.36a2,2,0,1,1-2-2A2,2,0,0,1,203.53,362.36Z"/><path class="cls-6" d="M203.53,362.36a2,2,0,1,1-2-2A2,2,0,0,1,203.53,362.36Z"/><path class="cls-5" d="M205.81,362.36a2,2,0,1,1-2-2A2,2,0,0,1,205.81,362.36Z"/><path class="cls-6" d="M205.81,362.36a2,2,0,1,1-2-2A2,2,0,0,1,205.81,362.36Z"/><path class="cls-5" d="M208.09,362.36a2,2,0,1,1-2-2A2,2,0,0,1,208.09,362.36Z"/><path class="cls-6" d="M208.09,362.36a2,2,0,1,1-2-2A2,2,0,0,1,208.09,362.36Z"/><path class="cls-5" d="M210.37,362.36a2,2,0,1,1-2-2A2,2,0,0,1,210.37,362.36Z"/><path class="cls-6" d="M210.37,362.36a2,2,0,1,1-2-2A2,2,0,0,1,210.37,362.36Z"/><path class="cls-5" d="M212.77,362.36a2,2,0,1,1-2-2A2,2,0,0,1,212.77,362.36Z"/><path class="cls-6" d="M212.77,362.36a2,2,0,1,1-2-2A2,2,0,0,1,212.77,362.36Z"/><path class="cls-5" d="M215.05,362.36a2,2,0,1,1-2-2A2,2,0,0,1,215.05,362.36Z"/><path class="cls-6" d="M215.05,362.36a2,2,0,1,1-2-2A2,2,0,0,1,215.05,362.36Z"/><path class="cls-5" d="M217.33,362.36a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.36Z"/><path class="cls-6" d="M217.33,362.36a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.36Z"/><path class="cls-5" d="M219.61,362.36a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.36Z"/><path class="cls-6" d="M219.61,362.36a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.36Z"/><path class="cls-5" d="M221.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.24Z"/><path class="cls-6" d="M221.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.24Z"/><circle class="cls-5" cx="222.2" cy="362.24" r="1.98" transform="translate(-34.61 23.66) rotate(-5.65)"/><circle class="cls-6" cx="222.2" cy="362.24" r="1.98" transform="translate(-34.61 23.66) rotate(-5.65)"/><circle class="cls-5" cx="224.48" cy="362.24" r="1.98" transform="translate(-34.6 23.88) rotate(-5.65)"/><circle class="cls-6" cx="224.48" cy="362.24" r="1.98" transform="translate(-34.6 23.88) rotate(-5.65)"/><circle class="cls-5" cx="226.76" cy="362.24" r="1.98" transform="translate(-34.59 24.11) rotate(-5.65)"/><circle class="cls-6" cx="226.76" cy="362.24" r="1.98" transform="translate(-34.59 24.11) rotate(-5.65)"/><circle class="cls-5" cx="229.03" cy="362.24" r="1.98" transform="translate(-34.58 24.33) rotate(-5.65)"/><circle class="cls-6" cx="229.03" cy="362.24" r="1.98" transform="translate(-34.58 24.33) rotate(-5.65)"/><circle class="cls-5" cx="231.31" cy="362.24" r="1.98" transform="translate(-34.57 24.56) rotate(-5.65)"/><circle class="cls-6" cx="231.31" cy="362.24" r="1.98" transform="translate(-34.57 24.56) rotate(-5.65)"/><path class="cls-5" d="M235.7,362.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,362.24Z"/><path class="cls-6" d="M235.7,362.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,362.24Z"/><path class="cls-5" d="M238,362.24a2,2,0,1,1-2-2A2,2,0,0,1,238,362.24Z"/><path class="cls-6" d="M238,362.24a2,2,0,1,1-2-2A2,2,0,0,1,238,362.24Z"/><path class="cls-5" d="M240.26,362.24a2,2,0,1,1-2-2A2,2,0,0,1,240.26,362.24Z"/><path class="cls-6" d="M240.26,362.24a2,2,0,1,1-2-2A2,2,0,0,1,240.26,362.24Z"/><path class="cls-5" d="M242.53,362.24a2,2,0,1,1-2-2A2,2,0,0,1,242.53,362.24Z"/><path class="cls-6" d="M242.53,362.24a2,2,0,1,1-2-2A2,2,0,0,1,242.53,362.24Z"/><path class="cls-5" d="M244.81,362.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,362.12Z"/><path class="cls-6" d="M244.81,362.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,362.12Z"/><path class="cls-5" d="M247.09,362.12a2,2,0,1,1-2-2A2,2,0,0,1,247.09,362.12Z"/><path class="cls-6" d="M247.09,362.12a2,2,0,1,1-2-2A2,2,0,0,1,247.09,362.12Z"/><path class="cls-5" d="M249.37,362.12a2,2,0,1,1-2-2A2,2,0,0,1,249.37,362.12Z"/><path class="cls-6" d="M249.37,362.12a2,2,0,1,1-2-2A2,2,0,0,1,249.37,362.12Z"/><path class="cls-5" d="M251.65,362.12a2,2,0,1,1-2-2A2,2,0,0,1,251.65,362.12Z"/><path class="cls-6" d="M251.65,362.12a2,2,0,1,1-2-2A2,2,0,0,1,251.65,362.12Z"/><path class="cls-5" d="M253.93,362.12a2,2,0,1,1-2-2A2,2,0,0,1,253.93,362.12Z"/><path class="cls-6" d="M253.93,362.12a2,2,0,1,1-2-2A2,2,0,0,1,253.93,362.12Z"/><path class="cls-5" d="M256.21,362.12a2,2,0,1,1-2-2A2,2,0,0,1,256.21,362.12Z"/><path class="cls-6" d="M256.21,362.12a2,2,0,1,1-2-2A2,2,0,0,1,256.21,362.12Z"/><path class="cls-5" d="M258.61,362.12a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.12Z"/><path class="cls-6" d="M258.61,362.12a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.12Z"/><path class="cls-5" d="M260.89,362.12a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.12Z"/><path class="cls-6" d="M260.89,362.12a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.12Z"/><circle class="cls-5" cx="261.2" cy="362.12" r="1.98" transform="translate(-118.52 596.98) rotate(-85.93)"/><circle class="cls-6" cx="261.2" cy="362.12" r="1.98" transform="translate(-118.52 596.98) rotate(-85.93)"/><circle class="cls-5" cx="263.48" cy="362.12" r="1.98" transform="translate(-116.41 599.26) rotate(-85.93)"/><circle class="cls-6" cx="263.48" cy="362.12" r="1.98" transform="translate(-116.41 599.26) rotate(-85.93)"/><circle class="cls-5" cx="265.76" cy="362.12" r="1.98" transform="translate(-114.29 601.53) rotate(-85.93)"/><circle class="cls-6" cx="265.76" cy="362.12" r="1.98" transform="translate(-114.29 601.53) rotate(-85.93)"/><circle class="cls-5" cx="268.03" cy="362" r="1.98" transform="translate(-112.05 603.69) rotate(-85.93)"/><circle class="cls-6" cx="268.03" cy="362" r="1.98" transform="translate(-112.05 603.69) rotate(-85.93)"/><circle class="cls-5" cx="270.31" cy="362" r="1.98" transform="translate(-109.93 605.97) rotate(-85.93)"/><circle class="cls-6" cx="270.31" cy="362" r="1.98" transform="translate(-109.93 605.97) rotate(-85.93)"/><circle class="cls-5" cx="272.59" cy="362" r="1.98" transform="translate(-107.81 608.24) rotate(-85.93)"/><circle class="cls-6" cx="272.59" cy="362" r="1.98" transform="translate(-107.81 608.24) rotate(-85.93)"/><circle class="cls-5" cx="274.87" cy="362" r="1.98" transform="translate(-105.69 610.52) rotate(-85.93)"/><circle class="cls-6" cx="274.87" cy="362" r="1.98" transform="translate(-105.69 610.52) rotate(-85.93)"/><path class="cls-5" d="M279.14,362a2,2,0,1,1-2-2A2,2,0,0,1,279.14,362Z"/><path class="cls-6" d="M279.14,362a2,2,0,1,1-2-2A2,2,0,0,1,279.14,362Z"/><path class="cls-5" d="M281.53,362a2,2,0,1,1-2-2A2,2,0,0,1,281.53,362Z"/><path class="cls-6" d="M281.53,362a2,2,0,1,1-2-2A2,2,0,0,1,281.53,362Z"/><path class="cls-5" d="M283.81,361.88a2,2,0,1,1-2-2A2,2,0,0,1,283.81,361.88Z"/><path class="cls-6" d="M283.81,361.88a2,2,0,1,1-2-2A2,2,0,0,1,283.81,361.88Z"/><path class="cls-5" d="M286.09,361.88a2,2,0,1,1-2-2A2,2,0,0,1,286.09,361.88Z"/><path class="cls-6" d="M286.09,361.88a2,2,0,1,1-2-2A2,2,0,0,1,286.09,361.88Z"/><path class="cls-5" d="M288.37,361.88a2,2,0,1,1-2-2A2,2,0,0,1,288.37,361.88Z"/><path class="cls-6" d="M288.37,361.88a2,2,0,1,1-2-2A2,2,0,0,1,288.37,361.88Z"/><path class="cls-5" d="M290.65,361.88a2,2,0,1,1-2-2A2,2,0,0,1,290.65,361.88Z"/><path class="cls-6" d="M290.65,361.88a2,2,0,1,1-2-2A2,2,0,0,1,290.65,361.88Z"/><path class="cls-5" d="M292.93,361.88a2,2,0,1,1-2-2A2,2,0,0,1,292.93,361.88Z"/><path class="cls-6" d="M292.93,361.88a2,2,0,1,1-2-2A2,2,0,0,1,292.93,361.88Z"/><path class="cls-5" d="M295.21,361.88a2,2,0,1,1-2-2A2,2,0,0,1,295.21,361.88Z"/><path class="cls-6" d="M295.21,361.88a2,2,0,1,1-2-2A2,2,0,0,1,295.21,361.88Z"/><path class="cls-5" d="M297.49,361.76a2,2,0,1,1-2-2A2,2,0,0,1,297.49,361.76Z"/><path class="cls-6" d="M297.49,361.76a2,2,0,1,1-2-2A2,2,0,0,1,297.49,361.76Z"/><path class="cls-5" d="M299.77,361.76a2,2,0,1,1-2-2A2,2,0,0,1,299.77,361.76Z"/><path class="cls-6" d="M299.77,361.76a2,2,0,1,1-2-2A2,2,0,0,1,299.77,361.76Z"/><path class="cls-5" d="M302.05,361.76a2,2,0,1,1-2-2A2,2,0,0,1,302.05,361.76Z"/><path class="cls-6" d="M302.05,361.76a2,2,0,1,1-2-2A2,2,0,0,1,302.05,361.76Z"/><path class="cls-5" d="M304.33,361.76a2,2,0,1,1-2-2A2,2,0,0,1,304.33,361.76Z"/><path class="cls-6" d="M304.33,361.76a2,2,0,1,1-2-2A2,2,0,0,1,304.33,361.76Z"/><circle class="cls-5" cx="304.76" cy="361.76" r="1.98" transform="translate(-77.69 640.1) rotate(-85.93)"/><circle class="cls-6" cx="304.76" cy="361.76" r="1.98" transform="translate(-77.69 640.1) rotate(-85.93)"/><circle class="cls-5" cx="307.03" cy="361.76" r="1.98" transform="translate(-75.58 642.37) rotate(-85.93)"/><circle class="cls-6" cx="307.03" cy="361.76" r="1.98" transform="translate(-75.58 642.37) rotate(-85.93)"/><circle class="cls-5" cx="309.31" cy="361.64" r="1.98"/><circle class="cls-6" cx="309.31" cy="361.64" r="1.98"/><circle class="cls-5" cx="311.59" cy="361.64" r="1.98"/><circle class="cls-6" cx="311.59" cy="361.64" r="1.98"/><circle class="cls-5" cx="313.87" cy="361.64" r="1.98"/><circle class="cls-6" cx="313.87" cy="361.64" r="1.98"/><path class="cls-5" d="M318.14,361.64a2,2,0,1,1-2-2A2,2,0,0,1,318.14,361.64Z"/><path class="cls-6" d="M318.14,361.64a2,2,0,1,1-2-2A2,2,0,0,1,318.14,361.64Z"/><path class="cls-5" d="M320.42,361.64a2,2,0,1,1-2-2A2,2,0,0,1,320.42,361.64Z"/><path class="cls-6" d="M320.42,361.64a2,2,0,1,1-2-2A2,2,0,0,1,320.42,361.64Z"/><path class="cls-5" d="M322.7,361.64a2,2,0,1,1-2-2A2,2,0,0,1,322.7,361.64Z"/><path class="cls-6" d="M322.7,361.64a2,2,0,1,1-2-2A2,2,0,0,1,322.7,361.64Z"/><path class="cls-5" d="M325,361.52a2,2,0,1,1-2-2A2,2,0,0,1,325,361.52Z"/><path class="cls-6" d="M325,361.52a2,2,0,1,1-2-2A2,2,0,0,1,325,361.52Z"/><path class="cls-5" d="M327.26,361.52a2,2,0,1,1-2-2A2,2,0,0,1,327.26,361.52Z"/><path class="cls-6" d="M327.26,361.52a2,2,0,1,1-2-2A2,2,0,0,1,327.26,361.52Z"/><path class="cls-5" d="M329.65,361.52a2,2,0,1,1-2-2A2,2,0,0,1,329.65,361.52Z"/><path class="cls-6" d="M329.65,361.52a2,2,0,1,1-2-2A2,2,0,0,1,329.65,361.52Z"/><path class="cls-5" d="M331.93,361.52a2,2,0,1,1-2-2A2,2,0,0,1,331.93,361.52Z"/><path class="cls-6" d="M331.93,361.52a2,2,0,1,1-2-2A2,2,0,0,1,331.93,361.52Z"/><path class="cls-5" d="M334.21,361.52a2,2,0,1,1-2-2A2,2,0,0,1,334.21,361.52Z"/><path class="cls-6" d="M334.21,361.52a2,2,0,1,1-2-2A2,2,0,0,1,334.21,361.52Z"/><path class="cls-5" d="M336.49,361.4a2,2,0,1,1-2-2A2,2,0,0,1,336.49,361.4Z"/><path class="cls-6" d="M336.49,361.4a2,2,0,1,1-2-2A2,2,0,0,1,336.49,361.4Z"/><path class="cls-5" d="M338.77,361.4a2,2,0,1,1-2-2A2,2,0,0,1,338.77,361.4Z"/><path class="cls-6" d="M338.77,361.4a2,2,0,1,1-2-2A2,2,0,0,1,338.77,361.4Z"/><path class="cls-5" d="M341.05,361.4a2,2,0,1,1-2-2A2,2,0,0,1,341.05,361.4Z"/><path class="cls-6" d="M341.05,361.4a2,2,0,1,1-2-2A2,2,0,0,1,341.05,361.4Z"/><path class="cls-5" d="M343.33,361.28a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.28Z"/><path class="cls-6" d="M343.33,361.28a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.28Z"/><path class="cls-5" d="M345.61,361.28a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.28Z"/><path class="cls-6" d="M345.61,361.28a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.28Z"/><path class="cls-5" d="M347.89,361.28a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.28Z"/><path class="cls-6" d="M347.89,361.28a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.28Z"/><circle class="cls-5" cx="348.2" cy="361.28" r="1.98"/><circle class="cls-6" cx="348.2" cy="361.28" r="1.98"/><circle class="cls-5" cx="350.59" cy="361.28" r="1.98"/><circle class="cls-6" cx="350.59" cy="361.28" r="1.98"/><circle class="cls-5" cx="352.87" cy="361.16" r="1.98" transform="translate(-33.87 36.53) rotate(-5.65)"/><circle class="cls-6" cx="352.87" cy="361.16" r="1.98" transform="translate(-33.87 36.53) rotate(-5.65)"/><path class="cls-5" d="M357.14,361.16a2,2,0,1,1-2-2A2,2,0,0,1,357.14,361.16Z"/><path class="cls-6" d="M357.14,361.16a2,2,0,1,1-2-2A2,2,0,0,1,357.14,361.16Z"/><path class="cls-5" d="M359.42,361.16a2,2,0,1,1-2-2A2,2,0,0,1,359.42,361.16Z"/><path class="cls-6" d="M359.42,361.16a2,2,0,1,1-2-2A2,2,0,0,1,359.42,361.16Z"/><path class="cls-5" d="M361.7,361.16a2,2,0,1,1-2-2A2,2,0,0,1,361.7,361.16Z"/><path class="cls-6" d="M361.7,361.16a2,2,0,1,1-2-2A2,2,0,0,1,361.7,361.16Z"/><path class="cls-5" d="M364,361a2,2,0,1,1-2-2A2,2,0,0,1,364,361Z"/><path class="cls-6" d="M364,361a2,2,0,1,1-2-2A2,2,0,0,1,364,361Z"/><path class="cls-5" d="M366.26,361a2,2,0,1,1-2-2A2,2,0,0,1,366.26,361Z"/><path class="cls-6" d="M366.26,361a2,2,0,1,1-2-2A2,2,0,0,1,366.26,361Z"/><path class="cls-5" d="M368.53,361a2,2,0,1,1-2-2A2,2,0,0,1,368.53,361Z"/><path class="cls-6" d="M368.53,361a2,2,0,1,1-2-2A2,2,0,0,1,368.53,361Z"/><path class="cls-5" d="M370.81,361a2,2,0,1,1-2-2A2,2,0,0,1,370.81,361Z"/><path class="cls-6" d="M370.81,361a2,2,0,1,1-2-2A2,2,0,0,1,370.81,361Z"/><path class="cls-5" d="M373.09,360.92a2,2,0,1,1-2-2A2,2,0,0,1,373.09,360.92Z"/><path class="cls-6" d="M373.09,360.92a2,2,0,1,1-2-2A2,2,0,0,1,373.09,360.92Z"/><path class="cls-5" d="M375.49,360.92a2,2,0,1,1-2-2A2,2,0,0,1,375.49,360.92Z"/><path class="cls-6" d="M375.49,360.92a2,2,0,1,1-2-2A2,2,0,0,1,375.49,360.92Z"/><path class="cls-5" d="M377.77,360.92a2,2,0,1,1-2-2A2,2,0,0,1,377.77,360.92Z"/><path class="cls-6" d="M377.77,360.92a2,2,0,1,1-2-2A2,2,0,0,1,377.77,360.92Z"/><path class="cls-5" d="M380.05,360.8a2,2,0,1,1-2-2A2,2,0,0,1,380.05,360.8Z"/><path class="cls-6" d="M380.05,360.8a2,2,0,1,1-2-2A2,2,0,0,1,380.05,360.8Z"/><path class="cls-5" d="M382.33,360.8a2,2,0,1,1-2-2A2,2,0,0,1,382.33,360.8Z"/><path class="cls-6" d="M382.33,360.8a2,2,0,1,1-2-2A2,2,0,0,1,382.33,360.8Z"/><path class="cls-5" d="M384.61,360.8a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.8Z"/><path class="cls-6" d="M384.61,360.8a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.8Z"/><path class="cls-5" d="M386.89,360.68a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.68Z"/><path class="cls-6" d="M386.89,360.68a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.68Z"/><circle class="cls-5" cx="387.2" cy="360.68" r="1.98" transform="translate(-0.02 721.33) rotate(-85.93)"/><circle class="cls-6" cx="387.2" cy="360.68" r="1.98" transform="translate(-0.02 721.33) rotate(-85.93)"/><circle class="cls-5" cx="389.48" cy="360.68" r="1.98" transform="translate(2.1 723.6) rotate(-85.93)"/><circle class="cls-6" cx="389.48" cy="360.68" r="1.98" transform="translate(2.1 723.6) rotate(-85.93)"/><circle class="cls-5" cx="391.76" cy="360.56" r="1.98" transform="translate(-33.62 40.36) rotate(-5.65)"/><circle class="cls-6" cx="391.76" cy="360.56" r="1.98" transform="translate(-33.62 40.36) rotate(-5.65)"/><circle class="cls-5" cx="394.03" cy="360.56" r="1.98" transform="translate(-33.61 40.58) rotate(-5.65)"/><circle class="cls-6" cx="394.03" cy="360.56" r="1.98" transform="translate(-33.61 40.58) rotate(-5.65)"/><path class="cls-5" d="M398.42,360.56a2,2,0,1,1-2-2A2,2,0,0,1,398.42,360.56Z"/><path class="cls-6" d="M398.42,360.56a2,2,0,1,1-2-2A2,2,0,0,1,398.42,360.56Z"/><path class="cls-5" d="M400.7,360.56a2,2,0,1,1-2-2A2,2,0,0,1,400.7,360.56Z"/><path class="cls-6" d="M400.7,360.56a2,2,0,1,1-2-2A2,2,0,0,1,400.7,360.56Z"/><path class="cls-5" d="M403,360.44a2,2,0,1,1-2-2A2,2,0,0,1,403,360.44Z"/><path class="cls-6" d="M403,360.44a2,2,0,1,1-2-2A2,2,0,0,1,403,360.44Z"/><path class="cls-5" d="M405.26,360.44a2,2,0,1,1-2-2A2,2,0,0,1,405.26,360.44Z"/><path class="cls-6" d="M405.26,360.44a2,2,0,1,1-2-2A2,2,0,0,1,405.26,360.44Z"/><path class="cls-5" d="M407.53,360.32a2,2,0,1,1-2-2A2,2,0,0,1,407.53,360.32Z"/><path class="cls-6" d="M407.53,360.32a2,2,0,1,1-2-2A2,2,0,0,1,407.53,360.32Z"/><path class="cls-5" d="M409.81,360.32a2,2,0,1,1-2-2A2,2,0,0,1,409.81,360.32Z"/><path class="cls-6" d="M409.81,360.32a2,2,0,1,1-2-2A2,2,0,0,1,409.81,360.32Z"/><path class="cls-5" d="M412.09,360.32a2,2,0,1,1-2-2A2,2,0,0,1,412.09,360.32Z"/><path class="cls-6" d="M412.09,360.32a2,2,0,1,1-2-2A2,2,0,0,1,412.09,360.32Z"/><path class="cls-5" d="M414.37,360.2a2,2,0,1,1-2-2A2,2,0,0,1,414.37,360.2Z"/><path class="cls-6" d="M414.37,360.2a2,2,0,1,1-2-2A2,2,0,0,1,414.37,360.2Z"/><path class="cls-5" d="M416.65,360.2a2,2,0,1,1-2-2A2,2,0,0,1,416.65,360.2Z"/><path class="cls-6" d="M416.65,360.2a2,2,0,1,1-2-2A2,2,0,0,1,416.65,360.2Z"/><path class="cls-5" d="M418.93,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.93,360.2Z"/><path class="cls-6" d="M418.93,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.93,360.2Z"/><path class="cls-5" d="M421.33,360.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.08Z"/><path class="cls-6" d="M421.33,360.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.08Z"/><path class="cls-5" d="M423.61,360.08a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.08Z"/><path class="cls-6" d="M423.61,360.08a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.08Z"/><path class="cls-5" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-6" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><circle class="cls-5" cx="426.2" cy="359.96" r="1.98"/><circle class="cls-6" cx="426.2" cy="359.96" r="1.98"/><circle class="cls-5" cx="428.48" cy="359.96" r="1.98"/><circle class="cls-6" cx="428.48" cy="359.96" r="1.98"/><circle class="cls-5" cx="430.76" cy="359.84" r="1.98" transform="translate(-33.36 44.2) rotate(-5.65)"/><circle class="cls-6" cx="430.76" cy="359.84" r="1.98" transform="translate(-33.36 44.2) rotate(-5.65)"/><circle class="cls-5" cx="433.03" cy="359.84" r="1.98" transform="translate(-33.35 44.42) rotate(-5.65)"/><circle class="cls-6" cx="433.03" cy="359.84" r="1.98" transform="translate(-33.35 44.42) rotate(-5.65)"/><circle class="cls-5" cx="435.31" cy="359.84" r="1.98" transform="translate(-33.34 44.65) rotate(-5.65)"/><circle class="cls-6" cx="435.31" cy="359.84" r="1.98" transform="translate(-33.34 44.65) rotate(-5.65)"/><circle class="cls-5" cx="437.59" cy="359.72" r="1.98" transform="translate(47.76 770.71) rotate(-85.93)"/><circle class="cls-6" cx="437.59" cy="359.72" r="1.98" transform="translate(47.76 770.71) rotate(-85.93)"/><circle class="cls-5" cx="439.87" cy="359.72" r="1.98" transform="translate(49.88 772.98) rotate(-85.93)"/><circle class="cls-6" cx="439.87" cy="359.72" r="1.98" transform="translate(49.88 772.98) rotate(-85.93)"/><path class="cls-5" d="M444.26,359.6a2,2,0,1,1-2-2A2,2,0,0,1,444.26,359.6Z"/><path class="cls-6" d="M444.26,359.6a2,2,0,1,1-2-2A2,2,0,0,1,444.26,359.6Z"/><path class="cls-5" d="M446.53,359.6a2,2,0,1,1-2-2A2,2,0,0,1,446.53,359.6Z"/><path class="cls-6" d="M446.53,359.6a2,2,0,1,1-2-2A2,2,0,0,1,446.53,359.6Z"/><path class="cls-5" d="M448.81,359.6a2,2,0,1,1-2-2A2,2,0,0,1,448.81,359.6Z"/><path class="cls-6" d="M448.81,359.6a2,2,0,1,1-2-2A2,2,0,0,1,448.81,359.6Z"/><path class="cls-5" d="M451.09,359.48a2,2,0,1,1-2-2A2,2,0,0,1,451.09,359.48Z"/><path class="cls-6" d="M451.09,359.48a2,2,0,1,1-2-2A2,2,0,0,1,451.09,359.48Z"/><path class="cls-5" d="M453.38,359.48a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.48Z"/><path class="cls-6" d="M453.38,359.48a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.48Z"/><path class="cls-5" d="M455.65,359.36a2,2,0,1,1-2-2A2,2,0,0,1,455.65,359.36Z"/><path class="cls-6" d="M455.65,359.36a2,2,0,1,1-2-2A2,2,0,0,1,455.65,359.36Z"/><path class="cls-5" d="M457.94,359.36a2,2,0,1,1-2-2A2,2,0,0,1,457.94,359.36Z"/><path class="cls-6" d="M457.94,359.36a2,2,0,1,1-2-2A2,2,0,0,1,457.94,359.36Z"/><path class="cls-5" d="M460.21,359.24a2,2,0,1,1-2-2A2,2,0,0,1,460.21,359.24Z"/><path class="cls-6" d="M460.21,359.24a2,2,0,1,1-2-2A2,2,0,0,1,460.21,359.24Z"/><path class="cls-5" d="M462.5,359.24a2,2,0,1,1-2-2A2,2,0,0,1,462.5,359.24Z"/><path class="cls-6" d="M462.5,359.24a2,2,0,1,1-2-2A2,2,0,0,1,462.5,359.24Z"/><circle class="cls-5" cx="462.8" cy="359.12" r="1.98" transform="translate(-19.15 26.14) rotate(-3.17)"/><circle class="cls-6" cx="462.8" cy="359.12" r="1.98" transform="translate(-19.15 26.14) rotate(-3.17)"/><path class="cls-5" d="M467.18,359.12a2,2,0,1,1-2-2A2,2,0,0,1,467.18,359.12Z"/><path class="cls-6" d="M467.18,359.12a2,2,0,1,1-2-2A2,2,0,0,1,467.18,359.12Z"/><circle class="cls-5" cx="467.48" cy="359" r="1.98" transform="translate(-19.14 26.4) rotate(-3.17)"/><circle class="cls-6" cx="467.48" cy="359" r="1.98" transform="translate(-19.14 26.4) rotate(-3.17)"/><path class="cls-5" d="M471.74,359a2,2,0,1,1-2-2A2,2,0,0,1,471.74,359Z"/><path class="cls-6" d="M471.74,359a2,2,0,1,1-2-2A2,2,0,0,1,471.74,359Z"/><circle class="cls-5" cx="472.03" cy="358.88" r="1.98" transform="translate(92.14 814.16) rotate(-87.4)"/><circle class="cls-6" cx="472.03" cy="358.88" r="1.98" transform="translate(92.14 814.16) rotate(-87.4)"/><path class="cls-5" d="M476.3,358.88a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.88Z"/><path class="cls-6" d="M476.3,358.88a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.88Z"/><circle class="cls-5" cx="476.59" cy="358.76" r="1.98" transform="translate(-19.11 26.9) rotate(-3.17)"/><circle class="cls-6" cx="476.59" cy="358.76" r="1.98" transform="translate(-19.11 26.9) rotate(-3.17)"/><path class="cls-5" d="M480.86,358.76a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.76Z"/><path class="cls-6" d="M480.86,358.76a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.76Z"/><circle class="cls-5" cx="481.15" cy="358.64" r="1.98"/><circle class="cls-6" cx="481.15" cy="358.64" r="1.98"/><path class="cls-5" d="M485.42,358.64a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.64Z"/><path class="cls-6" d="M485.42,358.64a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.64Z"/><circle class="cls-5" cx="485.71" cy="358.52" r="1.98" transform="translate(105.56 827.49) rotate(-87.4)"/><circle class="cls-6" cx="485.71" cy="358.52" r="1.98" transform="translate(105.56 827.49) rotate(-87.4)"/><path class="cls-5" d="M490,358.52a2,2,0,1,1-2-2A2,2,0,0,1,490,358.52Z"/><path class="cls-6" d="M490,358.52a2,2,0,1,1-2-2A2,2,0,0,1,490,358.52Z"/><path class="cls-5" d="M492.38,358.4a2,2,0,1,1-2-2A2,2,0,0,1,492.38,358.4Z"/><path class="cls-6" d="M492.38,358.4a2,2,0,1,1-2-2A2,2,0,0,1,492.38,358.4Z"/><path class="cls-5" d="M494.65,358.4a2,2,0,1,1-2-2A2,2,0,0,1,494.65,358.4Z"/><path class="cls-6" d="M494.65,358.4a2,2,0,1,1-2-2A2,2,0,0,1,494.65,358.4Z"/><path class="cls-5" d="M496.94,358.28a2,2,0,1,1-2-2A2,2,0,0,1,496.94,358.28Z"/><path class="cls-6" d="M496.94,358.28a2,2,0,1,1-2-2A2,2,0,0,1,496.94,358.28Z"/><path class="cls-5" d="M499.21,358.28a2,2,0,1,1-2-2A2,2,0,0,1,499.21,358.28Z"/><path class="cls-6" d="M499.21,358.28a2,2,0,1,1-2-2A2,2,0,0,1,499.21,358.28Z"/><path class="cls-5" d="M501.5,358.16a2,2,0,1,1-2-2A2,2,0,0,1,501.5,358.16Z"/><path class="cls-6" d="M501.5,358.16a2,2,0,1,1-2-2A2,2,0,0,1,501.5,358.16Z"/><circle class="cls-5" cx="501.8" cy="358.04" r="1.98" transform="translate(-19.03 28.3) rotate(-3.17)"/><circle class="cls-6" cx="501.8" cy="358.04" r="1.98" transform="translate(-19.03 28.3) rotate(-3.17)"/><path class="cls-5" d="M506.06,358a2,2,0,1,1-2-2A2,2,0,0,1,506.06,358Z"/><path class="cls-6" d="M506.06,358a2,2,0,1,1-2-2A2,2,0,0,1,506.06,358Z"/><circle class="cls-5" cx="506.36" cy="357.92" r="1.98" transform="translate(125.86 847.53) rotate(-87.4)"/><circle class="cls-6" cx="506.36" cy="357.92" r="1.98" transform="translate(125.86 847.53) rotate(-87.4)"/><path class="cls-5" d="M510.62,357.92a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357.92Z"/><path class="cls-6" d="M510.62,357.92a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357.92Z"/><circle class="cls-5" cx="510.92" cy="357.8" r="1.98" transform="translate(130.34 851.97) rotate(-87.4)"/><circle class="cls-6" cx="510.92" cy="357.8" r="1.98" transform="translate(130.34 851.97) rotate(-87.4)"/><path class="cls-5" d="M515.3,357.68a2,2,0,1,1-2-2A2,2,0,0,1,515.3,357.68Z"/><path class="cls-6" d="M515.3,357.68a2,2,0,1,1-2-2A2,2,0,0,1,515.3,357.68Z"/><circle class="cls-5" cx="515.59" cy="357.68" r="1.98" transform="translate(-18.99 29.06) rotate(-3.17)"/><circle class="cls-6" cx="515.59" cy="357.68" r="1.98" transform="translate(-18.99 29.06) rotate(-3.17)"/><path class="cls-5" d="M519.86,357.56a2,2,0,1,1-2-2A2,2,0,0,1,519.86,357.56Z"/><path class="cls-6" d="M519.86,357.56a2,2,0,1,1-2-2A2,2,0,0,1,519.86,357.56Z"/><circle class="cls-5" cx="520.15" cy="357.44" r="1.98" transform="translate(-18.97 29.31) rotate(-3.17)"/><circle class="cls-6" cx="520.15" cy="357.44" r="1.98" transform="translate(-18.97 29.31) rotate(-3.17)"/><path class="cls-5" d="M524.42,357.44a2,2,0,1,1-2-2A2,2,0,0,1,524.42,357.44Z"/><path class="cls-6" d="M524.42,357.44a2,2,0,1,1-2-2A2,2,0,0,1,524.42,357.44Z"/><circle class="cls-5" cx="524.71" cy="357.32" r="1.98" transform="translate(-18.96 29.56) rotate(-3.17)"/><circle class="cls-6" cx="524.71" cy="357.32" r="1.98" transform="translate(-18.96 29.56) rotate(-3.17)"/><path class="cls-5" d="M529,357.2a2,2,0,1,1-2-2A2,2,0,0,1,529,357.2Z"/><path class="cls-6" d="M529,357.2a2,2,0,1,1-2-2A2,2,0,0,1,529,357.2Z"/><circle class="cls-5" cx="529.27" cy="357.2" r="1.98" transform="translate(148.46 869.74) rotate(-87.4)"/><circle class="cls-6" cx="529.27" cy="357.2" r="1.98" transform="translate(148.46 869.74) rotate(-87.4)"/><path class="cls-5" d="M533.53,357.08a2,2,0,1,1-2-2A2,2,0,0,1,533.53,357.08Z"/><path class="cls-6" d="M533.53,357.08a2,2,0,1,1-2-2A2,2,0,0,1,533.53,357.08Z"/><path class="cls-5" d="M535.82,357a2,2,0,1,1-2-2A2,2,0,0,1,535.82,357Z"/><path class="cls-6" d="M535.82,357a2,2,0,1,1-2-2A2,2,0,0,1,535.82,357Z"/><path class="cls-5" d="M538.21,357a2,2,0,1,1-2-2A2,2,0,0,1,538.21,357Z"/><path class="cls-6" d="M538.21,357a2,2,0,1,1-2-2A2,2,0,0,1,538.21,357Z"/><path class="cls-5" d="M540.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,540.5,356.84Z"/><path class="cls-6" d="M540.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,540.5,356.84Z"/><circle class="cls-5" cx="540.8" cy="356.72" r="1.98" transform="translate(-18.9 30.45) rotate(-3.17)"/><circle class="cls-6" cx="540.8" cy="356.72" r="1.98" transform="translate(-18.9 30.45) rotate(-3.17)"/><path class="cls-5" d="M545.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,545.06,356.6Z"/><path class="cls-6" d="M545.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,545.06,356.6Z"/><circle class="cls-5" cx="545.36" cy="356.48" r="1.98" transform="translate(164.53 885.12) rotate(-87.4)"/><circle class="cls-6" cx="545.36" cy="356.48" r="1.98" transform="translate(164.53 885.12) rotate(-87.4)"/><path class="cls-5" d="M549.62,356.36a2,2,0,1,1-2-2A2,2,0,0,1,549.62,356.36Z"/><path class="cls-6" d="M549.62,356.36a2,2,0,1,1-2-2A2,2,0,0,1,549.62,356.36Z"/><circle class="cls-5" cx="549.92" cy="356.36" r="1.98" transform="translate(-18.86 30.96) rotate(-3.17)"/><circle class="cls-6" cx="549.92" cy="356.36" r="1.98" transform="translate(-18.86 30.96) rotate(-3.17)"/><path class="cls-5" d="M554.18,356.24a2,2,0,1,1-2-2A2,2,0,0,1,554.18,356.24Z"/><path class="cls-6" d="M554.18,356.24a2,2,0,1,1-2-2A2,2,0,0,1,554.18,356.24Z"/><circle class="cls-5" cx="554.48" cy="356.12" r="1.98" transform="translate(-18.84 31.21) rotate(-3.17)"/><circle class="cls-6" cx="554.48" cy="356.12" r="1.98" transform="translate(-18.84 31.21) rotate(-3.17)"/><path class="cls-5" d="M558.74,356a2,2,0,1,1-2-2A2,2,0,0,1,558.74,356Z"/><path class="cls-6" d="M558.74,356a2,2,0,1,1-2-2A2,2,0,0,1,558.74,356Z"/><circle class="cls-5" cx="559.15" cy="355.88" r="1.98" transform="translate(178.31 898.33) rotate(-87.4)"/><circle class="cls-6" cx="559.15" cy="355.88" r="1.98" transform="translate(178.31 898.33) rotate(-87.4)"/><path class="cls-5" d="M563.42,355.76a2,2,0,1,1-2-2A2,2,0,0,1,563.42,355.76Z"/><path class="cls-6" d="M563.42,355.76a2,2,0,1,1-2-2A2,2,0,0,1,563.42,355.76Z"/><circle class="cls-5" cx="563.71" cy="355.64" r="1.98"/><circle class="cls-6" cx="563.71" cy="355.64" r="1.98"/><path class="cls-5" d="M568,355.4a2,2,0,1,1-2-2A2,2,0,0,1,568,355.4Z"/><path class="cls-6" d="M568,355.4a2,2,0,1,1-2-2A2,2,0,0,1,568,355.4Z"/><circle class="cls-5" cx="568.27" cy="355.28" r="1.98"/><circle class="cls-6" cx="568.27" cy="355.28" r="1.98"/><path class="cls-5" d="M572.53,355.16a2,2,0,1,1-2-2A2,2,0,0,1,572.53,355.16Z"/><path class="cls-6" d="M572.53,355.16a2,2,0,1,1-2-2A2,2,0,0,1,572.53,355.16Z"/><path class="cls-5" d="M574.82,355a2,2,0,1,1-2-2A2,2,0,0,1,574.82,355Z"/><path class="cls-6" d="M574.82,355a2,2,0,1,1-2-2A2,2,0,0,1,574.82,355Z"/><path class="cls-5" d="M577.09,354.92a2,2,0,1,1-2-2A2,2,0,0,1,577.09,354.92Z"/><path class="cls-6" d="M577.09,354.92a2,2,0,1,1-2-2A2,2,0,0,1,577.09,354.92Z"/><path class="cls-5" d="M579.38,354.68a2,2,0,1,1-2-2A2,2,0,0,1,579.38,354.68Z"/><path class="cls-6" d="M579.38,354.68a2,2,0,1,1-2-2A2,2,0,0,1,579.38,354.68Z"/><path class="cls-5" d="M581.65,354.56a2,2,0,1,1-2-2A2,2,0,0,1,581.65,354.56Z"/><path class="cls-6" d="M581.65,354.56a2,2,0,1,1-2-2A2,2,0,0,1,581.65,354.56Z"/><path class="cls-5" d="M584.06,354.32a2,2,0,1,1-2-2A2,2,0,0,1,584.06,354.32Z"/><path class="cls-6" d="M584.06,354.32a2,2,0,1,1-2-2A2,2,0,0,1,584.06,354.32Z"/><circle class="cls-5" cx="584.36" cy="354.2" r="1.98"/><circle class="cls-6" cx="584.36" cy="354.2" r="1.98"/><path class="cls-5" d="M588.62,354a2,2,0,1,1-2-2A2,2,0,0,1,588.62,354Z"/><path class="cls-6" d="M588.62,354a2,2,0,1,1-2-2A2,2,0,0,1,588.62,354Z"/><circle class="cls-5" cx="588.92" cy="353.72" r="1.98" transform="translate(-18.66 33.11) rotate(-3.17)"/><circle class="cls-6" cx="588.92" cy="353.72" r="1.98" transform="translate(-18.66 33.11) rotate(-3.17)"/><path class="cls-5" d="M593.18,353.6a2,2,0,1,1-2-2A2,2,0,0,1,593.18,353.6Z"/><path class="cls-6" d="M593.18,353.6a2,2,0,1,1-2-2A2,2,0,0,1,593.18,353.6Z"/><circle class="cls-5" cx="593.48" cy="353.36" r="1.98" transform="translate(213.59 930.21) rotate(-87.4)"/><circle class="cls-6" cx="593.48" cy="353.36" r="1.98" transform="translate(213.59 930.21) rotate(-87.4)"/><path class="cls-5" d="M597.74,353.12a2,2,0,1,1-2-2A2,2,0,0,1,597.74,353.12Z"/><path class="cls-6" d="M597.74,353.12a2,2,0,1,1-2-2A2,2,0,0,1,597.74,353.12Z"/><circle class="cls-5" cx="598.03" cy="352.76" r="1.98"/><circle class="cls-6" cx="598.03" cy="352.76" r="1.98"/><path class="cls-5" d="M602.3,352.52a2,2,0,1,1-2-2A2,2,0,0,1,602.3,352.52Z"/><path class="cls-6" d="M602.3,352.52a2,2,0,1,1-2-2A2,2,0,0,1,602.3,352.52Z"/><circle class="cls-5" cx="602.59" cy="352.28" r="1.98" transform="translate(-31.78 61.09) rotate(-5.65)"/><circle class="cls-6" cx="602.59" cy="352.28" r="1.98" transform="translate(-31.78 61.09) rotate(-5.65)"/><path class="cls-5" d="M607,351.92a2,2,0,1,1-2-2A2,2,0,0,1,607,351.92Z"/><path class="cls-6" d="M607,351.92a2,2,0,1,1-2-2A2,2,0,0,1,607,351.92Z"/><circle class="cls-5" cx="607.27" cy="351.68" r="1.98"/><circle class="cls-6" cx="607.27" cy="351.68" r="1.98"/><path class="cls-5" d="M611.53,351.32a2,2,0,1,1-2-2A2,2,0,0,1,611.53,351.32Z"/><path class="cls-6" d="M611.53,351.32a2,2,0,1,1-2-2A2,2,0,0,1,611.53,351.32Z"/><path class="cls-5" d="M613.82,351a2,2,0,1,1-2-2A2,2,0,0,1,613.82,351Z"/><path class="cls-6" d="M613.82,351a2,2,0,1,1-2-2A2,2,0,0,1,613.82,351Z"/><path class="cls-5" d="M616.09,350.6a2,2,0,1,1-2-2A2,2,0,0,1,616.09,350.6Z"/><path class="cls-6" d="M616.09,350.6a2,2,0,1,1-2-2A2,2,0,0,1,616.09,350.6Z"/><path class="cls-5" d="M618.38,350.12a2,2,0,1,1-2-2A2,2,0,0,1,618.38,350.12Z"/><path class="cls-6" d="M618.38,350.12a2,2,0,1,1-2-2A2,2,0,0,1,618.38,350.12Z"/><path class="cls-5" d="M620.65,349.76a2,2,0,1,1-2-2A2,2,0,0,1,620.65,349.76Z"/><path class="cls-6" d="M620.65,349.76a2,2,0,1,1-2-2A2,2,0,0,1,620.65,349.76Z"/><path class="cls-5" d="M622.94,349.28a2,2,0,1,1-2-2A2,2,0,0,1,622.94,349.28Z"/><path class="cls-6" d="M622.94,349.28a2,2,0,1,1-2-2A2,2,0,0,1,622.94,349.28Z"/><path class="cls-5" d="M625.21,348.8a2,2,0,1,1-2-2A2,2,0,0,1,625.21,348.8Z"/><path class="cls-6" d="M625.21,348.8a2,2,0,1,1-2-2A2,2,0,0,1,625.21,348.8Z"/><path class="cls-5" d="M627.5,348.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,348.2Z"/><path class="cls-6" d="M627.5,348.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,348.2Z"/><circle class="cls-5" cx="627.92" cy="347.72" r="1.98" transform="translate(236.56 949.4) rotate(-85.93)"/><circle class="cls-6" cx="627.92" cy="347.72" r="1.98" transform="translate(236.56 949.4) rotate(-85.93)"/><path class="cls-5" d="M632.18,347.12a2,2,0,1,1-2-2A2,2,0,0,1,632.18,347.12Z"/><path class="cls-6" d="M632.18,347.12a2,2,0,1,1-2-2A2,2,0,0,1,632.18,347.12Z"/><circle class="cls-5" cx="632.48" cy="346.4" r="1.98"/><circle class="cls-6" cx="632.48" cy="346.4" r="1.98"/><path class="cls-5" d="M636.74,345.8a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.8Z"/><path class="cls-6" d="M636.74,345.8a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.8Z"/><circle class="cls-5" cx="637.03" cy="345.08" r="1.98" transform="translate(247.67 956.05) rotate(-85.93)"/><circle class="cls-6" cx="637.03" cy="345.08" r="1.98" transform="translate(247.67 956.05) rotate(-85.93)"/><path class="cls-5" d="M641.3,344.36a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.36Z"/><path class="cls-6" d="M641.3,344.36a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.36Z"/><g class="cls-3"><polyline class="cls-7" points="54.95 363.24 57.26 363.13 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.01 68.66 363.01 71.06 363.01 73.33 362.89 75.61 362.89 77.89 362.89 80.17 362.77 82.45 362.77 84.73 362.77 87.02 362.65 89.3 362.65 91.58 362.54 93.86 362.54 96.25 362.54 98.53 362.42 100.81 362.42 103.09 362.3 105.38 362.3 107.66 362.18 109.94 362.18 112.22 362.18 114.5 362.06 116.78 362.06 119.17 361.94 121.45 361.94 123.73 361.81 126.02 361.81 128.29 361.69 130.57 361.69 132.85 361.69 135.13 361.57 137.41 361.57 139.69 361.45 142.09 361.45 144.38 361.33 146.66 361.33 148.94 361.21 151.22 361.21 153.5 361.1 155.78 361.1 158.06 361.1 160.34 360.98 162.62 360.98 165.01 360.86 167.29 360.86 169.57 360.74 171.85 360.74 174.13 360.62 176.41 360.62 178.69 360.62 180.97 360.5 183.25 360.5 185.53 360.38 187.94 360.38 190.22 360.25 192.5 360.25 194.78 360.13 197.06 360.13 199.34 360.13 201.62 360.01 203.9 360.01 206.18 359.89 208.46 359.89 210.85 359.89 213.13 359.77 215.41 359.77 217.69 359.65 219.97 359.65 222.25 359.54 224.53 359.54 226.81 359.54 229.09 359.42 231.38 359.42 233.78 359.3 236.06 359.3 238.34 359.18 240.62 359.18 242.9 359.18 245.18 359.06 247.46 359.06 249.74 358.94 252.01 358.94 254.29 358.94 256.69 358.81 258.98 358.81 261.25 358.69 263.54 358.69 265.81 358.69 268.1 358.57 270.38 358.57 272.65 358.57 274.94 358.45 277.21 358.33 279.62 358.21 281.89 358.21 284.18 358.1 286.45 357.98 288.74 357.86 291.01 357.86 293.3 357.74 295.57 357.62 297.86 357.5 300.13 357.5 302.42 357.38 304.81 357.25 307.1 357.25 309.38 357.13 311.65 357.01 313.94 356.89 316.21 356.89 318.5 356.77 320.77 356.65 323.06 356.65 325.33 356.54 327.74 356.42 330.01 356.3 332.3 356.18 334.57 356.06 336.86 355.94 339.13 355.81 341.42 355.69 343.69 355.57 345.98 355.45 348.25 355.33 350.65 355.21 352.94 355.1 355.21 354.98 357.5 354.86 359.77 354.74 362.06 354.62 364.33 354.5 366.62 354.5 368.89 354.25 371.18 354.13 373.57 354.01 375.86 353.89 378.13 353.65 380.42 353.54 382.69 353.42 384.98 353.3 387.25 353.18 389.54 352.94 391.81 352.81 394.1 352.69 396.5 352.57 398.77 352.33 401.06 352.21 403.33 351.98 405.62 351.86 407.89 351.74 410.18 351.5 412.45 351.38 414.74 351.13 417.01 351.01 419.42 350.77 421.69 350.65 423.98 350.42 426.25 350.3 428.54 350.06 430.81 349.81 433.1 349.69 435.38 349.45 437.65 349.33 439.94 349.1 442.33 348.86 444.62 348.74 446.89 348.5 449.18 348.25 451.45 348.01 453.74 347.89 456.01 347.65 458.3 347.42 460.57 347.18 462.86 346.94 465.25 346.69 467.54 346.45 469.81 346.21 472.1 345.98 474.38 345.74 476.65 345.5 478.94 345.25 481.21 345.01 483.5 344.77 485.77 344.54 488.06 344.3 490.45 344.06 492.74 343.69 495.01 343.45 497.3 343.21 499.57 342.98 501.86 342.62 504.13 342.38 506.42 342.13 508.69 341.77 510.98 341.54 513.38 341.18 515.65 340.94 517.93 340.57 520.22 340.33 522.5 339.98 524.77 339.62 527.05 339.38 529.34 339.01 531.62 338.65 533.89 338.3 536.29 337.94 538.58 337.57 540.86 337.21 543.13 336.74 545.41 336.38 547.7 336.01 549.98 335.54 552.25 335.06 554.53 334.69 556.82 334.21 559.22 333.74 561.5 333.25 563.77 332.65 566.05 332.18 568.34 331.57 570.62 331.1 572.89 330.38 575.17 329.77 577.46 329.18 579.74 328.45 582.13 327.74 584.41 327.01 586.7 326.18 588.98 325.33 591.25 324.5 593.53 323.54 595.82 322.57 598.1 321.62 600.38 320.54 602.65 319.33 605.05 318.13 607.34 316.81 609.62 315.5 611.89 314.06 614.17 312.5 616.46 310.81 618.74 309.13 621.01 307.21 623.29 305.3 625.58 303.25 627.98 300.98 630.25 298.69 632.53 296.18 634.82 293.42 637.1 290.54 639.38 287.51"/></g><circle class="cls-8" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-9" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-8" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-9" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-8" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-9" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-8" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-9" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-8" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-9" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-8" cx="66.31" cy="362.96" r="1.98"/><circle class="cls-9" cx="66.31" cy="362.96" r="1.98"/><circle class="cls-8" cx="68.59" cy="362.96" r="1.98"/><circle class="cls-9" cx="68.59" cy="362.96" r="1.98"/><circle class="cls-8" cx="71" cy="362.96" r="1.98"/><circle class="cls-9" cx="71" cy="362.96" r="1.98"/><circle class="cls-8" cx="73.28" cy="362.84" r="1.98"/><circle class="cls-9" cx="73.28" cy="362.84" r="1.98"/><circle class="cls-8" cx="75.56" cy="362.84" r="1.98"/><circle class="cls-9" cx="75.56" cy="362.84" r="1.98"/><circle class="cls-8" cx="77.84" cy="362.84" r="1.98"/><circle class="cls-9" cx="77.84" cy="362.84" r="1.98"/><circle class="cls-8" cx="80.12" cy="362.72" r="1.98"/><circle class="cls-9" cx="80.12" cy="362.72" r="1.98"/><circle class="cls-8" cx="82.4" cy="362.72" r="1.98"/><circle class="cls-9" cx="82.4" cy="362.72" r="1.98"/><circle class="cls-8" cx="84.67" cy="362.72" r="1.98"/><circle class="cls-9" cx="84.67" cy="362.72" r="1.98"/><circle class="cls-8" cx="86.95" cy="362.6" r="1.98"/><circle class="cls-9" cx="86.95" cy="362.6" r="1.98"/><circle class="cls-8" cx="89.23" cy="362.6" r="1.98"/><circle class="cls-9" cx="89.23" cy="362.6" r="1.98"/><circle class="cls-8" cx="91.51" cy="362.48" r="1.98"/><circle class="cls-9" cx="91.51" cy="362.48" r="1.98"/><circle class="cls-8" cx="93.79" cy="362.48" r="1.98"/><circle class="cls-9" cx="93.79" cy="362.48" r="1.98"/><circle class="cls-8" cx="96.19" cy="362.48" r="1.98"/><circle class="cls-9" cx="96.19" cy="362.48" r="1.98"/><circle class="cls-8" cx="98.48" cy="362.36" r="1.98"/><circle class="cls-9" cx="98.48" cy="362.36" r="1.98"/><circle class="cls-8" cx="100.76" cy="362.36" r="1.98"/><circle class="cls-9" cx="100.76" cy="362.36" r="1.98"/><circle class="cls-8" cx="103.03" cy="362.24" r="1.98"/><circle class="cls-9" cx="103.03" cy="362.24" r="1.98"/><circle class="cls-8" cx="105.31" cy="362.24" r="1.98"/><circle class="cls-9" cx="105.31" cy="362.24" r="1.98"/><circle class="cls-8" cx="107.59" cy="362.12" r="1.98"/><circle class="cls-9" cx="107.59" cy="362.12" r="1.98"/><circle class="cls-8" cx="109.88" cy="362.12" r="1.98"/><circle class="cls-9" cx="109.88" cy="362.12" r="1.98"/><circle class="cls-8" cx="112.15" cy="362.12" r="1.98"/><circle class="cls-9" cx="112.15" cy="362.12" r="1.98"/><circle class="cls-8" cx="114.43" cy="362" r="1.98"/><circle class="cls-9" cx="114.43" cy="362" r="1.98"/><circle class="cls-8" cx="116.71" cy="362" r="1.98"/><circle class="cls-9" cx="116.71" cy="362" r="1.98"/><circle class="cls-8" cx="119.12" cy="361.88" r="1.98"/><circle class="cls-9" cx="119.12" cy="361.88" r="1.98"/><circle class="cls-8" cx="121.4" cy="361.88" r="1.98"/><circle class="cls-9" cx="121.4" cy="361.88" r="1.98"/><circle class="cls-8" cx="123.67" cy="361.76" r="1.98"/><circle class="cls-9" cx="123.67" cy="361.76" r="1.98"/><circle class="cls-8" cx="125.95" cy="361.76" r="1.98"/><circle class="cls-9" cx="125.95" cy="361.76" r="1.98"/><circle class="cls-8" cx="128.23" cy="361.64" r="1.98"/><circle class="cls-9" cx="128.23" cy="361.64" r="1.98"/><circle class="cls-8" cx="130.51" cy="361.64" r="1.98"/><circle class="cls-9" cx="130.51" cy="361.64" r="1.98"/><circle class="cls-8" cx="132.79" cy="361.64" r="1.98"/><circle class="cls-9" cx="132.79" cy="361.64" r="1.98"/><circle class="cls-8" cx="135.07" cy="361.52" r="1.98"/><circle class="cls-9" cx="135.07" cy="361.52" r="1.98"/><circle class="cls-8" cx="137.36" cy="361.52" r="1.98"/><circle class="cls-9" cx="137.36" cy="361.52" r="1.98"/><circle class="cls-8" cx="139.64" cy="361.4" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -233.76, 464.74)"/><circle class="cls-9" cx="139.64" cy="361.4" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -233.76, 464.74)"/><circle class="cls-8" cx="142.03" cy="361.4" r="1.98"/><circle class="cls-9" cx="142.03" cy="361.4" r="1.98"/><circle class="cls-8" cx="144.31" cy="361.28" r="1.98"/><circle class="cls-9" cx="144.31" cy="361.28" r="1.98"/><circle class="cls-8" cx="146.59" cy="361.28" r="1.98"/><circle class="cls-9" cx="146.59" cy="361.28" r="1.98"/><circle class="cls-8" cx="148.88" cy="361.16" r="1.98"/><circle class="cls-9" cx="148.88" cy="361.16" r="1.98"/><circle class="cls-8" cx="151.15" cy="361.16" r="1.98"/><circle class="cls-9" cx="151.15" cy="361.16" r="1.98"/><circle class="cls-8" cx="153.43" cy="361.04" r="1.98"/><circle class="cls-9" cx="153.43" cy="361.04" r="1.98"/><circle class="cls-8" cx="155.71" cy="361.04" r="1.98"/><circle class="cls-9" cx="155.71" cy="361.04" r="1.98"/><circle class="cls-8" cx="158" cy="361.04" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -216.85, 482.69)"/><circle class="cls-9" cx="158" cy="361.04" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -216.85, 482.69)"/><circle class="cls-8" cx="160.28" cy="360.92" r="1.98"/><circle class="cls-9" cx="160.28" cy="360.92" r="1.98"/><circle class="cls-8" cx="162.56" cy="360.92" r="1.98"/><circle class="cls-9" cx="162.56" cy="360.92" r="1.98"/><circle class="cls-8" cx="164.95" cy="360.8" r="1.98"/><circle class="cls-9" cx="164.95" cy="360.8" r="1.98"/><circle class="cls-8" cx="167.23" cy="360.8" r="1.98"/><circle class="cls-9" cx="167.23" cy="360.8" r="1.98"/><circle class="cls-8" cx="169.51" cy="360.68" r="1.98"/><circle class="cls-9" cx="169.51" cy="360.68" r="1.98"/><circle class="cls-8" cx="171.79" cy="360.68" r="1.98"/><circle class="cls-9" cx="171.79" cy="360.68" r="1.98"/><circle class="cls-8" cx="174.07" cy="360.56" r="1.98"/><circle class="cls-9" cx="174.07" cy="360.56" r="1.98"/><circle class="cls-8" cx="176.36" cy="360.56" r="1.98"/><circle class="cls-9" cx="176.36" cy="360.56" r="1.98"/><circle class="cls-8" cx="178.64" cy="360.56" r="1.98"/><circle class="cls-9" cx="178.64" cy="360.56" r="1.98"/><circle class="cls-8" cx="180.92" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -195.59, 504.95)"/><circle class="cls-9" cx="180.92" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -195.59, 504.95)"/><circle class="cls-8" cx="183.19" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -193.54, 507.22)"/><circle class="cls-9" cx="183.19" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -193.54, 507.22)"/><circle class="cls-8" cx="185.48" cy="360.32" r="1.98"/><circle class="cls-9" cx="185.48" cy="360.32" r="1.98"/><circle class="cls-8" cx="187.88" cy="360.32" r="1.98"/><circle class="cls-9" cx="187.88" cy="360.32" r="1.98"/><circle class="cls-8" cx="190.15" cy="360.2" r="1.98"/><circle class="cls-9" cx="190.15" cy="360.2" r="1.98"/><circle class="cls-8" cx="192.43" cy="360.2" r="1.98"/><circle class="cls-9" cx="192.43" cy="360.2" r="1.98"/><circle class="cls-8" cx="194.71" cy="360.08" r="1.98"/><circle class="cls-9" cx="194.71" cy="360.08" r="1.98"/><circle class="cls-8" cx="196.99" cy="360.08" r="1.98"/><circle class="cls-9" cx="196.99" cy="360.08" r="1.98"/><path class="cls-8" d="M201.26,360.08a2,2,0,1,1-2-2A2,2,0,0,1,201.26,360.08Z"/><path class="cls-9" d="M201.26,360.08a2,2,0,1,1-2-2A2,2,0,0,1,201.26,360.08Z"/><path class="cls-8" d="M203.53,360a2,2,0,1,1-2-2A2,2,0,0,1,203.53,360Z"/><path class="cls-9" d="M203.53,360a2,2,0,1,1-2-2A2,2,0,0,1,203.53,360Z"/><path class="cls-8" d="M205.81,360a2,2,0,1,1-2-2A2,2,0,0,1,205.81,360Z"/><path class="cls-9" d="M205.81,360a2,2,0,1,1-2-2A2,2,0,0,1,205.81,360Z"/><path class="cls-8" d="M208.09,359.84a2,2,0,1,1-2-2A2,2,0,0,1,208.09,359.84Z"/><path class="cls-9" d="M208.09,359.84a2,2,0,1,1-2-2A2,2,0,0,1,208.09,359.84Z"/><path class="cls-8" d="M210.37,359.84a2,2,0,1,1-2-2A2,2,0,0,1,210.37,359.84Z"/><path class="cls-9" d="M210.37,359.84a2,2,0,1,1-2-2A2,2,0,0,1,210.37,359.84Z"/><path class="cls-8" d="M212.77,359.84a2,2,0,1,1-2-2A2,2,0,0,1,212.77,359.84Z"/><path class="cls-9" d="M212.77,359.84a2,2,0,1,1-2-2A2,2,0,0,1,212.77,359.84Z"/><path class="cls-8" d="M215.05,359.72a2,2,0,1,1-2-2A2,2,0,0,1,215.05,359.72Z"/><path class="cls-9" d="M215.05,359.72a2,2,0,1,1-2-2A2,2,0,0,1,215.05,359.72Z"/><path class="cls-8" d="M217.33,359.72a2,2,0,1,1-2-2A2,2,0,0,1,217.33,359.72Z"/><path class="cls-9" d="M217.33,359.72a2,2,0,1,1-2-2A2,2,0,0,1,217.33,359.72Z"/><path class="cls-8" d="M219.61,359.6a2,2,0,1,1-2-2A2,2,0,0,1,219.61,359.6Z"/><path class="cls-9" d="M219.61,359.6a2,2,0,1,1-2-2A2,2,0,0,1,219.61,359.6Z"/><path class="cls-8" d="M221.89,359.6a2,2,0,1,1-2-2A2,2,0,0,1,221.89,359.6Z"/><path class="cls-9" d="M221.89,359.6a2,2,0,1,1-2-2A2,2,0,0,1,221.89,359.6Z"/><circle class="cls-8" cx="222.2" cy="359.48" r="1.98" transform="translate(-34.34 23.64) rotate(-5.65)"/><circle class="cls-9" cx="222.2" cy="359.48" r="1.98" transform="translate(-34.34 23.64) rotate(-5.65)"/><circle class="cls-8" cx="224.48" cy="359.48" r="1.98" transform="translate(-34.33 23.87) rotate(-5.65)"/><circle class="cls-9" cx="224.48" cy="359.48" r="1.98" transform="translate(-34.33 23.87) rotate(-5.65)"/><circle class="cls-8" cx="226.76" cy="359.48" r="1.98" transform="translate(-34.32 24.09) rotate(-5.65)"/><circle class="cls-9" cx="226.76" cy="359.48" r="1.98" transform="translate(-34.32 24.09) rotate(-5.65)"/><circle class="cls-8" cx="229.03" cy="359.36" r="1.98" transform="translate(-145.65 562.34) rotate(-85.93)"/><circle class="cls-9" cx="229.03" cy="359.36" r="1.98" transform="translate(-145.65 562.34) rotate(-85.93)"/><circle class="cls-8" cx="231.31" cy="359.36" r="1.98" transform="translate(-143.53 564.61) rotate(-85.93)"/><circle class="cls-9" cx="231.31" cy="359.36" r="1.98" transform="translate(-143.53 564.61) rotate(-85.93)"/><path class="cls-8" d="M235.7,359.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,359.24Z"/><path class="cls-9" d="M235.7,359.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,359.24Z"/><path class="cls-8" d="M238,359.24a2,2,0,1,1-2-2A2,2,0,0,1,238,359.24Z"/><path class="cls-9" d="M238,359.24a2,2,0,1,1-2-2A2,2,0,0,1,238,359.24Z"/><path class="cls-8" d="M240.26,359.12a2,2,0,1,1-2-2A2,2,0,0,1,240.26,359.12Z"/><path class="cls-9" d="M240.26,359.12a2,2,0,1,1-2-2A2,2,0,0,1,240.26,359.12Z"/><path class="cls-8" d="M242.53,359.12a2,2,0,1,1-2-2A2,2,0,0,1,242.53,359.12Z"/><path class="cls-9" d="M242.53,359.12a2,2,0,1,1-2-2A2,2,0,0,1,242.53,359.12Z"/><path class="cls-8" d="M244.81,359.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,359.12Z"/><path class="cls-9" d="M244.81,359.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,359.12Z"/><path class="cls-8" d="M247.09,359a2,2,0,1,1-2-2A2,2,0,0,1,247.09,359Z"/><path class="cls-9" d="M247.09,359a2,2,0,1,1-2-2A2,2,0,0,1,247.09,359Z"/><path class="cls-8" d="M249.37,359a2,2,0,1,1-2-2A2,2,0,0,1,249.37,359Z"/><path class="cls-9" d="M249.37,359a2,2,0,1,1-2-2A2,2,0,0,1,249.37,359Z"/><path class="cls-8" d="M251.65,358.88a2,2,0,1,1-2-2A2,2,0,0,1,251.65,358.88Z"/><path class="cls-9" d="M251.65,358.88a2,2,0,1,1-2-2A2,2,0,0,1,251.65,358.88Z"/><path class="cls-8" d="M253.93,358.88a2,2,0,1,1-2-2A2,2,0,0,1,253.93,358.88Z"/><path class="cls-9" d="M253.93,358.88a2,2,0,1,1-2-2A2,2,0,0,1,253.93,358.88Z"/><path class="cls-8" d="M256.21,358.88a2,2,0,1,1-2-2A2,2,0,0,1,256.21,358.88Z"/><path class="cls-9" d="M256.21,358.88a2,2,0,1,1-2-2A2,2,0,0,1,256.21,358.88Z"/><path class="cls-8" d="M258.61,358.76a2,2,0,1,1-2-2A2,2,0,0,1,258.61,358.76Z"/><path class="cls-9" d="M258.61,358.76a2,2,0,1,1-2-2A2,2,0,0,1,258.61,358.76Z"/><path class="cls-8" d="M260.89,358.76a2,2,0,1,1-2-2A2,2,0,0,1,260.89,358.76Z"/><path class="cls-9" d="M260.89,358.76a2,2,0,1,1-2-2A2,2,0,0,1,260.89,358.76Z"/><circle class="cls-8" cx="261.2" cy="358.64" r="1.98"/><circle class="cls-9" cx="261.2" cy="358.64" r="1.98"/><circle class="cls-8" cx="263.48" cy="358.64" r="1.98"/><circle class="cls-9" cx="263.48" cy="358.64" r="1.98"/><circle class="cls-8" cx="265.76" cy="358.64" r="1.98"/><circle class="cls-9" cx="265.76" cy="358.64" r="1.98"/><circle class="cls-8" cx="268.03" cy="358.52" r="1.98" transform="translate(-34.02 28.16) rotate(-5.65)"/><circle class="cls-9" cx="268.03" cy="358.52" r="1.98" transform="translate(-34.02 28.16) rotate(-5.65)"/><circle class="cls-8" cx="270.31" cy="358.52" r="1.98" transform="translate(-34.01 28.38) rotate(-5.65)"/><circle class="cls-9" cx="270.31" cy="358.52" r="1.98" transform="translate(-34.01 28.38) rotate(-5.65)"/><circle class="cls-8" cx="272.59" cy="358.52" r="1.98" transform="translate(-34 28.61) rotate(-5.65)"/><circle class="cls-9" cx="272.59" cy="358.52" r="1.98" transform="translate(-34 28.61) rotate(-5.65)"/><circle class="cls-8" cx="274.87" cy="358.4" r="1.98" transform="translate(-102.1 607.17) rotate(-85.93)"/><circle class="cls-9" cx="274.87" cy="358.4" r="1.98" transform="translate(-102.1 607.17) rotate(-85.93)"/><path class="cls-8" d="M279.14,358.28a2,2,0,1,1-2-2A2,2,0,0,1,279.14,358.28Z"/><path class="cls-9" d="M279.14,358.28a2,2,0,1,1-2-2A2,2,0,0,1,279.14,358.28Z"/><path class="cls-8" d="M281.53,358.16a2,2,0,1,1-2-2A2,2,0,0,1,281.53,358.16Z"/><path class="cls-9" d="M281.53,358.16a2,2,0,1,1-2-2A2,2,0,0,1,281.53,358.16Z"/><path class="cls-8" d="M283.81,358.16a2,2,0,1,1-2-2A2,2,0,0,1,283.81,358.16Z"/><path class="cls-9" d="M283.81,358.16a2,2,0,1,1-2-2A2,2,0,0,1,283.81,358.16Z"/><path class="cls-8" d="M286.09,358a2,2,0,1,1-2-2A2,2,0,0,1,286.09,358Z"/><path class="cls-9" d="M286.09,358a2,2,0,1,1-2-2A2,2,0,0,1,286.09,358Z"/><path class="cls-8" d="M288.37,357.92a2,2,0,1,1-2-2A2,2,0,0,1,288.37,357.92Z"/><path class="cls-9" d="M288.37,357.92a2,2,0,1,1-2-2A2,2,0,0,1,288.37,357.92Z"/><path class="cls-8" d="M290.65,357.8a2,2,0,1,1-2-2A2,2,0,0,1,290.65,357.8Z"/><path class="cls-9" d="M290.65,357.8a2,2,0,1,1-2-2A2,2,0,0,1,290.65,357.8Z"/><path class="cls-8" d="M292.93,357.8a2,2,0,1,1-2-2A2,2,0,0,1,292.93,357.8Z"/><path class="cls-9" d="M292.93,357.8a2,2,0,1,1-2-2A2,2,0,0,1,292.93,357.8Z"/><path class="cls-8" d="M295.21,357.68a2,2,0,1,1-2-2A2,2,0,0,1,295.21,357.68Z"/><path class="cls-9" d="M295.21,357.68a2,2,0,1,1-2-2A2,2,0,0,1,295.21,357.68Z"/><path class="cls-8" d="M297.49,357.56a2,2,0,1,1-2-2A2,2,0,0,1,297.49,357.56Z"/><path class="cls-9" d="M297.49,357.56a2,2,0,1,1-2-2A2,2,0,0,1,297.49,357.56Z"/><path class="cls-8" d="M299.77,357.44a2,2,0,1,1-2-2A2,2,0,0,1,299.77,357.44Z"/><path class="cls-9" d="M299.77,357.44a2,2,0,1,1-2-2A2,2,0,0,1,299.77,357.44Z"/><path class="cls-8" d="M302.05,357.44a2,2,0,1,1-2-2A2,2,0,0,1,302.05,357.44Z"/><path class="cls-9" d="M302.05,357.44a2,2,0,1,1-2-2A2,2,0,0,1,302.05,357.44Z"/><path class="cls-8" d="M304.33,357.32a2,2,0,1,1-2-2A2,2,0,0,1,304.33,357.32Z"/><path class="cls-9" d="M304.33,357.32a2,2,0,1,1-2-2A2,2,0,0,1,304.33,357.32Z"/><circle class="cls-8" cx="304.76" cy="357.2" r="1.98" transform="translate(-33.71 31.77) rotate(-5.65)"/><circle class="cls-9" cx="304.76" cy="357.2" r="1.98" transform="translate(-33.71 31.77) rotate(-5.65)"/><circle class="cls-8" cx="307.03" cy="357.2" r="1.98" transform="translate(-33.7 31.99) rotate(-5.65)"/><circle class="cls-9" cx="307.03" cy="357.2" r="1.98" transform="translate(-33.7 31.99) rotate(-5.65)"/><circle class="cls-8" cx="309.31" cy="357.08" r="1.98" transform="translate(-68.79 640.3) rotate(-85.93)"/><circle class="cls-9" cx="309.31" cy="357.08" r="1.98" transform="translate(-68.79 640.3) rotate(-85.93)"/><circle class="cls-8" cx="311.59" cy="356.96" r="1.98"/><circle class="cls-9" cx="311.59" cy="356.96" r="1.98"/><circle class="cls-8" cx="313.87" cy="356.84" r="1.98" transform="translate(-33.63 32.67) rotate(-5.65)"/><circle class="cls-9" cx="313.87" cy="356.84" r="1.98" transform="translate(-33.63 32.67) rotate(-5.65)"/><path class="cls-8" d="M318.14,356.84a2,2,0,1,1-2-2A2,2,0,0,1,318.14,356.84Z"/><path class="cls-9" d="M318.14,356.84a2,2,0,1,1-2-2A2,2,0,0,1,318.14,356.84Z"/><path class="cls-8" d="M320.42,356.72a2,2,0,1,1-2-2A2,2,0,0,1,320.42,356.72Z"/><path class="cls-9" d="M320.42,356.72a2,2,0,1,1-2-2A2,2,0,0,1,320.42,356.72Z"/><path class="cls-8" d="M322.7,356.6a2,2,0,1,1-2-2A2,2,0,0,1,322.7,356.6Z"/><path class="cls-9" d="M322.7,356.6a2,2,0,1,1-2-2A2,2,0,0,1,322.7,356.6Z"/><path class="cls-8" d="M325,356.6a2,2,0,1,1-2-2A2,2,0,0,1,325,356.6Z"/><path class="cls-9" d="M325,356.6a2,2,0,1,1-2-2A2,2,0,0,1,325,356.6Z"/><path class="cls-8" d="M327.26,356.48a2,2,0,1,1-2-2A2,2,0,0,1,327.26,356.48Z"/><path class="cls-9" d="M327.26,356.48a2,2,0,1,1-2-2A2,2,0,0,1,327.26,356.48Z"/><path class="cls-8" d="M329.65,356.36a2,2,0,1,1-2-2A2,2,0,0,1,329.65,356.36Z"/><path class="cls-9" d="M329.65,356.36a2,2,0,1,1-2-2A2,2,0,0,1,329.65,356.36Z"/><path class="cls-8" d="M331.93,356.24a2,2,0,1,1-2-2A2,2,0,0,1,331.93,356.24Z"/><path class="cls-9" d="M331.93,356.24a2,2,0,1,1-2-2A2,2,0,0,1,331.93,356.24Z"/><path class="cls-8" d="M334.21,356.12a2,2,0,1,1-2-2A2,2,0,0,1,334.21,356.12Z"/><path class="cls-9" d="M334.21,356.12a2,2,0,1,1-2-2A2,2,0,0,1,334.21,356.12Z"/><path class="cls-8" d="M336.49,356a2,2,0,1,1-2-2A2,2,0,0,1,336.49,356Z"/><path class="cls-9" d="M336.49,356a2,2,0,1,1-2-2A2,2,0,0,1,336.49,356Z"/><path class="cls-8" d="M338.77,355.88a2,2,0,1,1-2-2A2,2,0,0,1,338.77,355.88Z"/><path class="cls-9" d="M338.77,355.88a2,2,0,1,1-2-2A2,2,0,0,1,338.77,355.88Z"/><path class="cls-8" d="M341.05,355.76a2,2,0,1,1-2-2A2,2,0,0,1,341.05,355.76Z"/><path class="cls-9" d="M341.05,355.76a2,2,0,1,1-2-2A2,2,0,0,1,341.05,355.76Z"/><path class="cls-8" d="M343.33,355.64a2,2,0,1,1-2-2A2,2,0,0,1,343.33,355.64Z"/><path class="cls-9" d="M343.33,355.64a2,2,0,1,1-2-2A2,2,0,0,1,343.33,355.64Z"/><path class="cls-8" d="M345.61,355.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,355.52Z"/><path class="cls-9" d="M345.61,355.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,355.52Z"/><path class="cls-8" d="M347.89,355.4a2,2,0,1,1-2-2A2,2,0,0,1,347.89,355.4Z"/><path class="cls-9" d="M347.89,355.4a2,2,0,1,1-2-2A2,2,0,0,1,347.89,355.4Z"/><circle class="cls-8" cx="348.2" cy="355.28" r="1.98"/><circle class="cls-9" cx="348.2" cy="355.28" r="1.98"/><circle class="cls-8" cx="350.59" cy="355.16" r="1.98" transform="translate(-33.29 36.28) rotate(-5.65)"/><circle class="cls-9" cx="350.59" cy="355.16" r="1.98" transform="translate(-33.29 36.28) rotate(-5.65)"/><circle class="cls-8" cx="352.87" cy="355.04" r="1.98"/><circle class="cls-9" cx="352.87" cy="355.04" r="1.98"/><path class="cls-8" d="M357.14,354.92a2,2,0,1,1-2-2A2,2,0,0,1,357.14,354.92Z"/><path class="cls-9" d="M357.14,354.92a2,2,0,1,1-2-2A2,2,0,0,1,357.14,354.92Z"/><path class="cls-8" d="M359.42,354.8a2,2,0,1,1-2-2A2,2,0,0,1,359.42,354.8Z"/><path class="cls-9" d="M359.42,354.8a2,2,0,1,1-2-2A2,2,0,0,1,359.42,354.8Z"/><path class="cls-8" d="M361.7,354.68a2,2,0,1,1-2-2A2,2,0,0,1,361.7,354.68Z"/><path class="cls-9" d="M361.7,354.68a2,2,0,1,1-2-2A2,2,0,0,1,361.7,354.68Z"/><path class="cls-8" d="M364,354.56a2,2,0,1,1-2-2A2,2,0,0,1,364,354.56Z"/><path class="cls-9" d="M364,354.56a2,2,0,1,1-2-2A2,2,0,0,1,364,354.56Z"/><path class="cls-8" d="M366.26,354.44a2,2,0,1,1-2-2A2,2,0,0,1,366.26,354.44Z"/><path class="cls-9" d="M366.26,354.44a2,2,0,1,1-2-2A2,2,0,0,1,366.26,354.44Z"/><path class="cls-8" d="M368.53,354.44a2,2,0,1,1-2-2A2,2,0,0,1,368.53,354.44Z"/><path class="cls-9" d="M368.53,354.44a2,2,0,1,1-2-2A2,2,0,0,1,368.53,354.44Z"/><path class="cls-8" d="M370.81,354.2a2,2,0,1,1-2-2A2,2,0,0,1,370.81,354.2Z"/><path class="cls-9" d="M370.81,354.2a2,2,0,1,1-2-2A2,2,0,0,1,370.81,354.2Z"/><path class="cls-8" d="M373.09,354.08a2,2,0,1,1-2-2A2,2,0,0,1,373.09,354.08Z"/><path class="cls-9" d="M373.09,354.08a2,2,0,1,1-2-2A2,2,0,0,1,373.09,354.08Z"/><path class="cls-8" d="M375.49,354a2,2,0,1,1-2-2A2,2,0,0,1,375.49,354Z"/><path class="cls-9" d="M375.49,354a2,2,0,1,1-2-2A2,2,0,0,1,375.49,354Z"/><path class="cls-8" d="M377.77,353.84a2,2,0,1,1-2-2A2,2,0,0,1,377.77,353.84Z"/><path class="cls-9" d="M377.77,353.84a2,2,0,1,1-2-2A2,2,0,0,1,377.77,353.84Z"/><path class="cls-8" d="M380.05,353.6a2,2,0,1,1-2-2A2,2,0,0,1,380.05,353.6Z"/><path class="cls-9" d="M380.05,353.6a2,2,0,1,1-2-2A2,2,0,0,1,380.05,353.6Z"/><path class="cls-8" d="M382.33,353.48a2,2,0,1,1-2-2A2,2,0,0,1,382.33,353.48Z"/><path class="cls-9" d="M382.33,353.48a2,2,0,1,1-2-2A2,2,0,0,1,382.33,353.48Z"/><path class="cls-8" d="M384.61,353.36a2,2,0,1,1-2-2A2,2,0,0,1,384.61,353.36Z"/><path class="cls-9" d="M384.61,353.36a2,2,0,1,1-2-2A2,2,0,0,1,384.61,353.36Z"/><path class="cls-8" d="M386.89,353.24a2,2,0,1,1-2-2A2,2,0,0,1,386.89,353.24Z"/><path class="cls-9" d="M386.89,353.24a2,2,0,1,1-2-2A2,2,0,0,1,386.89,353.24Z"/><circle class="cls-8" cx="387.2" cy="353.12" r="1.98" transform="translate(-23.38 678.75) rotate(-80.78)"/><circle class="cls-9" cx="387.2" cy="353.12" r="1.98" transform="translate(-23.38 678.75) rotate(-80.78)"/><circle class="cls-8" cx="389.48" cy="352.88" r="1.98" transform="translate(-32.88 40.1) rotate(-5.65)"/><circle class="cls-9" cx="389.48" cy="352.88" r="1.98" transform="translate(-32.88 40.1) rotate(-5.65)"/><circle class="cls-8" cx="391.76" cy="352.76" r="1.98"/><circle class="cls-9" cx="391.76" cy="352.76" r="1.98"/><circle class="cls-8" cx="394.03" cy="352.64" r="1.98"/><circle class="cls-9" cx="394.03" cy="352.64" r="1.98"/><path class="cls-8" d="M398.42,352.52a2,2,0,1,1-2-2A2,2,0,0,1,398.42,352.52Z"/><path class="cls-9" d="M398.42,352.52a2,2,0,1,1-2-2A2,2,0,0,1,398.42,352.52Z"/><path class="cls-8" d="M400.7,352.28a2,2,0,1,1-2-2A2,2,0,0,1,400.7,352.28Z"/><path class="cls-9" d="M400.7,352.28a2,2,0,1,1-2-2A2,2,0,0,1,400.7,352.28Z"/><path class="cls-8" d="M403,352.16a2,2,0,1,1-2-2A2,2,0,0,1,403,352.16Z"/><path class="cls-9" d="M403,352.16a2,2,0,1,1-2-2A2,2,0,0,1,403,352.16Z"/><path class="cls-8" d="M405.26,351.92a2,2,0,1,1-2-2A2,2,0,0,1,405.26,351.92Z"/><path class="cls-9" d="M405.26,351.92a2,2,0,1,1-2-2A2,2,0,0,1,405.26,351.92Z"/><path class="cls-8" d="M407.53,351.8a2,2,0,1,1-2-2A2,2,0,0,1,407.53,351.8Z"/><path class="cls-9" d="M407.53,351.8a2,2,0,1,1-2-2A2,2,0,0,1,407.53,351.8Z"/><path class="cls-8" d="M409.81,351.68a2,2,0,1,1-2-2A2,2,0,0,1,409.81,351.68Z"/><path class="cls-9" d="M409.81,351.68a2,2,0,1,1-2-2A2,2,0,0,1,409.81,351.68Z"/><path class="cls-8" d="M412.09,351.44a2,2,0,1,1-2-2A2,2,0,0,1,412.09,351.44Z"/><path class="cls-9" d="M412.09,351.44a2,2,0,1,1-2-2A2,2,0,0,1,412.09,351.44Z"/><path class="cls-8" d="M414.37,351.32a2,2,0,1,1-2-2A2,2,0,0,1,414.37,351.32Z"/><path class="cls-9" d="M414.37,351.32a2,2,0,1,1-2-2A2,2,0,0,1,414.37,351.32Z"/><path class="cls-8" d="M416.65,351.08a2,2,0,1,1-2-2A2,2,0,0,1,416.65,351.08Z"/><path class="cls-9" d="M416.65,351.08a2,2,0,1,1-2-2A2,2,0,0,1,416.65,351.08Z"/><path class="cls-8" d="M418.93,351a2,2,0,1,1-2-2A2,2,0,0,1,418.93,351Z"/><path class="cls-9" d="M418.93,351a2,2,0,1,1-2-2A2,2,0,0,1,418.93,351Z"/><path class="cls-8" d="M421.33,350.72a2,2,0,1,1-2-2A2,2,0,0,1,421.33,350.72Z"/><path class="cls-9" d="M421.33,350.72a2,2,0,1,1-2-2A2,2,0,0,1,421.33,350.72Z"/><path class="cls-8" d="M423.61,350.6a2,2,0,1,1-2-2A2,2,0,0,1,423.61,350.6Z"/><path class="cls-9" d="M423.61,350.6a2,2,0,1,1-2-2A2,2,0,0,1,423.61,350.6Z"/><path class="cls-8" d="M425.89,350.36a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.36Z"/><path class="cls-9" d="M425.89,350.36a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.36Z"/><circle class="cls-8" cx="426.2" cy="350.24" r="1.98"/><circle class="cls-9" cx="426.2" cy="350.24" r="1.98"/><circle class="cls-8" cx="428.48" cy="350" r="1.98"/><circle class="cls-9" cx="428.48" cy="350" r="1.98"/><circle class="cls-8" cx="430.76" cy="349.76" r="1.98"/><circle class="cls-9" cx="430.76" cy="349.76" r="1.98"/><circle class="cls-8" cx="433.03" cy="349.64" r="1.98"/><circle class="cls-9" cx="433.03" cy="349.64" r="1.98"/><circle class="cls-8" cx="435.31" cy="349.4" r="1.98"/><circle class="cls-9" cx="435.31" cy="349.4" r="1.98"/><circle class="cls-8" cx="437.59" cy="349.28" r="1.98" transform="translate(22.73 725.27) rotate(-80.78)"/><circle class="cls-9" cx="437.59" cy="349.28" r="1.98" transform="translate(22.73 725.27) rotate(-80.78)"/><circle class="cls-8" cx="439.87" cy="349.04" r="1.98"/><circle class="cls-9" cx="439.87" cy="349.04" r="1.98"/><path class="cls-8" d="M444.26,348.8a2,2,0,1,1-2-2A2,2,0,0,1,444.26,348.8Z"/><path class="cls-9" d="M444.26,348.8a2,2,0,1,1-2-2A2,2,0,0,1,444.26,348.8Z"/><path class="cls-8" d="M446.53,348.68a2,2,0,1,1-2-2A2,2,0,0,1,446.53,348.68Z"/><path class="cls-9" d="M446.53,348.68a2,2,0,1,1-2-2A2,2,0,0,1,446.53,348.68Z"/><path class="cls-8" d="M448.81,348.44a2,2,0,1,1-2-2A2,2,0,0,1,448.81,348.44Z"/><path class="cls-9" d="M448.81,348.44a2,2,0,1,1-2-2A2,2,0,0,1,448.81,348.44Z"/><path class="cls-8" d="M451.09,348.2a2,2,0,1,1-2-2A2,2,0,0,1,451.09,348.2Z"/><path class="cls-9" d="M451.09,348.2a2,2,0,1,1-2-2A2,2,0,0,1,451.09,348.2Z"/><path class="cls-8" d="M453.38,348a2,2,0,1,1-2-2A2,2,0,0,1,453.38,348Z"/><path class="cls-9" d="M453.38,348a2,2,0,1,1-2-2A2,2,0,0,1,453.38,348Z"/><path class="cls-8" d="M455.65,347.84a2,2,0,1,1-2-2A2,2,0,0,1,455.65,347.84Z"/><path class="cls-9" d="M455.65,347.84a2,2,0,1,1-2-2A2,2,0,0,1,455.65,347.84Z"/><path class="cls-8" d="M457.94,347.6a2,2,0,1,1-2-2A2,2,0,0,1,457.94,347.6Z"/><path class="cls-9" d="M457.94,347.6a2,2,0,1,1-2-2A2,2,0,0,1,457.94,347.6Z"/><path class="cls-8" d="M460.21,347.36a2,2,0,1,1-2-2A2,2,0,0,1,460.21,347.36Z"/><path class="cls-9" d="M460.21,347.36a2,2,0,1,1-2-2A2,2,0,0,1,460.21,347.36Z"/><path class="cls-8" d="M462.5,347.12a2,2,0,1,1-2-2A2,2,0,0,1,462.5,347.12Z"/><path class="cls-9" d="M462.5,347.12a2,2,0,1,1-2-2A2,2,0,0,1,462.5,347.12Z"/><circle class="cls-8" cx="462.8" cy="346.88" r="1.98"/><circle class="cls-9" cx="462.8" cy="346.88" r="1.98"/><path class="cls-8" d="M467.18,346.64a2,2,0,1,1-2-2A2,2,0,0,1,467.18,346.64Z"/><path class="cls-9" d="M467.18,346.64a2,2,0,1,1-2-2A2,2,0,0,1,467.18,346.64Z"/><circle class="cls-8" cx="467.48" cy="346.4" r="1.98"/><circle class="cls-9" cx="467.48" cy="346.4" r="1.98"/><path class="cls-8" d="M471.74,346.16a2,2,0,1,1-2-2A2,2,0,0,1,471.74,346.16Z"/><path class="cls-9" d="M471.74,346.16a2,2,0,1,1-2-2A2,2,0,0,1,471.74,346.16Z"/><circle class="cls-8" cx="472.03" cy="345.92" r="1.98" transform="translate(-31.79 48.2) rotate(-5.65)"/><circle class="cls-9" cx="472.03" cy="345.92" r="1.98" transform="translate(-31.79 48.2) rotate(-5.65)"/><path class="cls-8" d="M476.3,345.68a2,2,0,1,1-2-2A2,2,0,0,1,476.3,345.68Z"/><path class="cls-9" d="M476.3,345.68a2,2,0,1,1-2-2A2,2,0,0,1,476.3,345.68Z"/><circle class="cls-8" cx="476.59" cy="345.44" r="1.98" transform="translate(98.24 796.34) rotate(-85.93)"/><circle class="cls-9" cx="476.59" cy="345.44" r="1.98" transform="translate(98.24 796.34) rotate(-85.93)"/><path class="cls-8" d="M480.86,345.2a2,2,0,1,1-2-2A2,2,0,0,1,480.86,345.2Z"/><path class="cls-9" d="M480.86,345.2a2,2,0,1,1-2-2A2,2,0,0,1,480.86,345.2Z"/><circle class="cls-8" cx="481.15" cy="344.96" r="1.98"/><circle class="cls-9" cx="481.15" cy="344.96" r="1.98"/><path class="cls-8" d="M485.42,344.72a2,2,0,1,1-2-2A2,2,0,0,1,485.42,344.72Z"/><path class="cls-9" d="M485.42,344.72a2,2,0,1,1-2-2A2,2,0,0,1,485.42,344.72Z"/><circle class="cls-8" cx="485.71" cy="344.48" r="1.98" transform="translate(-31.58 49.54) rotate(-5.65)"/><circle class="cls-9" cx="485.71" cy="344.48" r="1.98" transform="translate(-31.58 49.54) rotate(-5.65)"/><path class="cls-8" d="M490,344.24a2,2,0,1,1-2-2A2,2,0,0,1,490,344.24Z"/><path class="cls-9" d="M490,344.24a2,2,0,1,1-2-2A2,2,0,0,1,490,344.24Z"/><path class="cls-8" d="M492.38,344a2,2,0,1,1-2-2A2,2,0,0,1,492.38,344Z"/><path class="cls-9" d="M492.38,344a2,2,0,1,1-2-2A2,2,0,0,1,492.38,344Z"/><path class="cls-8" d="M494.65,343.64a2,2,0,1,1-2-2A2,2,0,0,1,494.65,343.64Z"/><path class="cls-9" d="M494.65,343.64a2,2,0,1,1-2-2A2,2,0,0,1,494.65,343.64Z"/><path class="cls-8" d="M496.94,343.4a2,2,0,1,1-2-2A2,2,0,0,1,496.94,343.4Z"/><path class="cls-9" d="M496.94,343.4a2,2,0,1,1-2-2A2,2,0,0,1,496.94,343.4Z"/><path class="cls-8" d="M499.21,343.16a2,2,0,1,1-2-2A2,2,0,0,1,499.21,343.16Z"/><path class="cls-9" d="M499.21,343.16a2,2,0,1,1-2-2A2,2,0,0,1,499.21,343.16Z"/><path class="cls-8" d="M501.5,342.92a2,2,0,1,1-2-2A2,2,0,0,1,501.5,342.92Z"/><path class="cls-9" d="M501.5,342.92a2,2,0,1,1-2-2A2,2,0,0,1,501.5,342.92Z"/><circle class="cls-8" cx="501.8" cy="342.56" r="1.98" transform="translate(-31.31 51.11) rotate(-5.65)"/><circle class="cls-9" cx="501.8" cy="342.56" r="1.98" transform="translate(-31.31 51.11) rotate(-5.65)"/><path class="cls-8" d="M506.06,342.32a2,2,0,1,1-2-2A2,2,0,0,1,506.06,342.32Z"/><path class="cls-9" d="M506.06,342.32a2,2,0,1,1-2-2A2,2,0,0,1,506.06,342.32Z"/><circle class="cls-8" cx="506.36" cy="342.08" r="1.98" transform="translate(129.25 822.91) rotate(-85.93)"/><circle class="cls-9" cx="506.36" cy="342.08" r="1.98" transform="translate(129.25 822.91) rotate(-85.93)"/><path class="cls-8" d="M510.62,341.72a2,2,0,1,1-2-2A2,2,0,0,1,510.62,341.72Z"/><path class="cls-9" d="M510.62,341.72a2,2,0,1,1-2-2A2,2,0,0,1,510.62,341.72Z"/><circle class="cls-8" cx="510.92" cy="341.48" r="1.98" transform="translate(-31.16 52.01) rotate(-5.65)"/><circle class="cls-9" cx="510.92" cy="341.48" r="1.98" transform="translate(-31.16 52.01) rotate(-5.65)"/><path class="cls-8" d="M515.3,341.12a2,2,0,1,1-2-2A2,2,0,0,1,515.3,341.12Z"/><path class="cls-9" d="M515.3,341.12a2,2,0,1,1-2-2A2,2,0,0,1,515.3,341.12Z"/><circle class="cls-8" cx="515.59" cy="340.88" r="1.98"/><circle class="cls-9" cx="515.59" cy="340.88" r="1.98"/><path class="cls-8" d="M519.86,340.52a2,2,0,1,1-2-2A2,2,0,0,1,519.86,340.52Z"/><path class="cls-9" d="M519.86,340.52a2,2,0,1,1-2-2A2,2,0,0,1,519.86,340.52Z"/><circle class="cls-8" cx="520.15" cy="340.28" r="1.98" transform="translate(-31 52.91) rotate(-5.65)"/><circle class="cls-9" cx="520.15" cy="340.28" r="1.98" transform="translate(-31 52.91) rotate(-5.65)"/><path class="cls-8" d="M524.42,339.92a2,2,0,1,1-2-2A2,2,0,0,1,524.42,339.92Z"/><path class="cls-9" d="M524.42,339.92a2,2,0,1,1-2-2A2,2,0,0,1,524.42,339.92Z"/><circle class="cls-8" cx="524.71" cy="339.56" r="1.98" transform="translate(-30.91 53.36) rotate(-5.65)"/><circle class="cls-9" cx="524.71" cy="339.56" r="1.98" transform="translate(-30.91 53.36) rotate(-5.65)"/><path class="cls-8" d="M529,339.32a2,2,0,1,1-2-2A2,2,0,0,1,529,339.32Z"/><path class="cls-9" d="M529,339.32a2,2,0,1,1-2-2A2,2,0,0,1,529,339.32Z"/><circle class="cls-8" cx="529.27" cy="338.96" r="1.98"/><circle class="cls-9" cx="529.27" cy="338.96" r="1.98"/><path class="cls-8" d="M533.53,338.6a2,2,0,1,1-2-2A2,2,0,0,1,533.53,338.6Z"/><path class="cls-9" d="M533.53,338.6a2,2,0,1,1-2-2A2,2,0,0,1,533.53,338.6Z"/><path class="cls-8" d="M535.82,338.24a2,2,0,1,1-2-2A2,2,0,0,1,535.82,338.24Z"/><path class="cls-9" d="M535.82,338.24a2,2,0,1,1-2-2A2,2,0,0,1,535.82,338.24Z"/><path class="cls-8" d="M538.21,337.88a2,2,0,1,1-2-2A2,2,0,0,1,538.21,337.88Z"/><path class="cls-9" d="M538.21,337.88a2,2,0,1,1-2-2A2,2,0,0,1,538.21,337.88Z"/><path class="cls-8" d="M540.5,337.52a2,2,0,1,1-2-2A2,2,0,0,1,540.5,337.52Z"/><path class="cls-9" d="M540.5,337.52a2,2,0,1,1-2-2A2,2,0,0,1,540.5,337.52Z"/><circle class="cls-8" cx="540.8" cy="337.16" r="1.98"/><circle class="cls-9" cx="540.8" cy="337.16" r="1.98"/><path class="cls-8" d="M545.06,336.68a2,2,0,1,1-2-2A2,2,0,0,1,545.06,336.68Z"/><path class="cls-9" d="M545.06,336.68a2,2,0,1,1-2-2A2,2,0,0,1,545.06,336.68Z"/><circle class="cls-8" cx="545.36" cy="336.32" r="1.98"/><circle class="cls-9" cx="545.36" cy="336.32" r="1.98"/><path class="cls-8" d="M549.62,336a2,2,0,1,1-2-2A2,2,0,0,1,549.62,336Z"/><path class="cls-9" d="M549.62,336a2,2,0,1,1-2-2A2,2,0,0,1,549.62,336Z"/><circle class="cls-8" cx="549.92" cy="335.48" r="1.98" transform="translate(-30.38 55.82) rotate(-5.65)"/><circle class="cls-9" cx="549.92" cy="335.48" r="1.98" transform="translate(-30.38 55.82) rotate(-5.65)"/><path class="cls-8" d="M554.18,335a2,2,0,1,1-2-2A2,2,0,0,1,554.18,335Z"/><path class="cls-9" d="M554.18,335a2,2,0,1,1-2-2A2,2,0,0,1,554.18,335Z"/><circle class="cls-8" cx="554.48" cy="334.64" r="1.98" transform="translate(181.38 863.99) rotate(-85.93)"/><circle class="cls-9" cx="554.48" cy="334.64" r="1.98" transform="translate(181.38 863.99) rotate(-85.93)"/><path class="cls-8" d="M558.74,334.16a2,2,0,1,1-2-2A2,2,0,0,1,558.74,334.16Z"/><path class="cls-9" d="M558.74,334.16a2,2,0,1,1-2-2A2,2,0,0,1,558.74,334.16Z"/><circle class="cls-8" cx="559.15" cy="333.68" r="1.98"/><circle class="cls-9" cx="559.15" cy="333.68" r="1.98"/><path class="cls-8" d="M563.42,333.2a2,2,0,1,1-2-2A2,2,0,0,1,563.42,333.2Z"/><path class="cls-9" d="M563.42,333.2a2,2,0,1,1-2-2A2,2,0,0,1,563.42,333.2Z"/><circle class="cls-8" cx="563.71" cy="332.6" r="1.98"/><circle class="cls-9" cx="563.71" cy="332.6" r="1.98"/><path class="cls-8" d="M568,332.12a2,2,0,1,1-2-2A2,2,0,0,1,568,332.12Z"/><path class="cls-9" d="M568,332.12a2,2,0,1,1-2-2A2,2,0,0,1,568,332.12Z"/><circle class="cls-8" cx="568.27" cy="331.52" r="1.98"/><circle class="cls-9" cx="568.27" cy="331.52" r="1.98"/><path class="cls-8" d="M572.53,331a2,2,0,1,1-2-2A2,2,0,0,1,572.53,331Z"/><path class="cls-9" d="M572.53,331a2,2,0,1,1-2-2A2,2,0,0,1,572.53,331Z"/><path class="cls-8" d="M574.82,330.32a2,2,0,1,1-2-2A2,2,0,0,1,574.82,330.32Z"/><path class="cls-9" d="M574.82,330.32a2,2,0,1,1-2-2A2,2,0,0,1,574.82,330.32Z"/><path class="cls-8" d="M577.09,329.72a2,2,0,1,1-2-2A2,2,0,0,1,577.09,329.72Z"/><path class="cls-9" d="M577.09,329.72a2,2,0,1,1-2-2A2,2,0,0,1,577.09,329.72Z"/><path class="cls-8" d="M579.38,329.12a2,2,0,1,1-2-2A2,2,0,0,1,579.38,329.12Z"/><path class="cls-9" d="M579.38,329.12a2,2,0,1,1-2-2A2,2,0,0,1,579.38,329.12Z"/><path class="cls-8" d="M581.65,328.4a2,2,0,1,1-2-2A2,2,0,0,1,581.65,328.4Z"/><path class="cls-9" d="M581.65,328.4a2,2,0,1,1-2-2A2,2,0,0,1,581.65,328.4Z"/><path class="cls-8" d="M584.06,327.68a2,2,0,1,1-2-2A2,2,0,0,1,584.06,327.68Z"/><path class="cls-9" d="M584.06,327.68a2,2,0,1,1-2-2A2,2,0,0,1,584.06,327.68Z"/><circle class="cls-8" cx="584.36" cy="326.96" r="1.98"/><circle class="cls-9" cx="584.36" cy="326.96" r="1.98"/><path class="cls-8" d="M588.62,326.12a2,2,0,1,1-2-2A2,2,0,0,1,588.62,326.12Z"/><path class="cls-9" d="M588.62,326.12a2,2,0,1,1-2-2A2,2,0,0,1,588.62,326.12Z"/><circle class="cls-8" cx="588.92" cy="325.28" r="1.98" transform="translate(-29.19 59.61) rotate(-5.65)"/><circle class="cls-9" cx="588.92" cy="325.28" r="1.98" transform="translate(-29.19 59.61) rotate(-5.65)"/><path class="cls-8" d="M593.18,324.44a2,2,0,1,1-2-2A2,2,0,0,1,593.18,324.44Z"/><path class="cls-9" d="M593.18,324.44a2,2,0,1,1-2-2A2,2,0,0,1,593.18,324.44Z"/><circle class="cls-8" cx="593.48" cy="323.48" r="1.98" transform="translate(-28.99 60.05) rotate(-5.65)"/><circle class="cls-9" cx="593.48" cy="323.48" r="1.98" transform="translate(-28.99 60.05) rotate(-5.65)"/><path class="cls-8" d="M597.74,322.52a2,2,0,1,1-2-2A2,2,0,0,1,597.74,322.52Z"/><path class="cls-9" d="M597.74,322.52a2,2,0,1,1-2-2A2,2,0,0,1,597.74,322.52Z"/><circle class="cls-8" cx="598.03" cy="321.56" r="1.98" transform="translate(-28.77 60.49) rotate(-5.65)"/><circle class="cls-9" cx="598.03" cy="321.56" r="1.98" transform="translate(-28.77 60.49) rotate(-5.65)"/><path class="cls-8" d="M602.3,320.48a2,2,0,1,1-2-2A2,2,0,0,1,602.3,320.48Z"/><path class="cls-9" d="M602.3,320.48a2,2,0,1,1-2-2A2,2,0,0,1,602.3,320.48Z"/><circle class="cls-8" cx="602.59" cy="319.28" r="1.98" transform="translate(-28.53 60.93) rotate(-5.65)"/><circle class="cls-9" cx="602.59" cy="319.28" r="1.98" transform="translate(-28.53 60.93) rotate(-5.65)"/><path class="cls-8" d="M607,318.08a2,2,0,1,1-2-2A2,2,0,0,1,607,318.08Z"/><path class="cls-9" d="M607,318.08a2,2,0,1,1-2-2A2,2,0,0,1,607,318.08Z"/><circle class="cls-8" cx="607.27" cy="316.76" r="1.98"/><circle class="cls-9" cx="607.27" cy="316.76" r="1.98"/><path class="cls-8" d="M611.53,315.44a2,2,0,1,1-2-2A2,2,0,0,1,611.53,315.44Z"/><path class="cls-9" d="M611.53,315.44a2,2,0,1,1-2-2A2,2,0,0,1,611.53,315.44Z"/><path class="cls-8" d="M613.82,314a2,2,0,1,1-2-2A2,2,0,0,1,613.82,314Z"/><path class="cls-9" d="M613.82,314a2,2,0,1,1-2-2A2,2,0,0,1,613.82,314Z"/><path class="cls-8" d="M616.09,312.44a2,2,0,1,1-2-2A2,2,0,0,1,616.09,312.44Z"/><path class="cls-9" d="M616.09,312.44a2,2,0,1,1-2-2A2,2,0,0,1,616.09,312.44Z"/><path class="cls-8" d="M618.38,310.76a2,2,0,1,1-2-2A2,2,0,0,1,618.38,310.76Z"/><path class="cls-9" d="M618.38,310.76a2,2,0,1,1-2-2A2,2,0,0,1,618.38,310.76Z"/><path class="cls-8" d="M620.65,309.08a2,2,0,1,1-2-2A2,2,0,0,1,620.65,309.08Z"/><path class="cls-9" d="M620.65,309.08a2,2,0,1,1-2-2A2,2,0,0,1,620.65,309.08Z"/><path class="cls-8" d="M622.94,307.16a2,2,0,1,1-2-2A2,2,0,0,1,622.94,307.16Z"/><path class="cls-9" d="M622.94,307.16a2,2,0,1,1-2-2A2,2,0,0,1,622.94,307.16Z"/><path class="cls-8" d="M625.21,305.24a2,2,0,1,1-2-2A2,2,0,0,1,625.21,305.24Z"/><path class="cls-9" d="M625.21,305.24a2,2,0,1,1-2-2A2,2,0,0,1,625.21,305.24Z"/><path class="cls-8" d="M627.5,303.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,303.2Z"/><path class="cls-9" d="M627.5,303.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,303.2Z"/><circle class="cls-8" cx="627.92" cy="300.92" r="1.98" transform="translate(-26.6 63.34) rotate(-5.65)"/><circle class="cls-9" cx="627.92" cy="300.92" r="1.98" transform="translate(-26.6 63.34) rotate(-5.65)"/><path class="cls-8" d="M632.18,298.64a2,2,0,1,1-2-2A2,2,0,0,1,632.18,298.64Z"/><path class="cls-9" d="M632.18,298.64a2,2,0,1,1-2-2A2,2,0,0,1,632.18,298.64Z"/><circle class="cls-8" cx="632.48" cy="296.12" r="1.98" transform="translate(-26.1 63.76) rotate(-5.65)"/><circle class="cls-9" cx="632.48" cy="296.12" r="1.98" transform="translate(-26.1 63.76) rotate(-5.65)"/><path class="cls-8" d="M636.74,293.36a2,2,0,1,1-2-2A2,2,0,0,1,636.74,293.36Z"/><path class="cls-9" d="M636.74,293.36a2,2,0,1,1-2-2A2,2,0,0,1,636.74,293.36Z"/><circle class="cls-8" cx="637.03" cy="290.48" r="1.98" transform="translate(-25.52 64.19) rotate(-5.65)"/><circle class="cls-9" cx="637.03" cy="290.48" r="1.98" transform="translate(-25.52 64.19) rotate(-5.65)"/><path class="cls-8" d="M641.3,287.48a2,2,0,1,1-2-2A2,2,0,0,1,641.3,287.48Z"/><path class="cls-9" d="M641.3,287.48a2,2,0,1,1-2-2A2,2,0,0,1,641.3,287.48Z"/><g class="cls-3"><polyline class="cls-10" points="54.95 363.24 57.26 363.13 59.53 363.01 61.81 362.89 64.09 362.77 66.38 362.65 68.66 362.54 71.06 362.42 73.33 362.3 75.61 362.18 77.89 362.06 80.17 361.81 82.45 361.69 84.73 361.45 87.02 361.33 89.3 361.1 91.58 360.98 93.86 360.74 96.25 360.5 98.53 360.38 100.81 360.13 103.09 359.89 105.38 359.77 107.66 359.54 109.94 359.3 112.22 359.06 114.5 358.81 116.78 358.69 119.17 358.45 121.45 358.21 123.73 357.98 126.02 357.74 128.29 357.5 130.57 357.38 132.85 357.13 135.13 356.89 137.41 356.65 139.69 356.42 142.09 356.18 144.38 355.94 146.66 355.69 148.94 355.57 151.22 355.33 153.5 355.1 155.78 354.86 158.06 354.62 160.34 354.38 162.62 354.13 165.01 353.89 167.29 353.77 169.57 353.54 171.85 353.3 174.13 353.06 176.41 352.81 178.69 352.69 180.97 352.45 183.25 352.21 185.53 351.98 187.94 351.74 190.22 351.5 192.5 351.38 194.78 351.13 197.06 350.89 199.34 350.65 201.62 350.54 203.9 350.3 206.18 350.06 208.46 349.81 210.85 349.57 213.13 349.45 215.41 349.21 217.69 348.98 219.97 348.86 222.25 348.62 224.53 348.38 226.81 348.25 229.09 348.01 231.38 347.77 233.78 347.54 236.06 347.42 238.34 347.18 240.62 346.94 242.9 346.81 245.18 346.57 247.46 346.45 249.74 346.21 252.01 345.98 254.29 345.86 256.69 345.62 258.98 345.38 261.25 345.25 263.54 345.01 265.81 344.89 268.1 344.65 270.38 344.42 272.65 344.3 274.94 343.94 277.21 343.57 279.62 343.21 281.89 342.86 284.18 342.5 286.45 342.13 288.74 341.77 291.01 341.42 293.3 341.06 295.57 340.69 297.86 340.33 300.13 339.98 302.42 339.62 304.81 339.38 307.1 339.01 309.38 338.65 311.65 338.3 313.94 337.94 316.21 337.57 318.5 337.21 320.77 336.86 323.06 336.62 325.33 336.25 327.74 335.89 330.01 335.54 332.3 335.06 334.57 334.57 336.86 334.1 339.13 333.62 341.42 333.13 343.69 332.65 345.98 332.18 348.25 331.69 350.65 331.21 352.94 330.74 355.21 330.25 357.5 329.77 359.77 329.3 362.06 328.81 364.33 328.33 366.62 327.86 368.89 327.38 371.18 326.77 373.57 326.18 375.86 325.57 378.13 324.98 380.42 324.38 382.69 323.77 384.98 323.18 387.25 322.57 389.54 321.98 391.81 321.38 394.1 320.89 396.5 320.3 398.77 319.69 401.06 318.98 403.33 318.25 405.62 317.54 407.89 316.94 410.18 316.21 412.45 315.5 414.74 314.77 417.01 314.18 419.42 313.45 421.69 312.74 423.98 312.01 426.25 311.18 428.54 310.33 430.81 309.62 433.1 308.77 435.38 308.06 437.65 307.33 439.94 306.5 442.33 305.77 444.62 304.94 446.89 304.1 449.18 303.13 451.45 302.3 453.74 301.45 456.01 300.62 458.3 299.77 460.57 298.81 462.86 297.98 465.25 297.01 467.54 296.06 469.81 295.21 472.1 294.25 474.38 293.3 476.65 292.21 478.94 291.25 481.21 290.3 483.5 289.33 485.77 288.25 488.06 287.3 490.45 286.21 492.74 285.13 495.01 284.06 497.3 282.98 499.57 281.89 501.86 280.69 504.13 279.62 506.42 278.54 508.69 277.33 510.98 276.13 513.38 274.94 515.65 273.74 517.93 272.54 520.22 271.21 522.5 269.89 524.77 268.69 527.05 267.38 529.34 265.94 531.62 264.62 533.89 263.18 536.29 261.74 538.58 260.3 540.86 258.74 543.13 257.3 545.41 255.62 547.7 253.94 549.98 252.25 552.25 250.57 554.53 248.78 556.82 246.85 559.22 244.94 561.5 242.9 563.77 240.97 566.05 238.69 568.34 236.53 570.62 234.25 572.89 231.85 575.17 229.34 577.46 226.69 579.74 223.94 582.13 221.18 584.41 218.06 586.7 214.94 588.98 211.57 591.25 208.22 593.53 204.38 595.82 200.53 598.1 196.46 600.38 192.13 602.65 187.46 605.05 182.66 607.34 177.38 609.62 171.97 611.89 166.09 614.17 159.97 616.46 153.5 618.74 146.66 621.01 139.22 623.29 131.53 625.58 123.14 627.98 114.25 630.25 104.89 632.53 94.81 634.82 83.89 637.1 72.5 639.38 60.2"/></g><circle class="cls-11" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-12" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-11" cx="57.25" cy="363.14" r="2.52"/><circle class="cls-12" cx="57.25" cy="363.14" r="2.52"/><circle class="cls-11" cx="59.53" cy="363.02" r="2.52"/><circle class="cls-12" cx="59.53" cy="363.02" r="2.52"/><circle class="cls-11" cx="61.82" cy="362.9" r="2.52"/><circle class="cls-12" cx="61.82" cy="362.9" r="2.52"/><circle class="cls-11" cx="64.09" cy="362.78" r="2.52"/><circle class="cls-12" cx="64.09" cy="362.78" r="2.52"/><circle class="cls-11" cx="66.37" cy="362.66" r="2.52"/><circle class="cls-12" cx="66.37" cy="362.66" r="2.52"/><circle class="cls-11" cx="68.65" cy="362.54" r="2.52"/><circle class="cls-12" cx="68.65" cy="362.54" r="2.52"/><circle class="cls-11" cx="71.06" cy="362.42" r="2.52"/><circle class="cls-12" cx="71.06" cy="362.42" r="2.52"/><circle class="cls-11" cx="73.34" cy="362.3" r="2.52"/><circle class="cls-12" cx="73.34" cy="362.3" r="2.52"/><circle class="cls-11" cx="75.62" cy="362.18" r="2.52"/><circle class="cls-12" cx="75.62" cy="362.18" r="2.52"/><circle class="cls-11" cx="77.9" cy="362.06" r="2.52"/><circle class="cls-12" cx="77.9" cy="362.06" r="2.52"/><circle class="cls-11" cx="80.17" cy="361.82" r="2.52"/><circle class="cls-12" cx="80.17" cy="361.82" r="2.52"/><circle class="cls-11" cx="82.45" cy="361.7" r="2.52"/><circle class="cls-12" cx="82.45" cy="361.7" r="2.52"/><circle class="cls-11" cx="84.73" cy="361.46" r="2.52"/><circle class="cls-12" cx="84.73" cy="361.46" r="2.52"/><circle class="cls-11" cx="87.01" cy="361.34" r="2.52" transform="translate(-281.14 412.32) rotate(-84.35)"/><circle class="cls-12" cx="87.01" cy="361.34" r="2.52" transform="translate(-281.14 412.32) rotate(-84.35)"/><circle class="cls-11" cx="89.29" cy="361.1" r="2.52" transform="translate(-278.84 414.37) rotate(-84.35)"/><circle class="cls-12" cx="89.29" cy="361.1" r="2.52" transform="translate(-278.84 414.37) rotate(-84.35)"/><circle class="cls-11" cx="91.57" cy="360.98" r="2.52" transform="translate(-276.67 416.53) rotate(-84.35)"/><circle class="cls-12" cx="91.57" cy="360.98" r="2.52" transform="translate(-276.67 416.53) rotate(-84.35)"/><circle class="cls-11" cx="93.86" cy="360.74" r="2.52"/><circle class="cls-12" cx="93.86" cy="360.74" r="2.52"/><circle class="cls-11" cx="96.26" cy="360.5" r="2.52"/><circle class="cls-12" cx="96.26" cy="360.5" r="2.52"/><circle class="cls-11" cx="98.53" cy="360.38" r="2.52"/><circle class="cls-12" cx="98.53" cy="360.38" r="2.52"/><circle class="cls-11" cx="100.81" cy="360.14" r="2.52"/><circle class="cls-12" cx="100.81" cy="360.14" r="2.52"/><circle class="cls-11" cx="103.09" cy="359.9" r="2.52"/><circle class="cls-12" cx="103.09" cy="359.9" r="2.52"/><circle class="cls-11" cx="105.38" cy="359.78" r="2.52"/><circle class="cls-12" cx="105.38" cy="359.78" r="2.52"/><circle class="cls-11" cx="107.65" cy="359.54" r="2.52"/><circle class="cls-12" cx="107.65" cy="359.54" r="2.52"/><circle class="cls-11" cx="109.93" cy="359.3" r="2.52" transform="translate(-258.44 433.29) rotate(-84.35)"/><circle class="cls-12" cx="109.93" cy="359.3" r="2.52" transform="translate(-258.44 433.29) rotate(-84.35)"/><circle class="cls-11" cx="112.21" cy="359.06" r="2.52" transform="translate(-256.15 435.34) rotate(-84.35)"/><circle class="cls-12" cx="112.21" cy="359.06" r="2.52" transform="translate(-256.15 435.34) rotate(-84.35)"/><circle class="cls-11" cx="114.5" cy="358.82" r="2.52"/><circle class="cls-12" cx="114.5" cy="358.82" r="2.52"/><circle class="cls-11" cx="116.78" cy="358.7" r="2.52"/><circle class="cls-12" cx="116.78" cy="358.7" r="2.52"/><circle class="cls-11" cx="119.17" cy="358.46" r="2.52"/><circle class="cls-12" cx="119.17" cy="358.46" r="2.52"/><circle class="cls-11" cx="121.45" cy="358.22" r="2.52"/><circle class="cls-12" cx="121.45" cy="358.22" r="2.52"/><circle class="cls-11" cx="123.73" cy="357.98" r="2.52"/><circle class="cls-12" cx="123.73" cy="357.98" r="2.52"/><circle class="cls-11" cx="126.01" cy="357.74" r="2.52" transform="translate(-242.4 447.89) rotate(-84.35)"/><circle class="cls-12" cx="126.01" cy="357.74" r="2.52" transform="translate(-242.4 447.89) rotate(-84.35)"/><circle class="cls-11" cx="128.29" cy="357.5" r="2.52"/><circle class="cls-12" cx="128.29" cy="357.5" r="2.52"/><circle class="cls-11" cx="130.57" cy="357.38" r="2.52" transform="translate(-237.93 452.1) rotate(-84.35)"/><circle class="cls-12" cx="130.57" cy="357.38" r="2.52" transform="translate(-237.93 452.1) rotate(-84.35)"/><circle class="cls-11" cx="132.86" cy="357.14" r="2.52"/><circle class="cls-12" cx="132.86" cy="357.14" r="2.52"/><circle class="cls-11" cx="135.14" cy="356.9" r="2.52"/><circle class="cls-12" cx="135.14" cy="356.9" r="2.52"/><circle class="cls-11" cx="137.42" cy="356.66" r="2.52"/><circle class="cls-12" cx="137.42" cy="356.66" r="2.52"/><circle class="cls-11" cx="139.69" cy="356.42" r="2.52"/><circle class="cls-12" cx="139.69" cy="356.42" r="2.52"/><circle class="cls-11" cx="142.09" cy="356.18" r="2.52"/><circle class="cls-12" cx="142.09" cy="356.18" r="2.52"/><circle class="cls-11" cx="144.38" cy="355.94" r="2.52"/><circle class="cls-12" cx="144.38" cy="355.94" r="2.52"/><circle class="cls-11" cx="146.65" cy="355.7" r="2.52" transform="translate(-221.76 466.59) rotate(-84.35)"/><circle class="cls-12" cx="146.65" cy="355.7" r="2.52" transform="translate(-221.76 466.59) rotate(-84.35)"/><circle class="cls-11" cx="148.93" cy="355.58" r="2.52"/><circle class="cls-12" cx="148.93" cy="355.58" r="2.52"/><circle class="cls-11" cx="151.21" cy="355.34" r="2.52" transform="translate(-217.29 470.8) rotate(-84.35)"/><circle class="cls-12" cx="151.21" cy="355.34" r="2.52" transform="translate(-217.29 470.8) rotate(-84.35)"/><circle class="cls-11" cx="153.5" cy="355.1" r="2.52"/><circle class="cls-12" cx="153.5" cy="355.1" r="2.52"/><circle class="cls-11" cx="155.78" cy="354.86" r="2.52"/><circle class="cls-12" cx="155.78" cy="354.86" r="2.52"/><circle class="cls-11" cx="158.06" cy="354.62" r="2.52"/><circle class="cls-12" cx="158.06" cy="354.62" r="2.52"/><circle class="cls-11" cx="160.34" cy="354.38" r="2.52"/><circle class="cls-12" cx="160.34" cy="354.38" r="2.52"/><circle class="cls-11" cx="162.62" cy="354.14" r="2.52"/><circle class="cls-12" cx="162.62" cy="354.14" r="2.52"/><circle class="cls-11" cx="165.01" cy="353.9" r="2.52"/><circle class="cls-12" cx="165.01" cy="353.9" r="2.52"/><circle class="cls-11" cx="167.29" cy="353.78" r="2.52"/><circle class="cls-12" cx="167.29" cy="353.78" r="2.52"/><circle class="cls-11" cx="169.57" cy="353.54" r="2.52"/><circle class="cls-12" cx="169.57" cy="353.54" r="2.52"/><circle class="cls-11" cx="171.86" cy="353.3" r="2.52"/><circle class="cls-12" cx="171.86" cy="353.3" r="2.52"/><circle class="cls-11" cx="174.14" cy="353.06" r="2.52"/><circle class="cls-12" cx="174.14" cy="353.06" r="2.52"/><circle class="cls-11" cx="176.42" cy="352.82" r="2.52"/><circle class="cls-12" cx="176.42" cy="352.82" r="2.52"/><circle class="cls-11" cx="178.69" cy="352.7" r="2.52"/><circle class="cls-12" cx="178.69" cy="352.7" r="2.52"/><circle class="cls-11" cx="180.98" cy="352.46" r="2.52"/><circle class="cls-12" cx="180.98" cy="352.46" r="2.52"/><circle class="cls-11" cx="183.26" cy="352.22" r="2.52"/><circle class="cls-12" cx="183.26" cy="352.22" r="2.52"/><circle class="cls-11" cx="185.53" cy="351.98" r="2.52"/><circle class="cls-12" cx="185.53" cy="351.98" r="2.52"/><circle class="cls-11" cx="187.93" cy="351.74" r="2.52"/><circle class="cls-12" cx="187.93" cy="351.74" r="2.52"/><circle class="cls-11" cx="190.21" cy="351.5" r="2.52" transform="translate(-195.58 455.86) rotate(-76.72)"/><circle class="cls-12" cx="190.21" cy="351.5" r="2.52" transform="translate(-195.58 455.86) rotate(-76.72)"/><circle class="cls-11" cx="192.5" cy="351.38" r="2.52"/><circle class="cls-12" cx="192.5" cy="351.38" r="2.52"/><circle class="cls-11" cx="194.78" cy="351.14" r="2.52"/><circle class="cls-12" cx="194.78" cy="351.14" r="2.52"/><circle class="cls-11" cx="197.05" cy="350.9" r="2.52"/><circle class="cls-12" cx="197.05" cy="350.9" r="2.52"/><circle class="cls-11" cx="199.33" cy="350.66" r="2.52"/><circle class="cls-12" cx="199.33" cy="350.66" r="2.52"/><circle class="cls-11" cx="201.61" cy="350.54" r="2.52"/><circle class="cls-12" cx="201.61" cy="350.54" r="2.52"/><circle class="cls-11" cx="203.89" cy="350.3" r="2.52"/><circle class="cls-12" cx="203.89" cy="350.3" r="2.52"/><circle class="cls-11" cx="206.18" cy="350.06" r="2.52" transform="translate(-162.49 520.73) rotate(-84.34)"/><circle class="cls-12" cx="206.18" cy="350.06" r="2.52" transform="translate(-162.49 520.73) rotate(-84.34)"/><circle class="cls-11" cx="208.46" cy="349.82" r="2.52"/><circle class="cls-12" cx="208.46" cy="349.82" r="2.52"/><circle class="cls-11" cx="210.85" cy="349.58" r="2.52" transform="translate(-53.27 38.29) rotate(-9.22)"/><circle class="cls-12" cx="210.85" cy="349.58" r="2.52" transform="translate(-53.27 38.29) rotate(-9.22)"/><circle class="cls-11" cx="213.13" cy="349.46" r="2.52"/><circle class="cls-12" cx="213.13" cy="349.46" r="2.52"/><circle class="cls-11" cx="215.41" cy="349.22" r="2.52" transform="translate(-53.16 39.02) rotate(-9.22)"/><circle class="cls-12" cx="215.41" cy="349.22" r="2.52" transform="translate(-53.16 39.02) rotate(-9.22)"/><circle class="cls-11" cx="217.7" cy="348.98" r="2.52"/><circle class="cls-12" cx="217.7" cy="348.98" r="2.52"/><circle class="cls-11" cx="219.98" cy="348.86" r="2.52" transform="translate(-159.61 510.11) rotate(-80.78)"/><circle class="cls-12" cx="219.98" cy="348.86" r="2.52" transform="translate(-159.61 510.11) rotate(-80.78)"/><circle class="cls-11" cx="222.26" cy="348.62" r="2.52"/><circle class="cls-12" cx="222.26" cy="348.62" r="2.52"/><circle class="cls-11" cx="224.53" cy="348.38" r="2.52"/><circle class="cls-12" cx="224.53" cy="348.38" r="2.52"/><circle class="cls-11" cx="226.81" cy="348.26" r="2.52"/><circle class="cls-12" cx="226.81" cy="348.26" r="2.52"/><circle class="cls-11" cx="229.09" cy="348.02" r="2.52"/><circle class="cls-12" cx="229.09" cy="348.02" r="2.52"/><circle class="cls-11" cx="231.37" cy="347.78" r="2.52"/><circle class="cls-12" cx="231.37" cy="347.78" r="2.52"/><circle class="cls-11" cx="233.77" cy="347.54" r="2.52"/><circle class="cls-12" cx="233.77" cy="347.54" r="2.52"/><circle class="cls-11" cx="236.05" cy="347.42" r="2.52"/><circle class="cls-12" cx="236.05" cy="347.42" r="2.52"/><circle class="cls-11" cx="238.33" cy="347.18" r="2.52"/><circle class="cls-12" cx="238.33" cy="347.18" r="2.52"/><circle class="cls-11" cx="240.61" cy="346.94" r="2.52"/><circle class="cls-12" cx="240.61" cy="346.94" r="2.52"/><circle class="cls-11" cx="242.89" cy="346.82" r="2.52"/><circle class="cls-12" cx="242.89" cy="346.82" r="2.52"/><circle class="cls-11" cx="245.18" cy="346.58" r="2.52" transform="translate(-52.35 43.75) rotate(-9.22)"/><circle class="cls-12" cx="245.18" cy="346.58" r="2.52" transform="translate(-52.35 43.75) rotate(-9.22)"/><circle class="cls-11" cx="247.46" cy="346.46" r="2.52"/><circle class="cls-12" cx="247.46" cy="346.46" r="2.52"/><circle class="cls-11" cx="249.74" cy="346.22" r="2.52" transform="translate(-52.23 44.47) rotate(-9.22)"/><circle class="cls-12" cx="249.74" cy="346.22" r="2.52" transform="translate(-52.23 44.47) rotate(-9.22)"/><circle class="cls-11" cx="252.02" cy="345.98" r="2.52" transform="translate(-117.11 562.67) rotate(-84.34)"/><circle class="cls-12" cx="252.02" cy="345.98" r="2.52" transform="translate(-117.11 562.67) rotate(-84.34)"/><circle class="cls-11" cx="254.3" cy="345.86" r="2.52" transform="translate(-52.12 45.2) rotate(-9.22)"/><circle class="cls-12" cx="254.3" cy="345.86" r="2.52" transform="translate(-52.12 45.2) rotate(-9.22)"/><circle class="cls-11" cx="256.7" cy="345.62" r="2.52"/><circle class="cls-12" cx="256.7" cy="345.62" r="2.52"/><circle class="cls-11" cx="258.98" cy="345.38" r="2.52"/><circle class="cls-12" cx="258.98" cy="345.38" r="2.52"/><circle class="cls-11" cx="261.26" cy="345.26" r="2.52"/><circle class="cls-12" cx="261.26" cy="345.26" r="2.52"/><circle class="cls-11" cx="263.53" cy="345.02" r="2.52"/><circle class="cls-12" cx="263.53" cy="345.02" r="2.52"/><circle class="cls-11" cx="265.81" cy="344.9" r="2.52"/><circle class="cls-12" cx="265.81" cy="344.9" r="2.52"/><circle class="cls-11" cx="268.09" cy="344.66" r="2.52"/><circle class="cls-12" cx="268.09" cy="344.66" r="2.52"/><circle class="cls-11" cx="270.37" cy="344.42" r="2.52"/><circle class="cls-12" cx="270.37" cy="344.42" r="2.52"/><circle class="cls-11" cx="272.66" cy="344.3" r="2.52"/><circle class="cls-12" cx="272.66" cy="344.3" r="2.52"/><circle class="cls-11" cx="274.94" cy="343.94" r="2.52"/><circle class="cls-12" cx="274.94" cy="343.94" r="2.52"/><circle class="cls-11" cx="277.22" cy="343.58" r="2.52"/><circle class="cls-12" cx="277.22" cy="343.58" r="2.52"/><circle class="cls-11" cx="279.61" cy="343.22" r="2.52"/><circle class="cls-12" cx="279.61" cy="343.22" r="2.52"/><circle class="cls-11" cx="281.89" cy="342.86" r="2.52"/><circle class="cls-12" cx="281.89" cy="342.86" r="2.52"/><circle class="cls-11" cx="284.18" cy="342.5" r="2.52" transform="translate(-51.19 49.94) rotate(-9.22)"/><circle class="cls-12" cx="284.18" cy="342.5" r="2.52" transform="translate(-51.19 49.94) rotate(-9.22)"/><circle class="cls-11" cx="286.46" cy="342.14" r="2.52" transform="translate(-51.11 50.3) rotate(-9.22)"/><circle class="cls-12" cx="286.46" cy="342.14" r="2.52" transform="translate(-51.11 50.3) rotate(-9.22)"/><circle class="cls-11" cx="288.74" cy="341.78" r="2.52" transform="translate(-79.83 595.43) rotate(-84.34)"/><circle class="cls-12" cx="288.74" cy="341.78" r="2.52" transform="translate(-79.83 595.43) rotate(-84.34)"/><circle class="cls-11" cx="291.02" cy="341.42" r="2.52" transform="translate(-77.41 597.37) rotate(-84.34)"/><circle class="cls-12" cx="291.02" cy="341.42" r="2.52" transform="translate(-77.41 597.37) rotate(-84.34)"/><circle class="cls-11" cx="293.3" cy="341.06" r="2.52" transform="translate(-75 599.32) rotate(-84.34)"/><circle class="cls-12" cx="293.3" cy="341.06" r="2.52" transform="translate(-75 599.32) rotate(-84.34)"/><circle class="cls-11" cx="295.58" cy="340.7" r="2.52" transform="translate(-72.59 601.26) rotate(-84.34)"/><circle class="cls-12" cx="295.58" cy="340.7" r="2.52" transform="translate(-72.59 601.26) rotate(-84.34)"/><circle class="cls-11" cx="297.85" cy="340.34" r="2.52" transform="translate(-70.17 603.2) rotate(-84.34)"/><circle class="cls-12" cx="297.85" cy="340.34" r="2.52" transform="translate(-70.17 603.2) rotate(-84.34)"/><circle class="cls-11" cx="300.13" cy="339.98" r="2.52" transform="translate(-67.76 605.15) rotate(-84.34)"/><circle class="cls-12" cx="300.13" cy="339.98" r="2.52" transform="translate(-67.76 605.15) rotate(-84.34)"/><circle class="cls-11" cx="302.41" cy="339.62" r="2.52" transform="translate(-65.35 607.09) rotate(-84.34)"/><circle class="cls-12" cx="302.41" cy="339.62" r="2.52" transform="translate(-65.35 607.09) rotate(-84.34)"/><circle class="cls-11" cx="304.81" cy="339.38" r="2.52"/><circle class="cls-12" cx="304.81" cy="339.38" r="2.52"/><circle class="cls-11" cx="307.09" cy="339.02" r="2.52"/><circle class="cls-12" cx="307.09" cy="339.02" r="2.52"/><circle class="cls-11" cx="309.37" cy="338.66" r="2.52"/><circle class="cls-12" cx="309.37" cy="338.66" r="2.52"/><circle class="cls-11" cx="311.66" cy="338.3" r="2.52"/><circle class="cls-12" cx="311.66" cy="338.3" r="2.52"/><circle class="cls-11" cx="313.94" cy="337.94" r="2.52"/><circle class="cls-12" cx="313.94" cy="337.94" r="2.52"/><circle class="cls-11" cx="316.22" cy="337.58" r="2.52"/><circle class="cls-12" cx="316.22" cy="337.58" r="2.52"/><circle class="cls-11" cx="318.49" cy="337.22" r="2.52"/><circle class="cls-12" cx="318.49" cy="337.22" r="2.52"/><circle class="cls-11" cx="320.77" cy="336.86" r="2.52"/><circle class="cls-12" cx="320.77" cy="336.86" r="2.52"/><circle class="cls-11" cx="323.05" cy="336.62" r="2.52"/><circle class="cls-12" cx="323.05" cy="336.62" r="2.52"/><circle class="cls-11" cx="325.33" cy="336.26" r="2.52"/><circle class="cls-12" cx="325.33" cy="336.26" r="2.52"/><circle class="cls-11" cx="327.74" cy="335.9" r="2.52"/><circle class="cls-12" cx="327.74" cy="335.9" r="2.52"/><circle class="cls-11" cx="330.02" cy="335.54" r="2.52"/><circle class="cls-12" cx="330.02" cy="335.54" r="2.52"/><circle class="cls-11" cx="332.3" cy="335.06" r="2.52" transform="translate(-33.87 632.72) rotate(-84.34)"/><circle class="cls-12" cx="332.3" cy="335.06" r="2.52" transform="translate(-33.87 632.72) rotate(-84.34)"/><circle class="cls-11" cx="334.58" cy="334.58" r="2.52" transform="translate(-49.27 57.91) rotate(-9.22)"/><circle class="cls-12" cx="334.58" cy="334.58" r="2.52" transform="translate(-49.27 57.91) rotate(-9.22)"/><circle class="cls-11" cx="336.85" cy="334.1" r="2.52"/><circle class="cls-12" cx="336.85" cy="334.1" r="2.52"/><circle class="cls-11" cx="339.13" cy="333.62" r="2.52" transform="translate(-26.27 638.23) rotate(-84.34)"/><circle class="cls-12" cx="339.13" cy="333.62" r="2.52" transform="translate(-26.27 638.23) rotate(-84.34)"/><circle class="cls-11" cx="341.41" cy="333.14" r="2.52" transform="translate(-48.95 58.99) rotate(-9.22)"/><circle class="cls-12" cx="341.41" cy="333.14" r="2.52" transform="translate(-48.95 58.99) rotate(-9.22)"/><circle class="cls-11" cx="343.7" cy="332.66" r="2.52"/><circle class="cls-12" cx="343.7" cy="332.66" r="2.52"/><circle class="cls-11" cx="345.98" cy="332.18" r="2.52"/><circle class="cls-12" cx="345.98" cy="332.18" r="2.52"/><circle class="cls-11" cx="348.26" cy="331.7" r="2.52"/><circle class="cls-12" cx="348.26" cy="331.7" r="2.52"/><circle class="cls-11" cx="350.66" cy="331.22" r="2.52"/><circle class="cls-12" cx="350.66" cy="331.22" r="2.52"/><circle class="cls-11" cx="352.94" cy="330.74" r="2.52"/><circle class="cls-12" cx="352.94" cy="330.74" r="2.52"/><circle class="cls-11" cx="355.22" cy="330.26" r="2.52"/><circle class="cls-12" cx="355.22" cy="330.26" r="2.52"/><circle class="cls-11" cx="357.49" cy="329.78" r="2.52"/><circle class="cls-12" cx="357.49" cy="329.78" r="2.52"/><circle class="cls-11" cx="359.77" cy="329.3" r="2.52"/><circle class="cls-12" cx="359.77" cy="329.3" r="2.52"/><circle class="cls-11" cx="362.05" cy="328.82" r="2.52"/><circle class="cls-12" cx="362.05" cy="328.82" r="2.52"/><circle class="cls-11" cx="364.33" cy="328.34" r="2.52"/><circle class="cls-12" cx="364.33" cy="328.34" r="2.52"/><circle class="cls-11" cx="366.61" cy="327.86" r="2.52"/><circle class="cls-12" cx="366.61" cy="327.86" r="2.52"/><circle class="cls-11" cx="368.89" cy="327.38" r="2.52"/><circle class="cls-12" cx="368.89" cy="327.38" r="2.52"/><circle class="cls-11" cx="371.18" cy="326.78" r="2.52" transform="translate(9.42 663.94) rotate(-84.34)"/><circle class="cls-12" cx="371.18" cy="326.78" r="2.52" transform="translate(9.42 663.94) rotate(-84.34)"/><circle class="cls-11" cx="373.58" cy="326.18" r="2.52"/><circle class="cls-12" cx="373.58" cy="326.18" r="2.52"/><circle class="cls-11" cx="375.85" cy="325.58" r="2.52" transform="translate(-47.3 64.41) rotate(-9.22)"/><circle class="cls-12" cx="375.85" cy="325.58" r="2.52" transform="translate(-47.3 64.41) rotate(-9.22)"/><circle class="cls-11" cx="378.13" cy="324.98" r="2.52" transform="translate(17.48 669.25) rotate(-84.34)"/><circle class="cls-12" cx="378.13" cy="324.98" r="2.52" transform="translate(17.48 669.25) rotate(-84.34)"/><circle class="cls-11" cx="380.41" cy="324.38" r="2.52"/><circle class="cls-12" cx="380.41" cy="324.38" r="2.52"/><circle class="cls-11" cx="382.7" cy="323.78" r="2.52"/><circle class="cls-12" cx="382.7" cy="323.78" r="2.52"/><circle class="cls-11" cx="384.98" cy="323.18" r="2.52"/><circle class="cls-12" cx="384.98" cy="323.18" r="2.52"/><circle class="cls-11" cx="387.26" cy="322.58" r="2.52" transform="translate(6.81 653.16) rotate(-80.78)"/><circle class="cls-12" cx="387.26" cy="322.58" r="2.52" transform="translate(6.81 653.16) rotate(-80.78)"/><circle class="cls-11" cx="389.53" cy="321.98" r="2.52"/><circle class="cls-12" cx="389.53" cy="321.98" r="2.52"/><circle class="cls-11" cx="391.81" cy="321.38" r="2.52"/><circle class="cls-12" cx="391.81" cy="321.38" r="2.52"/><circle class="cls-11" cx="394.09" cy="320.9" r="2.52"/><circle class="cls-12" cx="394.09" cy="320.9" r="2.52"/><circle class="cls-11" cx="396.49" cy="320.3" r="2.52"/><circle class="cls-12" cx="396.49" cy="320.3" r="2.52"/><circle class="cls-11" cx="398.77" cy="319.7" r="2.52"/><circle class="cls-12" cx="398.77" cy="319.7" r="2.52"/><circle class="cls-11" cx="401.05" cy="318.98" r="2.52"/><circle class="cls-12" cx="401.05" cy="318.98" r="2.52"/><circle class="cls-11" cx="403.33" cy="318.26" r="2.52"/><circle class="cls-12" cx="403.33" cy="318.26" r="2.52"/><circle class="cls-11" cx="405.61" cy="317.54" r="2.52"/><circle class="cls-12" cx="405.61" cy="317.54" r="2.52"/><circle class="cls-11" cx="407.89" cy="316.94" r="2.52"/><circle class="cls-12" cx="407.89" cy="316.94" r="2.52"/><circle class="cls-11" cx="410.18" cy="316.22" r="2.52" transform="translate(-45.36 69.79) rotate(-9.22)"/><circle class="cls-12" cx="410.18" cy="316.22" r="2.52" transform="translate(-45.36 69.79) rotate(-9.22)"/><circle class="cls-11" cx="412.46" cy="315.5" r="2.52" transform="translate(-45.21 70.14) rotate(-9.22)"/><circle class="cls-12" cx="412.46" cy="315.5" r="2.52" transform="translate(-45.21 70.14) rotate(-9.22)"/><circle class="cls-11" cx="414.74" cy="314.78" r="2.52" transform="translate(60.62 696.47) rotate(-84.34)"/><circle class="cls-12" cx="414.74" cy="314.78" r="2.52" transform="translate(60.62 696.47) rotate(-84.34)"/><circle class="cls-11" cx="417.02" cy="314.18" r="2.52"/><circle class="cls-12" cx="417.02" cy="314.18" r="2.52"/><circle class="cls-11" cx="419.41" cy="313.46" r="2.52"/><circle class="cls-12" cx="419.41" cy="313.46" r="2.52"/><circle class="cls-11" cx="421.7" cy="312.74" r="2.52"/><circle class="cls-12" cx="421.7" cy="312.74" r="2.52"/><circle class="cls-11" cx="423.98" cy="312.02" r="2.52"/><circle class="cls-12" cx="423.98" cy="312.02" r="2.52"/><circle class="cls-11" cx="426.26" cy="311.18" r="2.52"/><circle class="cls-12" cx="426.26" cy="311.18" r="2.52"/><circle class="cls-11" cx="428.53" cy="310.34" r="2.52"/><circle class="cls-12" cx="428.53" cy="310.34" r="2.52"/><circle class="cls-11" cx="430.81" cy="309.62" r="2.52"/><circle class="cls-12" cx="430.81" cy="309.62" r="2.52"/><circle class="cls-11" cx="433.09" cy="308.78" r="2.52"/><circle class="cls-12" cx="433.09" cy="308.78" r="2.52"/><circle class="cls-11" cx="435.37" cy="308.06" r="2.52"/><circle class="cls-12" cx="435.37" cy="308.06" r="2.52"/><circle class="cls-11" cx="437.66" cy="307.34" r="2.52"/><circle class="cls-12" cx="437.66" cy="307.34" r="2.52"/><circle class="cls-11" cx="439.94" cy="306.5" r="2.52"/><circle class="cls-12" cx="439.94" cy="306.5" r="2.52"/><circle class="cls-11" cx="442.33" cy="305.78" r="2.52"/><circle class="cls-12" cx="442.33" cy="305.78" r="2.52"/><circle class="cls-11" cx="444.61" cy="304.94" r="2.52"/><circle class="cls-12" cx="444.61" cy="304.94" r="2.52"/><circle class="cls-11" cx="446.89" cy="304.1" r="2.52"/><circle class="cls-12" cx="446.89" cy="304.1" r="2.52"/><path class="cls-11" d="M451.69,303.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,451.69,303.14Z"/><path class="cls-12" d="M451.69,303.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,451.69,303.14Z"/><circle class="cls-11" cx="451.46" cy="302.3" r="2.52" transform="translate(-42.59 76.22) rotate(-9.22)"/><circle class="cls-12" cx="451.46" cy="302.3" r="2.52" transform="translate(-42.59 76.22) rotate(-9.22)"/><path class="cls-11" d="M456.25,301.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,456.25,301.46Z"/><path class="cls-12" d="M456.25,301.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,456.25,301.46Z"/><path class="cls-11" d="M458.53,300.62A2.52,2.52,0,1,1,456,298.1,2.52,2.52,0,0,1,458.53,300.62Z"/><path class="cls-12" d="M458.53,300.62A2.52,2.52,0,1,1,456,298.1,2.52,2.52,0,0,1,458.53,300.62Z"/><circle class="cls-11" cx="458.3" cy="299.78" r="2.52" transform="translate(126.79 735.67) rotate(-85.93)"/><circle class="cls-12" cx="458.3" cy="299.78" r="2.52" transform="translate(126.79 735.67) rotate(-85.93)"/><path class="cls-11" d="M463.09,298.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,463.09,298.82Z"/><path class="cls-12" d="M463.09,298.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,463.09,298.82Z"/><circle class="cls-11" cx="462.86" cy="297.98" r="2.52" transform="translate(132.82 738.54) rotate(-85.93)"/><circle class="cls-12" cx="462.86" cy="297.98" r="2.52" transform="translate(132.82 738.54) rotate(-85.93)"/><path class="cls-11" d="M467.77,297a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,467.77,297Z"/><path class="cls-12" d="M467.77,297a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,467.77,297Z"/><circle class="cls-11" cx="467.53" cy="296.06" r="2.52" transform="translate(139.08 741.43) rotate(-85.93)"/><circle class="cls-12" cx="467.53" cy="296.06" r="2.52" transform="translate(139.08 741.43) rotate(-85.93)"/><path class="cls-11" d="M472.33,295.22a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,472.33,295.22Z"/><path class="cls-12" d="M472.33,295.22a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,472.33,295.22Z"/><circle class="cls-11" cx="472.09" cy="294.26" r="2.52"/><circle class="cls-12" cx="472.09" cy="294.26" r="2.52"/><path class="cls-11" d="M476.89,293.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,476.89,293.3Z"/><path class="cls-12" d="M476.89,293.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,476.89,293.3Z"/><circle class="cls-11" cx="476.65" cy="292.22" r="2.52" transform="translate(-26.47 48.39) rotate(-5.65)"/><circle class="cls-12" cx="476.65" cy="292.22" r="2.52" transform="translate(-26.47 48.39) rotate(-5.65)"/><path class="cls-11" d="M481.45,291.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,481.45,291.26Z"/><path class="cls-12" d="M481.45,291.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,481.45,291.26Z"/><circle class="cls-11" cx="481.21" cy="290.3" r="2.52" transform="translate(-26.26 48.83) rotate(-5.65)"/><circle class="cls-12" cx="481.21" cy="290.3" r="2.52" transform="translate(-26.26 48.83) rotate(-5.65)"/><path class="cls-11" d="M486,289.34a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,486,289.34Z"/><path class="cls-12" d="M486,289.34a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,486,289.34Z"/><circle class="cls-11" cx="485.77" cy="288.26" r="2.52"/><circle class="cls-12" cx="485.77" cy="288.26" r="2.52"/><path class="cls-11" d="M490.57,287.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,287.3Z"/><path class="cls-12" d="M490.57,287.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,287.3Z"/><path class="cls-11" d="M493,286.22a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,493,286.22Z"/><path class="cls-12" d="M493,286.22a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,493,286.22Z"/><path class="cls-11" d="M495.25,285.14a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,495.25,285.14Z"/><path class="cls-12" d="M495.25,285.14a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,495.25,285.14Z"/><path class="cls-11" d="M497.53,284.06a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,497.53,284.06Z"/><path class="cls-12" d="M497.53,284.06a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,497.53,284.06Z"/><circle class="cls-11" cx="497.3" cy="282.98" r="2.52" transform="translate(179.78 758.96) rotate(-85.93)"/><circle class="cls-12" cx="497.3" cy="282.98" r="2.52" transform="translate(179.78 758.96) rotate(-85.93)"/><path class="cls-11" d="M502.09,281.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,502.09,281.9Z"/><path class="cls-12" d="M502.09,281.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,502.09,281.9Z"/><circle class="cls-11" cx="501.86" cy="280.7" r="2.52" transform="translate(186.29 761.39) rotate(-85.93)"/><circle class="cls-12" cx="501.86" cy="280.7" r="2.52" transform="translate(186.29 761.39) rotate(-85.93)"/><path class="cls-11" d="M506.65,279.62a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,506.65,279.62Z"/><path class="cls-12" d="M506.65,279.62a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,506.65,279.62Z"/><circle class="cls-11" cx="506.42" cy="278.54" r="2.52"/><circle class="cls-12" cx="506.42" cy="278.54" r="2.52"/><path class="cls-11" d="M511.21,277.34a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,511.21,277.34Z"/><path class="cls-12" d="M511.21,277.34a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,511.21,277.34Z"/><circle class="cls-11" cx="510.98" cy="276.14" r="2.52" transform="translate(-24.72 51.69) rotate(-5.65)"/><circle class="cls-12" cx="510.98" cy="276.14" r="2.52" transform="translate(-24.72 51.69) rotate(-5.65)"/><path class="cls-11" d="M515.89,274.94a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,515.89,274.94Z"/><path class="cls-12" d="M515.89,274.94a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,515.89,274.94Z"/><circle class="cls-11" cx="515.65" cy="273.74" r="2.52"/><circle class="cls-12" cx="515.65" cy="273.74" r="2.52"/><path class="cls-11" d="M520.45,272.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,520.45,272.54Z"/><path class="cls-12" d="M520.45,272.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,520.45,272.54Z"/><circle class="cls-11" cx="520.21" cy="271.22" r="2.52" transform="translate(-24.19 52.58) rotate(-5.65)"/><circle class="cls-12" cx="520.21" cy="271.22" r="2.52" transform="translate(-24.19 52.58) rotate(-5.65)"/><path class="cls-11" d="M525,269.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,525,269.9Z"/><path class="cls-12" d="M525,269.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,525,269.9Z"/><circle class="cls-11" cx="524.77" cy="268.7" r="2.52" transform="translate(219.55 773.1) rotate(-85.93)"/><circle class="cls-12" cx="524.77" cy="268.7" r="2.52" transform="translate(219.55 773.1) rotate(-85.93)"/><path class="cls-11" d="M529.57,267.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,267.38Z"/><path class="cls-12" d="M529.57,267.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,267.38Z"/><path class="cls-11" d="M531.86,265.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,531.86,265.94Z"/><path class="cls-12" d="M531.86,265.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,531.86,265.94Z"/><path class="cls-11" d="M534.13,264.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,264.62Z"/><path class="cls-12" d="M534.13,264.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,264.62Z"/><path class="cls-11" d="M536.42,263.18a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,536.42,263.18Z"/><path class="cls-12" d="M536.42,263.18a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,536.42,263.18Z"/><circle class="cls-11" cx="536.3" cy="261.74" r="2.52"/><circle class="cls-12" cx="536.3" cy="261.74" r="2.52"/><path class="cls-11" d="M541.09,260.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,541.09,260.3Z"/><path class="cls-12" d="M541.09,260.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,541.09,260.3Z"/><circle class="cls-11" cx="540.86" cy="258.74" r="2.52"/><circle class="cls-12" cx="540.86" cy="258.74" r="2.52"/><path class="cls-11" d="M545.65,257.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,545.65,257.3Z"/><path class="cls-12" d="M545.65,257.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,545.65,257.3Z"/><circle class="cls-11" cx="545.42" cy="255.62" r="2.52" transform="translate(251.78 781.54) rotate(-85.93)"/><circle class="cls-12" cx="545.42" cy="255.62" r="2.52" transform="translate(251.78 781.54) rotate(-85.93)"/><path class="cls-11" d="M550.21,253.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,550.21,253.94Z"/><path class="cls-12" d="M550.21,253.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,550.21,253.94Z"/><circle class="cls-11" cx="549.98" cy="252.26" r="2.52"/><circle class="cls-12" cx="549.98" cy="252.26" r="2.52"/><path class="cls-11" d="M554.77,250.58a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,554.77,250.58Z"/><path class="cls-12" d="M554.77,250.58a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,554.77,250.58Z"/><circle class="cls-11" cx="554.53" cy="248.78" r="2.52" transform="translate(267.08 784.28) rotate(-85.93)"/><circle class="cls-12" cx="554.53" cy="248.78" r="2.52" transform="translate(267.08 784.28) rotate(-85.93)"/><path class="cls-11" d="M559.33,246.86a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,559.33,246.86Z"/><path class="cls-12" d="M559.33,246.86a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,559.33,246.86Z"/><circle class="cls-11" cx="559.21" cy="244.94" r="2.52" transform="translate(-21.41 56.3) rotate(-5.65)"/><circle class="cls-12" cx="559.21" cy="244.94" r="2.52" transform="translate(-21.41 56.3) rotate(-5.65)"/><path class="cls-11" d="M564,242.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,564,242.9Z"/><path class="cls-12" d="M564,242.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,564,242.9Z"/><circle class="cls-11" cx="563.77" cy="240.98" r="2.52" transform="translate(283.44 786.25) rotate(-85.93)"/><circle class="cls-12" cx="563.77" cy="240.98" r="2.52" transform="translate(283.44 786.25) rotate(-85.93)"/><path class="cls-11" d="M568.57,238.7a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,568.57,238.7Z"/><path class="cls-12" d="M568.57,238.7a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,568.57,238.7Z"/><path class="cls-11" d="M570.86,236.54a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,570.86,236.54Z"/><path class="cls-12" d="M570.86,236.54a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,570.86,236.54Z"/><path class="cls-11" d="M573.13,234.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,573.13,234.26Z"/><path class="cls-12" d="M573.13,234.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,573.13,234.26Z"/><path class="cls-11" d="M575.42,231.86a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,575.42,231.86Z"/><path class="cls-12" d="M575.42,231.86a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,575.42,231.86Z"/><path class="cls-11" d="M577.69,229.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,229.34Z"/><path class="cls-12" d="M577.69,229.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,229.34Z"/><path class="cls-11" d="M580,226.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,580,226.7Z"/><path class="cls-12" d="M580,226.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,580,226.7Z"/><path class="cls-11" d="M582.25,223.94a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,582.25,223.94Z"/><path class="cls-12" d="M582.25,223.94a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,582.25,223.94Z"/><path class="cls-11" d="M584.65,221.18a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,584.65,221.18Z"/><path class="cls-12" d="M584.65,221.18a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,584.65,221.18Z"/><circle class="cls-11" cx="584.42" cy="218.06" r="2.52" transform="translate(275.56 760) rotate(-80.78)"/><circle class="cls-12" cx="584.42" cy="218.06" r="2.52" transform="translate(275.56 760) rotate(-80.78)"/><path class="cls-11" d="M589.21,214.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,589.21,214.94Z"/><path class="cls-12" d="M589.21,214.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,589.21,214.94Z"/><circle class="cls-11" cx="588.98" cy="211.58" r="2.52" transform="translate(-36.13 241.5) rotate(-22.5)"/><circle class="cls-12" cx="588.98" cy="211.58" r="2.52" transform="translate(-36.13 241.5) rotate(-22.5)"/><path class="cls-11" d="M593.77,208.22a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,593.77,208.22Z"/><path class="cls-12" d="M593.77,208.22a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,593.77,208.22Z"/><circle class="cls-11" cx="593.53" cy="204.38" r="2.52" transform="translate(296.72 757.51) rotate(-80.78)"/><circle class="cls-12" cx="593.53" cy="204.38" r="2.52" transform="translate(296.72 757.51) rotate(-80.78)"/><path class="cls-11" d="M598.33,200.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,598.33,200.54Z"/><path class="cls-12" d="M598.33,200.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,598.33,200.54Z"/><circle class="cls-11" cx="598.09" cy="196.46" r="2.52"/><circle class="cls-12" cx="598.09" cy="196.46" r="2.52"/><path class="cls-11" d="M602.89,192.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,602.89,192.14Z"/><path class="cls-12" d="M602.89,192.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,602.89,192.14Z"/><circle class="cls-11" cx="602.65" cy="187.46" r="2.52"/><circle class="cls-12" cx="602.65" cy="187.46" r="2.52"/><path class="cls-11" d="M607.57,182.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,182.66Z"/><path class="cls-12" d="M607.57,182.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,182.66Z"/><path class="cls-11" d="M609.86,177.38a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,609.86,177.38Z"/><path class="cls-12" d="M609.86,177.38a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,609.86,177.38Z"/><path class="cls-11" d="M612.13,172a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,612.13,172Z"/><path class="cls-12" d="M612.13,172a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,612.13,172Z"/><path class="cls-11" d="M614.42,166.1a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,614.42,166.1Z"/><path class="cls-12" d="M614.42,166.1a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,614.42,166.1Z"/><path class="cls-11" d="M616.69,160a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,616.69,160Z"/><path class="cls-12" d="M616.69,160a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,616.69,160Z"/><path class="cls-11" d="M619,153.5a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,619,153.5Z"/><path class="cls-12" d="M619,153.5a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,619,153.5Z"/><path class="cls-11" d="M621.25,146.66a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,621.25,146.66Z"/><path class="cls-12" d="M621.25,146.66a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,621.25,146.66Z"/><path class="cls-11" d="M623.53,139.22A2.52,2.52,0,1,1,621,136.7,2.52,2.52,0,0,1,623.53,139.22Z"/><path class="cls-12" d="M623.53,139.22A2.52,2.52,0,1,1,621,136.7,2.52,2.52,0,0,1,623.53,139.22Z"/><circle class="cls-11" cx="623.3" cy="131.54" r="2.52" transform="translate(-2.89 248.54) rotate(-22.5)"/><circle class="cls-12" cx="623.3" cy="131.54" r="2.52" transform="translate(-2.89 248.54) rotate(-22.5)"/><path class="cls-11" d="M628.09,123.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,628.09,123.14Z"/><path class="cls-12" d="M628.09,123.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,628.09,123.14Z"/><circle class="cls-11" cx="627.98" cy="114.26" r="2.52" transform="translate(4.08 249.01) rotate(-22.5)"/><circle class="cls-12" cx="627.98" cy="114.26" r="2.52" transform="translate(4.08 249.01) rotate(-22.5)"/><path class="cls-11" d="M632.77,104.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,632.77,104.9Z"/><path class="cls-12" d="M632.77,104.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,632.77,104.9Z"/><circle class="cls-11" cx="632.53" cy="94.82" r="2.52" transform="translate(11.86 249.28) rotate(-22.5)"/><circle class="cls-12" cx="632.53" cy="94.82" r="2.52" transform="translate(11.86 249.28) rotate(-22.5)"/><path class="cls-11" d="M637.33,83.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,637.33,83.9Z"/><path class="cls-12" d="M637.33,83.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,637.33,83.9Z"/><circle class="cls-11" cx="637.09" cy="72.5" r="2.52" transform="translate(463.48 689.75) rotate(-80.78)"/><circle class="cls-12" cx="637.09" cy="72.5" r="2.52" transform="translate(463.48 689.75) rotate(-80.78)"/><path class="cls-11" d="M641.89,60.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,641.89,60.26Z"/><path class="cls-12" d="M641.89,60.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,641.89,60.26Z"/><g class="cls-13"><text class="cls-14" transform="translate(40.93 365.91)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(27.23 295.04)">5000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 224.17)">10000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 153.31)">15000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 82.43)">20000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 11.56)">25000</text></g><g class="cls-13"><text class="cls-14" transform="translate(52.7 377.63)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(165.03 377.63)">50</text></g><g class="cls-13"><text class="cls-14" transform="translate(277.35 377.63)">100</text></g><g class="cls-13"><text class="cls-14" transform="translate(391.99 377.63)">150</text></g><g class="cls-13"><text class="cls-14" transform="translate(506.58 377.63)">200</text></g><g class="cls-13"><text class="cls-14" transform="translate(621.19 377.63)">250</text></g><g class="cls-13"><text class="cls-15" transform="translate(17.56 199.35) rotate(-90)">Qstep</text></g><g class="cls-13"><text class="cls-15" transform="translate(325.64 386.9)">Q<tspan class="cls-16" x="11.53" y="0">_</tspan><tspan x="19.54" y="0">i</tspan><tspan class="cls-16" x="23.97" y="0">n</tspan><tspan class="cls-17" x="31.98" y="0">d</tspan><tspan x="40.01" y="0">ex</tspan></text></g><line class="cls-4" x1="477.71" y1="70.43" x2="496.92" y2="70.44"/><path class="cls-5" d="M489.13,70.28a2,2,0,1,1-2-2A2,2,0,0,1,489.13,70.28Z"/><path class="cls-18" d="M489.13,70.28a2,2,0,1,1-2-2A2,2,0,0,1,489.13,70.28Z"/><g class="cls-13"><text class="cls-19" transform="translate(499.04 74.83)"><tspan class="cls-20">8</tspan><tspan x="6.98" y="0">-</tspan><tspan class="cls-21" x="11.65" y="0">b</tspan><tspan class="cls-20" x="18.73" y="0">it</tspan><tspan class="cls-22" x="26.45" y="0"> </tspan><tspan class="cls-23" x="30.03" y="0">D</tspan><tspan x="40.11" y="0">C</tspan></text></g><line class="cls-7" x1="477.71" y1="91.78" x2="496.92" y2="91.78"/><path class="cls-8" d="M489.13,91.64a2,2,0,1,1-2-2A2,2,0,0,1,489.13,91.64Z"/><path class="cls-24" d="M489.13,91.64a2,2,0,1,1-2-2A2,2,0,0,1,489.13,91.64Z"/><g class="cls-13"><text class="cls-25" transform="translate(499.04 96.16)"><tspan class="cls-26">1</tspan><tspan class="cls-27" x="6.96" y="0">0</tspan><tspan x="14.01" y="0">-</tspan><tspan class="cls-28" x="18.69" y="0">b</tspan><tspan class="cls-29" x="25.65" y="0">i</tspan><tspan class="cls-30" x="29.5" y="0">t</tspan><tspan class="cls-31" x="33.45" y="0"> </tspan><tspan class="cls-32" x="36.97" y="0">D</tspan><tspan x="47.01" y="0">C</tspan></text></g><line class="cls-10" x1="477.71" y1="113.13" x2="496.92" y2="113.13"/><circle class="cls-11" cx="487.21" cy="113.06" r="2.52" transform="translate(297.58 575.87) rotate(-80.78)"/><circle class="cls-33" cx="487.21" cy="113.06" r="2.52" transform="translate(297.58 575.87) rotate(-80.78)"/><g class="cls-13"><text class="cls-19" transform="translate(499.04 117.52)"><tspan class="cls-20">1</tspan><tspan class="cls-34" x="6.98" y="0">2</tspan><tspan class="cls-35" x="14.03" y="0">-</tspan><tspan class="cls-36" x="18.7" y="0">bi</tspan><tspan class="cls-37" x="29.54" y="0">t</tspan><tspan class="cls-38" x="33.48" y="0"> </tspan><tspan class="cls-39" x="36.99" y="0">D</tspan><tspan x="47.05" y="0">C</tspan></text></g><rect class="cls-2" x="0.38" y="0.38" width="652.8" height="391.32"/></g></g></svg>
\ No newline at end of file
diff --git a/doc/img/scc_intrabc.svg b/doc/img/scc_intrabc.svg
new file mode 100644
index 0000000..dfe4948
--- /dev/null
+++ b/doc/img/scc_intrabc.svg
@@ -0,0 +1,348 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export scc_intrabc.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.4258in" height="2.9597in"
+		viewBox="0 0 390.657 213.098" xml:space="preserve" color-interpolation-filters="sRGB" class="st8">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#d8d8d8;stroke:#000000;stroke-width:0.25}
+		.st2 {fill:#fec000;stroke:#000000;stroke-width:0.25}
+		.st3 {fill:#00fefe;stroke:#000000;stroke-width:0.25}
+		.st4 {fill:#ffffff;stroke:#000000;stroke-width:0.25}
+		.st5 {fill:#ffc000;stroke:#000000;stroke-width:0.25}
+		.st6 {fill:none;stroke:none;stroke-width:0.25}
+		.st7 {fill:#4672c4;font-family:Calibri;font-size:0.666664em}
+		.st8 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+				v:shadowOffsetY="-4.25197"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.25,-141.982)">
+			<title>Sheet.1</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape3-3" v:mID="3" v:groupContext="shape" transform="translate(28.5965,-141.982)">
+			<title>Sheet.3</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape4-5" v:mID="4" v:groupContext="shape" transform="translate(56.9429,-141.982)">
+			<title>Sheet.4</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape5-7" v:mID="5" v:groupContext="shape" transform="translate(85.2894,-141.982)">
+			<title>Sheet.5</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape6-9" v:mID="6" v:groupContext="shape" transform="translate(113.636,-141.982)">
+			<title>Sheet.6</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape7-11" v:mID="7" v:groupContext="shape" transform="translate(141.982,-141.982)">
+			<title>Sheet.7</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape8-13" v:mID="8" v:groupContext="shape" transform="translate(170.329,-141.982)">
+			<title>Sheet.8</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape9-15" v:mID="9" v:groupContext="shape" transform="translate(198.675,-141.982)">
+			<title>Sheet.9</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape10-17" v:mID="10" v:groupContext="shape" transform="translate(0.25,-113.636)">
+			<title>Sheet.10</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape11-19" v:mID="11" v:groupContext="shape" transform="translate(28.5965,-113.636)">
+			<title>Sheet.11</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape12-21" v:mID="12" v:groupContext="shape" transform="translate(56.9429,-113.636)">
+			<title>Sheet.12</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape13-23" v:mID="13" v:groupContext="shape" transform="translate(85.2894,-113.636)">
+			<title>Sheet.13</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape14-25" v:mID="14" v:groupContext="shape" transform="translate(113.636,-113.636)">
+			<title>Sheet.14</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape15-27" v:mID="15" v:groupContext="shape" transform="translate(141.982,-113.636)">
+			<title>Sheet.15</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape16-29" v:mID="16" v:groupContext="shape" transform="translate(170.329,-113.636)">
+			<title>Sheet.16</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape17-31" v:mID="17" v:groupContext="shape" transform="translate(198.675,-113.636)">
+			<title>Sheet.17</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape18-33" v:mID="18" v:groupContext="shape" transform="translate(0.25,-85.2894)">
+			<title>Sheet.18</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape19-35" v:mID="19" v:groupContext="shape" transform="translate(28.5965,-85.2894)">
+			<title>Sheet.19</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape20-37" v:mID="20" v:groupContext="shape" transform="translate(56.9429,-85.2894)">
+			<title>Sheet.20</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape21-39" v:mID="21" v:groupContext="shape" transform="translate(85.2894,-85.2894)">
+			<title>Sheet.21</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape22-41" v:mID="22" v:groupContext="shape" transform="translate(113.636,-85.2894)">
+			<title>Sheet.22</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape23-43" v:mID="23" v:groupContext="shape" transform="translate(141.982,-85.2894)">
+			<title>Sheet.23</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape24-45" v:mID="24" v:groupContext="shape" transform="translate(170.329,-85.2894)">
+			<title>Sheet.24</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+		</g>
+		<g id="shape25-47" v:mID="25" v:groupContext="shape" transform="translate(198.675,-85.2894)">
+			<title>Sheet.25</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape26-49" v:mID="26" v:groupContext="shape" transform="translate(0.25,-56.9429)">
+			<title>Sheet.26</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape27-51" v:mID="27" v:groupContext="shape" transform="translate(28.5965,-56.9429)">
+			<title>Sheet.27</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape28-53" v:mID="28" v:groupContext="shape" transform="translate(56.9429,-56.9429)">
+			<title>Sheet.28</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape29-55" v:mID="29" v:groupContext="shape" transform="translate(85.2894,-56.9429)">
+			<title>Sheet.29</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape30-57" v:mID="30" v:groupContext="shape" transform="translate(113.636,-56.9429)">
+			<title>Sheet.30</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+		</g>
+		<g id="shape31-59" v:mID="31" v:groupContext="shape" transform="translate(141.982,-56.9429)">
+			<title>Sheet.31</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape32-61" v:mID="32" v:groupContext="shape" transform="translate(170.329,-56.9429)">
+			<title>Sheet.32</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape33-63" v:mID="33" v:groupContext="shape" transform="translate(198.675,-56.9429)">
+			<title>Sheet.33</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape34-65" v:mID="34" v:groupContext="shape" transform="translate(227.022,-141.982)">
+			<title>Sheet.34</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape35-67" v:mID="35" v:groupContext="shape" transform="translate(255.368,-141.982)">
+			<title>Sheet.35</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape36-69" v:mID="36" v:groupContext="shape" transform="translate(283.715,-141.982)">
+			<title>Sheet.36</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st5"/>
+		</g>
+		<g id="shape37-71" v:mID="37" v:groupContext="shape" transform="translate(312.061,-141.982)">
+			<title>Sheet.37</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape38-73" v:mID="38" v:groupContext="shape" transform="translate(227.022,-113.636)">
+			<title>Sheet.38</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+		</g>
+		<g id="shape39-75" v:mID="39" v:groupContext="shape" transform="translate(255.368,-113.636)">
+			<title>Sheet.39</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape40-77" v:mID="40" v:groupContext="shape" transform="translate(283.715,-113.636)">
+			<title>Sheet.40</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape41-79" v:mID="41" v:groupContext="shape" transform="translate(312.061,-113.636)">
+			<title>Sheet.41</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape42-81" v:mID="42" v:groupContext="shape" transform="translate(227.022,-85.2894)">
+			<title>Sheet.42</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape43-83" v:mID="43" v:groupContext="shape" transform="translate(255.368,-85.2894)">
+			<title>Sheet.43</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape44-85" v:mID="44" v:groupContext="shape" transform="translate(283.715,-85.2894)">
+			<title>Sheet.44</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape45-87" v:mID="45" v:groupContext="shape" transform="translate(312.061,-85.2894)">
+			<title>Sheet.45</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape46-89" v:mID="46" v:groupContext="shape" transform="translate(227.022,-56.9429)">
+			<title>Sheet.46</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape47-91" v:mID="47" v:groupContext="shape" transform="translate(255.368,-56.9429)">
+			<title>Sheet.47</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape48-93" v:mID="48" v:groupContext="shape" transform="translate(283.715,-56.9429)">
+			<title>Sheet.48</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape49-95" v:mID="49" v:groupContext="shape" transform="translate(312.061,-56.9429)">
+			<title>Sheet.49</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape50-97" v:mID="50" v:groupContext="shape" transform="translate(0.25,-28.5965)">
+			<title>Sheet.50</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape51-99" v:mID="51" v:groupContext="shape" transform="translate(28.5965,-28.5965)">
+			<title>Sheet.51</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape52-101" v:mID="52" v:groupContext="shape" transform="translate(56.9429,-28.5965)">
+			<title>Sheet.52</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+		</g>
+		<g id="shape53-103" v:mID="53" v:groupContext="shape" transform="translate(85.2894,-28.5965)">
+			<title>Sheet.53</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape54-105" v:mID="54" v:groupContext="shape" transform="translate(113.636,-28.5965)">
+			<title>Sheet.54</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape55-107" v:mID="55" v:groupContext="shape" transform="translate(141.982,-28.5965)">
+			<title>Sheet.55</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape56-109" v:mID="56" v:groupContext="shape" transform="translate(170.329,-28.5965)">
+			<title>Sheet.56</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape57-111" v:mID="57" v:groupContext="shape" transform="translate(198.675,-28.5965)">
+			<title>Sheet.57</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape58-113" v:mID="58" v:groupContext="shape" transform="translate(227.022,-28.5965)">
+			<title>Sheet.58</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape59-115" v:mID="59" v:groupContext="shape" transform="translate(255.368,-28.5965)">
+			<title>Sheet.59</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape60-117" v:mID="60" v:groupContext="shape" transform="translate(283.715,-28.5965)">
+			<title>Sheet.60</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape61-119" v:mID="61" v:groupContext="shape" transform="translate(312.061,-28.5965)">
+			<title>Sheet.61</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape62-121" v:mID="62" v:groupContext="shape" transform="translate(0.25,-0.25)">
+			<title>Sheet.62</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape63-123" v:mID="63" v:groupContext="shape" transform="translate(28.5965,-0.25)">
+			<title>Sheet.63</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape64-125" v:mID="64" v:groupContext="shape" transform="translate(56.9429,-0.25)">
+			<title>Sheet.64</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape65-127" v:mID="65" v:groupContext="shape" transform="translate(85.2894,-0.25)">
+			<title>Sheet.65</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape66-129" v:mID="66" v:groupContext="shape" transform="translate(113.636,-0.25)">
+			<title>Sheet.66</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape67-131" v:mID="67" v:groupContext="shape" transform="translate(141.982,-0.25)">
+			<title>Sheet.67</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape68-133" v:mID="68" v:groupContext="shape" transform="translate(170.329,-0.25)">
+			<title>Sheet.68</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape69-135" v:mID="69" v:groupContext="shape" transform="translate(198.675,-0.25)">
+			<title>Sheet.69</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape70-137" v:mID="70" v:groupContext="shape" transform="translate(227.022,-0.25)">
+			<title>Sheet.70</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape71-139" v:mID="71" v:groupContext="shape" transform="translate(255.368,-0.25)">
+			<title>Sheet.71</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape72-141" v:mID="72" v:groupContext="shape" transform="translate(283.715,-0.25)">
+			<title>Sheet.72</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape73-143" v:mID="73" v:groupContext="shape" transform="translate(312.061,-0.25)">
+			<title>Sheet.73</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape74-145" v:mID="74" v:groupContext="shape" transform="translate(0.25,-184.502)">
+			<title>Sheet.74</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape75-147" v:mID="75" v:groupContext="shape" transform="translate(255.368,-184.502)">
+			<title>Sheet.75</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st5"/>
+		</g>
+		<g id="shape76-149" v:mID="76" v:groupContext="shape" transform="translate(127.809,-184.502)">
+			<title>Sheet.76</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape79-151" v:mID="79" v:groupContext="shape" transform="translate(27.8091,-193.762)">
+			<title>Sheet.79</title>
+			<desc>Current processing block</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="50" cy="207.098" width="100" height="12"/>
+			<rect x="0" y="201.098" width="100" height="12" class="st6"/>
+			<text x="9.78" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Current processing block</text>		</g>
+		<g id="shape80-154" v:mID="80" v:groupContext="shape" transform="translate(158.899,-192.675)">
+			<title>Sheet.80</title>
+			<desc>Allowed prediction block</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="50" cy="207.098" width="100" height="12"/>
+			<rect x="0" y="201.098" width="100" height="12" class="st6"/>
+			<text x="9.68" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Allowed prediction block</text>		</g>
+		<g id="shape81-157" v:mID="81" v:groupContext="shape" transform="translate(290.407,-192.675)">
+			<title>Sheet.81</title>
+			<desc>Restricted immediate blocks</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="50" cy="207.098" width="100" height="12"/>
+			<rect x="0" y="201.098" width="100" height="12" class="st6"/>
+			<text x="3.92" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Restricted immediate blocks</text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/secondary_tap.svg b/doc/img/secondary_tap.svg
new file mode 100644
index 0000000..4c6283d
--- /dev/null
+++ b/doc/img/secondary_tap.svg
@@ -0,0 +1,857 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export secondary_tap.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="11.2533in" height="3.38188in"
+		viewBox="0 0 810.24 243.495" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:#000000;font-family:Calibri;font-size:1.00001em}
+		.st3 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st4 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st5 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+		.st6 {font-size:1em;font-style:normal}
+		.st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.12,-189.375)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(54.12,-189.375)">
+			<title>Square.2</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(90.12,-189.375)">
+			<title>Square.3</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape4-8" v:mID="4" v:groupContext="shape" transform="translate(126.12,-189.375)">
+			<title>Square.4</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape5-10" v:mID="5" v:groupContext="shape" transform="translate(162.12,-189.375)">
+			<title>Square.5</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape6-12" v:mID="6" v:groupContext="shape" transform="translate(18.12,-153.375)">
+			<title>Square.6</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape7-14" v:mID="7" v:groupContext="shape" transform="translate(54.12,-153.375)">
+			<title>Square.7</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape8-16" v:mID="8" v:groupContext="shape" transform="translate(90.12,-153.375)">
+			<title>Square.8</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape9-19" v:mID="9" v:groupContext="shape" transform="translate(126.12,-153.375)">
+			<title>Square.9</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape10-21" v:mID="10" v:groupContext="shape" transform="translate(162.12,-153.375)">
+			<title>Square.10</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape11-23" v:mID="11" v:groupContext="shape" transform="translate(18.12,-117.375)">
+			<title>Square.11</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape12-26" v:mID="12" v:groupContext="shape" transform="translate(54.12,-117.375)">
+			<title>Square.12</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape13-29" v:mID="13" v:groupContext="shape" transform="translate(90.12,-117.375)">
+			<title>Square.13</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st3"/>
+		</g>
+		<g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(126.12,-117.375)">
+			<title>Square.14</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape15-34" v:mID="15" v:groupContext="shape" transform="translate(162.12,-117.375)">
+			<title>Square.15</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape16-37" v:mID="16" v:groupContext="shape" transform="translate(18.12,-81.375)">
+			<title>Square.16</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape17-39" v:mID="17" v:groupContext="shape" transform="translate(54.12,-81.375)">
+			<title>Square.17</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape18-41" v:mID="18" v:groupContext="shape" transform="translate(90.12,-81.375)">
+			<title>Square.18</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape19-44" v:mID="19" v:groupContext="shape" transform="translate(126.12,-81.375)">
+			<title>Square.19</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape20-46" v:mID="20" v:groupContext="shape" transform="translate(162.12,-81.375)">
+			<title>Square.20</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape21-48" v:mID="21" v:groupContext="shape" transform="translate(18.12,-45.375)">
+			<title>Square.21</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape22-50" v:mID="22" v:groupContext="shape" transform="translate(54.12,-45.375)">
+			<title>Square.22</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape23-52" v:mID="23" v:groupContext="shape" transform="translate(90.12,-45.375)">
+			<title>Square.23</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape24-55" v:mID="24" v:groupContext="shape" transform="translate(126.12,-45.375)">
+			<title>Square.24</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape25-57" v:mID="25" v:groupContext="shape" transform="translate(162.12,-45.375)">
+			<title>Square.25</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape30-59" v:mID="30" v:groupContext="shape" transform="translate(216.12,-189.375)">
+			<title>Square.30</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape31-61" v:mID="31" v:groupContext="shape" transform="translate(252.12,-189.375)">
+			<title>Square.31</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape32-63" v:mID="32" v:groupContext="shape" transform="translate(288.12,-189.375)">
+			<title>Square.32</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape33-65" v:mID="33" v:groupContext="shape" transform="translate(324.12,-189.375)">
+			<title>Square.33</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape34-68" v:mID="34" v:groupContext="shape" transform="translate(360.12,-189.375)">
+			<title>Square.34</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape35-70" v:mID="35" v:groupContext="shape" transform="translate(216.12,-153.375)">
+			<title>Square.35</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape36-73" v:mID="36" v:groupContext="shape" transform="translate(252.12,-153.375)">
+			<title>Square.36</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape37-75" v:mID="37" v:groupContext="shape" transform="translate(288.12,-153.375)">
+			<title>Square.37</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape38-78" v:mID="38" v:groupContext="shape" transform="translate(324.12,-153.375)">
+			<title>Square.38</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape39-80" v:mID="39" v:groupContext="shape" transform="translate(360.12,-153.375)">
+			<title>Square.39</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape40-82" v:mID="40" v:groupContext="shape" transform="translate(216.12,-117.375)">
+			<title>Square.40</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape41-84" v:mID="41" v:groupContext="shape" transform="translate(252.12,-117.375)">
+			<title>Square.41</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape42-87" v:mID="42" v:groupContext="shape" transform="translate(288.12,-117.375)">
+			<title>Square.42</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st3"/>
+		</g>
+		<g id="shape43-89" v:mID="43" v:groupContext="shape" transform="translate(324.12,-117.375)">
+			<title>Square.43</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape44-92" v:mID="44" v:groupContext="shape" transform="translate(360.12,-117.375)">
+			<title>Square.44</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape45-94" v:mID="45" v:groupContext="shape" transform="translate(216.12,-81.375)">
+			<title>Square.45</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape46-96" v:mID="46" v:groupContext="shape" transform="translate(252.12,-81.375)">
+			<title>Square.46</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape47-98" v:mID="47" v:groupContext="shape" transform="translate(288.12,-81.3749)">
+			<title>Square.47</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape48-101" v:mID="48" v:groupContext="shape" transform="translate(324.12,-81.3749)">
+			<title>Square.48</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape49-103" v:mID="49" v:groupContext="shape" transform="translate(360.12,-81.3749)">
+			<title>Square.49</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape50-106" v:mID="50" v:groupContext="shape" transform="translate(216.12,-45.375)">
+			<title>Square.50</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape51-108" v:mID="51" v:groupContext="shape" transform="translate(252.12,-45.375)">
+			<title>Square.51</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape52-111" v:mID="52" v:groupContext="shape" transform="translate(288.12,-45.375)">
+			<title>Square.52</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape53-113" v:mID="53" v:groupContext="shape" transform="translate(324.12,-45.375)">
+			<title>Square.53</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape54-115" v:mID="54" v:groupContext="shape" transform="translate(360.12,-45.375)">
+			<title>Square.54</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape55-117" v:mID="55" v:groupContext="shape" transform="translate(414.12,-189.375)">
+			<title>Square.55</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape56-120" v:mID="56" v:groupContext="shape" transform="translate(450.12,-189.375)">
+			<title>Square.56</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape57-122" v:mID="57" v:groupContext="shape" transform="translate(486.12,-189.375)">
+			<title>Square.57</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape58-124" v:mID="58" v:groupContext="shape" transform="translate(522.12,-189.375)">
+			<title>Square.58</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape59-126" v:mID="59" v:groupContext="shape" transform="translate(558.12,-189.375)">
+			<title>Square.59</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape60-129" v:mID="60" v:groupContext="shape" transform="translate(414.12,-153.375)">
+			<title>Square.60</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape61-131" v:mID="61" v:groupContext="shape" transform="translate(450.12,-153.375)">
+			<title>Square.61</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape62-134" v:mID="62" v:groupContext="shape" transform="translate(486.12,-153.375)">
+			<title>Square.62</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape63-136" v:mID="63" v:groupContext="shape" transform="translate(522.12,-153.375)">
+			<title>Square.63</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape64-139" v:mID="64" v:groupContext="shape" transform="translate(558.12,-153.375)">
+			<title>Square.64</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape65-141" v:mID="65" v:groupContext="shape" transform="translate(414.12,-117.375)">
+			<title>Square.65</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape66-143" v:mID="66" v:groupContext="shape" transform="translate(450.12,-117.375)">
+			<title>Square.66</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape67-145" v:mID="67" v:groupContext="shape" transform="translate(486.12,-117.375)">
+			<title>Square.67</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st3"/>
+		</g>
+		<g id="shape68-147" v:mID="68" v:groupContext="shape" transform="translate(522.12,-117.375)">
+			<title>Square.68</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape69-149" v:mID="69" v:groupContext="shape" transform="translate(558.12,-117.375)">
+			<title>Square.69</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape70-151" v:mID="70" v:groupContext="shape" transform="translate(414.12,-81.375)">
+			<title>Square.70</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape71-153" v:mID="71" v:groupContext="shape" transform="translate(450.12,-81.375)">
+			<title>Square.71</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape72-156" v:mID="72" v:groupContext="shape" transform="translate(486.12,-81.3749)">
+			<title>Square.72</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape73-158" v:mID="73" v:groupContext="shape" transform="translate(522.12,-81.3749)">
+			<title>Square.73</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape74-161" v:mID="74" v:groupContext="shape" transform="translate(558.12,-81.3749)">
+			<title>Square.74</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape75-163" v:mID="75" v:groupContext="shape" transform="translate(414.12,-45.375)">
+			<title>Square.75</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape76-166" v:mID="76" v:groupContext="shape" transform="translate(450.12,-45.375)">
+			<title>Square.76</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape77-168" v:mID="77" v:groupContext="shape" transform="translate(486.12,-45.375)">
+			<title>Square.77</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape78-170" v:mID="78" v:groupContext="shape" transform="translate(522.12,-45.375)">
+			<title>Square.78</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape79-172" v:mID="79" v:groupContext="shape" transform="translate(558.12,-45.375)">
+			<title>Square.79</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape80-175" v:mID="80" v:groupContext="shape" transform="translate(612.12,-189.375)">
+			<title>Square.80</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape81-177" v:mID="81" v:groupContext="shape" transform="translate(648.12,-189.375)">
+			<title>Square.81</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape82-180" v:mID="82" v:groupContext="shape" transform="translate(684.12,-189.375)">
+			<title>Square.82</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape83-182" v:mID="83" v:groupContext="shape" transform="translate(720.12,-189.375)">
+			<title>Square.83</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape84-184" v:mID="84" v:groupContext="shape" transform="translate(756.12,-189.375)">
+			<title>Square.84</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape85-186" v:mID="85" v:groupContext="shape" transform="translate(612.12,-153.375)">
+			<title>Square.85</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape86-188" v:mID="86" v:groupContext="shape" transform="translate(648.12,-153.375)">
+			<title>Square.86</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape87-190" v:mID="87" v:groupContext="shape" transform="translate(684.12,-153.375)">
+			<title>Square.87</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape88-193" v:mID="88" v:groupContext="shape" transform="translate(720.12,-153.375)">
+			<title>Square.88</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape89-195" v:mID="89" v:groupContext="shape" transform="translate(756.12,-153.375)">
+			<title>Square.89</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape90-198" v:mID="90" v:groupContext="shape" transform="translate(612.12,-117.375)">
+			<title>Square.90</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape91-200" v:mID="91" v:groupContext="shape" transform="translate(648.12,-117.375)">
+			<title>Square.91</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape92-203" v:mID="92" v:groupContext="shape" transform="translate(684.12,-117.375)">
+			<title>Square.92</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st3"/>
+		</g>
+		<g id="shape93-205" v:mID="93" v:groupContext="shape" transform="translate(720.12,-117.375)">
+			<title>Square.93</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape94-208" v:mID="94" v:groupContext="shape" transform="translate(756.12,-117.375)">
+			<title>Square.94</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape95-210" v:mID="95" v:groupContext="shape" transform="translate(612.12,-81.375)">
+			<title>Square.95</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape96-213" v:mID="96" v:groupContext="shape" transform="translate(648.12,-81.375)">
+			<title>Square.96</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape97-215" v:mID="97" v:groupContext="shape" transform="translate(684.12,-81.3749)">
+			<title>Square.97</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape98-218" v:mID="98" v:groupContext="shape" transform="translate(720.12,-81.3749)">
+			<title>Square.98</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape99-220" v:mID="99" v:groupContext="shape" transform="translate(756.12,-81.3749)">
+			<title>Square.99</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape100-222" v:mID="100" v:groupContext="shape" transform="translate(612.12,-45.375)">
+			<title>Square.100</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape101-224" v:mID="101" v:groupContext="shape" transform="translate(648.12,-45.375)">
+			<title>Square.101</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape102-226" v:mID="102" v:groupContext="shape" transform="translate(684.12,-45.375)">
+			<title>Square.102</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape103-228" v:mID="103" v:groupContext="shape" transform="translate(720.12,-45.375)">
+			<title>Square.103</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape104-231" v:mID="104" v:groupContext="shape" transform="translate(756.12,-45.375)">
+			<title>Square.104</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape236-233" v:mID="236" v:groupContext="shape" transform="translate(54.12,-18.375)">
+			<title>Sheet.236</title>
+			<desc>d = 0, 4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="229.995" width="108" height="27"/>
+			<rect x="0" y="216.495" width="108" height="27" class="st4"/>
+			<text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+						class="st6">= 0, 4</tspan></text>		</g>
+		<g id="shape237-238" v:mID="237" v:groupContext="shape" transform="translate(252.12,-18.375)">
+			<title>Sheet.237</title>
+			<desc>d = 1, 5</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="229.995" width="108" height="27"/>
+			<rect x="0" y="216.495" width="108" height="27" class="st4"/>
+			<text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+						class="st6">= 1, 5</tspan></text>		</g>
+		<g id="shape238-243" v:mID="238" v:groupContext="shape" transform="translate(450.12,-18.375)">
+			<title>Sheet.238</title>
+			<desc>d = 2, 6</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="229.995" width="108" height="27"/>
+			<rect x="0" y="216.495" width="108" height="27" class="st4"/>
+			<text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+						class="st6">= 2, 6</tspan></text>		</g>
+		<g id="shape239-248" v:mID="239" v:groupContext="shape" transform="translate(648.12,-18.375)">
+			<title>Sheet.239</title>
+			<desc>d = 3, 7</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="229.995" width="108" height="27"/>
+			<rect x="0" y="216.495" width="108" height="27" class="st4"/>
+			<text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+						class="st6">= 3, 7</tspan></text>		</g>
+	</g>
+</svg>
diff --git a/doc/img/tx_basis.svg b/doc/img/tx_basis.svg
new file mode 100644
index 0000000..eb27b03
--- /dev/null
+++ b/doc/img/tx_basis.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 381.36 266.69"><defs><style>.cls-1,.cls-21{fill:none;}.cls-2{clip-path:url(#clip-path);}.cls-3{fill:#ddebf7;}.cls-4{clip-path:url(#clip-path-2);}.cls-15,.cls-24,.cls-5{font-size:11.04px;}.cls-5{font-family:Calibri, Calibri;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{font-size:7.32px;}.cls-14,.cls-15{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{letter-spacing:0.01em;}.cls-19{letter-spacing:0.01em;}.cls-20{clip-path:url(#clip-path-13);}.cls-21{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}.cls-22{clip-path:url(#clip-path-14);}.cls-23{clip-path:url(#clip-path-15);}.cls-24,.cls-25,.cls-33{font-family:CambriaMath, Cambria Math;}.cls-25{font-size:8.04px;}.cls-26{fill-rule:evenodd;}.cls-27{letter-spacing:0em;}.cls-28{clip-path:url(#clip-path-35);}.cls-29{clip-path:url(#clip-path-47);}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:-0.01em;}.cls-32{clip-path:url(#clip-path-98);}.cls-33{font-size:11.06px;}</style><clipPath id="clip-path" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="1.92" y="1.92" width="381.22" height="594.46"/></clipPath><clipPath id="clip-path-2" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="1.92" y="339.53" width="381.22" height="17.16"/></clipPath><clipPath id="clip-path-13" transform="translate(-1.43 -338.09)"><rect class="cls-1" width="385.18" height="598.42"/></clipPath><clipPath id="clip-path-14" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="132.72" y="370.06" width="181.68" height="53.04"/></clipPath><clipPath id="clip-path-15" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="0.96" y="0.94" width="382.08" height="595.32"/></clipPath><clipPath id="clip-path-35" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="137.4" y="411.46" width="131.16" height="27.24"/></clipPath><clipPath id="clip-path-47" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="131.52" y="451.78" width="194.28" height="41.52"/></clipPath><clipPath id="clip-path-98" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="134.4" y="566.98" width="105.84" height="18.36"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><g class="cls-2"><rect class="cls-3" x="0.01" y="0.96" width="381.34" height="18.24"/></g><g class="cls-4"><text class="cls-5" transform="translate(21.49 13.8)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan x="46.53" y="0"> </tspan><tspan class="cls-10" x="49.02" y="0">T</tspan><tspan class="cls-7" x="54.42" y="0">y</tspan><tspan class="cls-6" x="59.46" y="0">p</tspan><tspan x="65.22" y="0">e</tspan></text></g><g class="cls-4"><text class="cls-5" transform="translate(164.19 13.2)">Basis f<tspan class="cls-11" x="28.33" y="0">u</tspan><tspan class="cls-6" x="34.08" y="0">n</tspan><tspan class="cls-8" x="39.85" y="0">cti</tspan><tspan class="cls-12" x="50.67" y="0">o</tspan><tspan class="cls-13" x="56.55" y="0">n</tspan><tspan class="cls-8" x="62.31" y="0" xml:space="preserve"> T</tspan></text></g><g class="cls-4"><text class="cls-14" transform="translate(234.54 15.36)">i</text></g><g class="cls-4"><text class="cls-5" transform="translate(238.26 13.2)">(</text></g><g class="cls-4"><text class="cls-15" transform="translate(241.62 13.2)">j</text></g><g class="cls-4"><text class="cls-5" transform="translate(246.3 13.2)">), </text></g><g class="cls-4"><text class="cls-15" transform="translate(254.94 13.2)">i</text></g><g class="cls-4"><text class="cls-5" transform="translate(259.5 13.2)">, </text></g><g class="cls-4"><text class="cls-15" transform="translate(264.78 13.2)">j</text></g><g class="cls-4"><text class="cls-5" transform="translate(269.46 13.2)"> <tspan class="cls-16" x="2.5" y="0">=</tspan><tspan class="cls-17" x="8.02" y="0"> </tspan><tspan class="cls-18" x="10.51" y="0">0</tspan><tspan class="cls-17" x="16.16" y="0">, </tspan><tspan class="cls-18" x="21.41" y="0">1</tspan><tspan class="cls-17" x="27.06" y="0">, </tspan><tspan class="cls-19" x="32.31" y="0">…</tspan><tspan x="40" y="0">, N</tspan><tspan class="cls-11" x="52.38" y="0">-</tspan><tspan class="cls-17" x="55.72" y="0">1</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(43.69 67.44)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">CT-2</tspan></text><text class="cls-5" transform="translate(44.17 137.55)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">ST-4</tspan></text><text class="cls-5" transform="translate(44.17 188.67)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">ST-7</tspan></text><text class="cls-5" transform="translate(49.81 237.99)">IDT</text></g><g class="cls-20"><line class="cls-21" x1="113.25" y1="1.98" x2="113.25" y2="17.58"/><rect x="113.19" y="1.92" width="0.96" height="15.72"/><line class="cls-21" x1="113.25" y1="20.58" x2="113.25" y2="256.79"/><rect x="113.19" y="20.52" width="0.96" height="236.33"/><rect x="0.01" width="381.34" height="1.92"/><rect x="0.01" y="17.64" width="381.34" height="0.96"/><rect x="0.01" y="19.56" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="107.97" x2="381.29" y2="107.97"/><rect x="0.01" y="107.91" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="158.73" x2="381.29" y2="158.73"/><rect x="0.01" y="158.67" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="210.21" x2="381.29" y2="210.21"/><rect x="0.01" y="210.15" width="381.34" height="0.96"/><rect x="0.01" y="256.85" width="381.34" height="1.92"/></g><g class="cls-22"><path d="M139.1,395.72h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c0-.17.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39-.05h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M142.24,395.35a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(143.1 57.63)">(</text></g><g class="cls-23"><path d="M151.06,395.81a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81A.39.39,0,0,0,151,391a.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(151.86 57.63)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(159.42 57.63)">=</text></g><g class="cls-23"><path d="M177.09,395.81a1.21,1.21,0,0,1-1.22-.89h0a1.94,1.94,0,0,1-.39.48,1.53,1.53,0,0,1-.52.29,2.11,2.11,0,0,1-.75.12,1.65,1.65,0,0,1-.76-.19,1.37,1.37,0,0,1-.56-.58,2,2,0,0,1-.22-1,3.43,3.43,0,0,1,.63-1.93,5.86,5.86,0,0,1,1.78-1.67l.27.37a5.18,5.18,0,0,0-1.28,1.51,3.62,3.62,0,0,0-.47,1.75,1.61,1.61,0,0,0,.24.95.77.77,0,0,0,.66.33.88.88,0,0,0,.68-.32,2.1,2.1,0,0,0,.41-.95l.45-2h.9l-.41,1.85a3.25,3.25,0,0,0-.07.59.94.94,0,0,0,.19.65.66.66,0,0,0,.52.21,1.13,1.13,0,0,0,.8-.33,2.1,2.1,0,0,0,.53-.95,4.86,4.86,0,0,0,.19-1.45,4,4,0,0,0-.13-1,2.57,2.57,0,0,0-.38-.85l.37-.32a3.59,3.59,0,0,1,.81,1.13,3.25,3.25,0,0,1,.27,1.32,3.35,3.35,0,0,1-.31,1.46,2.49,2.49,0,0,1-.89,1.05,2.34,2.34,0,0,1-1.31.38Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-25" transform="translate(178.74 59.91)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(186.06 57.63)">∙</text></g><polygon class="cls-26" points="198.35 34.22 200.03 34.22 200.03 34.26 207.86 34.26 207.86 34.98 199.58 34.98 199.58 34.93 198.95 34.93 196.54 71.02 196.04 71.02 192.6 64.69 191.58 65.23 191.37 64.85 193.31 63.82 196.06 68.91 198.35 34.22"/><rect x="199.58" y="54.06" width="8.28" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(200.72 49.23)">2</text></g><g class="cls-23"><path d="M208.83,395.8l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c0,.15-.11.4-.19.75l-1.26,5.75H206l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(210.44 57.63)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(215.6 57.63)"><tspan class="cls-27">c</tspan><tspan x="4.91" y="0">os</tspan></text></g><path class="cls-26" d="M240,378.52l.25.41a12.83,12.83,0,0,0-2.94,5.56,32,32,0,0,0,0,16.15,12.9,12.9,0,0,0,2.95,5.63l-.25.4a12.67,12.67,0,0,1-3.52-5.71,27.72,27.72,0,0,1,0-16.78A12.76,12.76,0,0,1,240,378.52Zm65.56,0a12.76,12.76,0,0,1,3.52,5.66,27.72,27.72,0,0,1,0,16.78,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.79,12.79,0,0,0,2.95-5.63,29.71,29.71,0,0,0,1-8.05,29.13,29.13,0,0,0-1-8.1,12.83,12.83,0,0,0-2.94-5.56Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="239.18 54.06 271.34 54.06 303.5 54.06 303.5 54.78 271.34 54.78 239.18 54.78 239.18 54.06"/><g class="cls-23"><path d="M244.35,384.9c.06-.28.14-.63.26-1s.21-.71.29-.93l0-.05h-1.58l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6h-1.14l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.83,1.83,0,0,0-.58.51l-.34-.28c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3,1.58,1.58,0,0,1,.44-.19,2.52,2.52,0,0,1,.58-.06h4.42l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(248.36 49.23)">∙</text></g><g class="cls-23"><path d="M256.26,383.69a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(259.76 49.23)">∙</text></g><path class="cls-26" d="M270.53,379.09l.14.41a3.15,3.15,0,0,0-1.87,1.62,6.76,6.76,0,0,0-.6,3,7.06,7.06,0,0,0,.6,3.11,3.16,3.16,0,0,0,1.86,1.64l-.13.41a3.87,3.87,0,0,1-2.42-1.79,7,7,0,0,1,0-6.64A3.86,3.86,0,0,1,270.53,379.09Zm30.2,0a3.87,3.87,0,0,1,2.41,1.79,6.16,6.16,0,0,1,.85,3.32,6.23,6.23,0,0,1-.84,3.32,3.9,3.9,0,0,1-2.42,1.79l-.13-.41a3.14,3.14,0,0,0,1.85-1.64,7.06,7.06,0,0,0,.6-3.11,6.76,6.76,0,0,0-.6-3,3.12,3.12,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M274.69,386.46a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57h-4.43v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.61-.69.9-1.05a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1A5.92,5.92,0,0,1,273,380a4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm4.31,1a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(282.2 49.23)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(292.88 49.23)">1</text></g><g class="cls-23"><path d="M269.41,402.3a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57h-4.43v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.62-.69.9-1.05a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1a5.92,5.92,0,0,1,1.22-.45,4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm10.12-6.5-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,2.63,2.63,0,0,0,.13-.32c0-.12.09-.36.17-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-28"><text class="cls-5" transform="translate(136.02 91.95)">where </text></g><g class="cls-23"><path d="M175.77,430.13a1.21,1.21,0,0,1-1.22-.89h0a1.94,1.94,0,0,1-.39.48,1.53,1.53,0,0,1-.52.29,2.11,2.11,0,0,1-.75.12,1.65,1.65,0,0,1-.76-.19,1.37,1.37,0,0,1-.56-.58,2,2,0,0,1-.22-1,3.43,3.43,0,0,1,.63-1.93,5.86,5.86,0,0,1,1.78-1.67l.27.37a5.18,5.18,0,0,0-1.28,1.51,3.62,3.62,0,0,0-.47,1.75,1.61,1.61,0,0,0,.24.95.77.77,0,0,0,.66.33.88.88,0,0,0,.68-.32,2.1,2.1,0,0,0,.41-.95l.45-2h.9l-.41,1.85a3.25,3.25,0,0,0-.07.59.94.94,0,0,0,.19.65.66.66,0,0,0,.52.21,1.13,1.13,0,0,0,.8-.33,2.1,2.1,0,0,0,.53-.95,4.86,4.86,0,0,0,.19-1.45,4,4,0,0,0-.13-1,2.57,2.57,0,0,0-.38-.85l.37-.32a3.59,3.59,0,0,1,.81,1.13,3.25,3.25,0,0,1,.27,1.32,3.35,3.35,0,0,1-.31,1.46,2.49,2.49,0,0,1-.89,1.05,2.34,2.34,0,0,1-1.31.38Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-25" transform="translate(177.42 94.23)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(185.34 91.95)">=</text></g><path class="cls-26" d="M202.24,421.8l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.1,3.1,0,0,0,1.86,1.64l-.13.42a3.87,3.87,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.93,3.93,0,0,1,202.24,421.8Zm24.92,0a3.91,3.91,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.9,3.9,0,0,1-2.42,1.79l-.13-.42a3.08,3.08,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M203.92,426.41a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(208.02 91.95)">=</text></g><g class="cls-23"><text class="cls-24" transform="translate(219.3 91.95)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(229.98 91.95)">?</text></g><polygon class="cls-26" points="247.1 75.66 253.22 75.66 253.22 76.38 247.51 76.38 247.51 76.41 246.43 76.41 244.03 100.31 243.53 100.31 240.2 94.2 239.21 94.72 239 94.34 240.87 93.35 243.58 98.36 245.82 75.69 247.1 75.69 247.1 75.66"/><rect x="247.1" y="88.38" width="6.12" height="0.72"/><g class="cls-23"><text class="cls-25" transform="translate(248 85.47)">2</text></g><g class="cls-23"><path d="M254.25,430.2l0,.19a.83.83,0,0,0-.21.07.46.46,0,0,0-.12.12,1.06,1.06,0,0,0-.1.25,3.94,3.94,0,0,0-.14.54l-.92,4.19h-.54L251,432.19c-.14-.4-.26-.8-.37-1.2h0c0,.13-.06.38-.13.76s-.15.74-.22,1.1l-.35,1.57a3.29,3.29,0,0,0-.08.61.3.3,0,0,0,.09.25.5.5,0,0,0,.29.09l0,.19h-1.37l0-.19a.51.51,0,0,0,.22-.08.39.39,0,0,0,.13-.15,1.85,1.85,0,0,0,.09-.23c0-.09.07-.26.13-.52l.67-3a2.21,2.21,0,0,0,.06-.32,1.71,1.71,0,0,0,0-.31.29.29,0,0,0-.1-.26.52.52,0,0,0-.29-.09l0-.19h1.26l1,3.06c.15.44.27.81.35,1.11h0c0-.16.07-.42.15-.79s.14-.7.2-1l.28-1.23a3.58,3.58,0,0,0,.08-.63.31.31,0,0,0-.09-.26.52.52,0,0,0-.29-.09l0-.19Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(255.08 91.95)">:</text></g><g class="cls-28"><text class="cls-24" transform="translate(259.76 91.95)">1</text></g><g class="cls-29"><path d="M138,477.48h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c.05-.16.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39,0h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M141.13,477.11a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0H142l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(141.99 139.39)">(</text></g><g class="cls-23"><path d="M150,477.57a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88A3.79,3.79,0,0,0,150,473a.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(150.75 139.39)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(158.31 139.39)">=</text></g><polygon class="cls-26" points="176.73 115.98 178.42 115.98 178.42 116.03 186.25 116.03 186.25 116.75 177.97 116.75 177.97 116.7 177.34 116.7 174.93 152.79 174.43 152.79 170.99 146.45 169.97 147 169.76 146.62 171.7 145.59 174.45 150.68 176.73 115.98"/><rect x="177.97" y="135.83" width="8.28" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(179.1 130.99)">2</text></g><g class="cls-23"><path d="M187.23,477.56l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c0,.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(188.82 139.39)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(193.98 139.39)"><tspan class="cls-12">s</tspan><tspan class="cls-30" x="4.8" y="0">i</tspan><tspan x="7.91" y="0">n</tspan></text></g><path class="cls-26" d="M216.8,460.29l.26.4a12.88,12.88,0,0,0-2.95,5.57,31.83,31.83,0,0,0,0,16.14,12.84,12.84,0,0,0,2.95,5.64l-.26.4a12.75,12.75,0,0,1-3.51-5.71,25.79,25.79,0,0,1-1.3-8.38,25.53,25.53,0,0,1,1.3-8.4A12.78,12.78,0,0,1,216.8,460.29Zm100,0a12.71,12.71,0,0,1,3.52,5.66,25.8,25.8,0,0,1,1.3,8.4,26.07,26.07,0,0,1-1.3,8.38,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.93,12.93,0,0,0,2.94-5.64,31.83,31.83,0,0,0,0-16.14,13,13,0,0,0-2.94-5.57Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="216.01 135.83 248.93 135.83 281.85 135.83 314.77 135.83 314.77 136.55 281.85 136.55 248.93 136.55 216.01 136.55 216.01 135.83"/><g class="cls-23"><path d="M221.17,466.67c.06-.29.14-.64.26-1.05s.21-.71.29-.93l0-.05h-1.58l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6H218l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.76,1.76,0,0,0-.58.52l-.34-.29c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3A1.58,1.58,0,0,1,219,464a2.52,2.52,0,0,1,.58-.06H224l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(225.3 130.99)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(230.46 130.99)">(</text></g><g class="cls-23"><text class="cls-24" transform="translate(235.02 130.99)">2</text></g><g class="cls-23"><path d="M243.88,465.45a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(247.38 130.99)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(258.06 130.99)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(264.18 130.99)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(271.16 130.99)">∙</text></g><path class="cls-26" d="M281.91,460.85l.15.42a3.06,3.06,0,0,0-1.87,1.62,6.73,6.73,0,0,0-.61,3,7,7,0,0,0,.61,3.11,3.05,3.05,0,0,0,1.85,1.64l-.13.42a3.9,3.9,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A4,4,0,0,1,281.91,460.85Zm30.08,0a3.93,3.93,0,0,1,2.42,1.79,7,7,0,0,1,0,6.65,3.9,3.9,0,0,1-2.42,1.79l-.13-.42a3.05,3.05,0,0,0,1.85-1.64,6.89,6.89,0,0,0,.61-3.11,6.74,6.74,0,0,0-.6-3,3.08,3.08,0,0,0-1.88-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M286.09,468.22a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57H283v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.62-.69.9-1a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1a5.92,5.92,0,0,1,1.22-.45,4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm4.31.95a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(293.48 130.99)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(304.16 130.99)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(258.18 146.83)">4</text></g><g class="cls-23"><path d="M273.51,477.56l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26H266l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M138.55,528.68h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c.05-.16.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39-.05h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M141.69,528.31a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(142.55 190.59)">(</text></g><g class="cls-23"><path d="M150.53,528.77a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31,0h.31Zm1.62-7.36-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(151.33 190.59)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(158.89 190.59)">=</text></g><polygon class="cls-26" points="177.29 167.4 178.97 167.4 178.97 167.45 212.13 167.45 212.13 168.17 178.53 168.17 178.53 168.12 177.89 168.12 175.48 204.21 174.98 204.21 171.54 197.87 170.52 198.42 170.31 198.03 172.25 197.01 175 202.09 177.29 167.4"/><rect x="178.53" y="187.01" width="33.6" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(192.25 182.19)">4</text></g><g class="cls-23"><text class="cls-24" transform="translate(178.57 198.03)">2</text></g><g class="cls-23"><path d="M193.9,528.76l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72L191,533c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(195.49 198.03)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(206.05 198.03)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(214.69 190.59)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(219.85 190.59)"><tspan class="cls-27">s</tspan><tspan class="cls-31" x="4.79" y="0">i</tspan><tspan x="7.8" y="0">n</tspan></text></g><path class="cls-26" d="M242.68,511.47l.25.4a12.87,12.87,0,0,0-2.94,5.57,31.83,31.83,0,0,0,0,16.14,12.93,12.93,0,0,0,2.94,5.64l-.25.4a12.67,12.67,0,0,1-3.52-5.71,26.08,26.08,0,0,1-1.3-8.39,25.69,25.69,0,0,1,1.3-8.39A12.71,12.71,0,0,1,242.68,511.47Zm94,0a12.71,12.71,0,0,1,3.52,5.66,25.43,25.43,0,0,1,1.3,8.39,25.81,25.81,0,0,1-1.3,8.39,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.84,12.84,0,0,0,2.95-5.64,29.66,29.66,0,0,0,1-8.05,29,29,0,0,0-1-8.09,12.87,12.87,0,0,0-2.94-5.57Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="241.89 187.01 288.27 187.01 334.65 187.01 334.65 187.73 288.27 187.73 241.89 187.73 241.89 187.01"/><g class="cls-23"><path d="M247,517.87c.06-.29.14-.64.26-1s.21-.71.29-.93l0,0H246l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6h-1.14l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.76,1.76,0,0,0-.58.52l-.34-.29c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3,1.58,1.58,0,0,1,.44-.19,2.52,2.52,0,0,1,.58-.06h4.42l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(251.05 182.19)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(256.23 182.19)">(</text></g><g class="cls-23"><text class="cls-24" transform="translate(260.79 182.19)">2</text></g><g class="cls-23"><path d="M269.65,516.65a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32,0h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(273.15 182.19)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(283.83 182.19)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(289.95 182.19)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(296.91 182.19)">∙</text></g><path class="cls-26" d="M307.67,512l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,307.67,512Zm24.08,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M310,520.37a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31,0h.31Zm1.62-7.36-.25,1h-.95l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(313.23 182.19)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(323.91 182.19)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(281.07 198.03)">2</text></g><g class="cls-23"><path d="M296.4,528.76l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74L292,531.49c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53H294c0-.22.09-.58.19-1.09s.2-1,.28-1.34l.38-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-32"><path d="M139.76,577.52h-2.27l0-.26a1.46,1.46,0,0,0,.32-.09A.48.48,0,0,0,138,577a1,1,0,0,0,.16-.36c.05-.16.12-.4.19-.73l1.15-5.29h-.68a1.18,1.18,0,0,0-.54.1,1.17,1.17,0,0,0-.4.36,5.41,5.41,0,0,0-.48.89h-.52l.4-1.82H143l-.42,1.9H142c0-.35,0-.61,0-.79a1.14,1.14,0,0,0-.11-.4.47.47,0,0,0-.18-.19,1.18,1.18,0,0,0-.39,0h-.8L139.34,576q0,.21-.06.33a2.28,2.28,0,0,0-.05.51.6.6,0,0,0,.05.26.3.3,0,0,0,.17.14,1.5,1.5,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M142.89,577.17a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,0,.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09A3,3,0,0,0,144,579l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><path class="cls-26" d="M149.39,569.27l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,149.39,569.27Zm4.88,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M151.73,577.61a4.12,4.12,0,0,1-.47,1.32,1.85,1.85,0,0,1-.77.75,2.4,2.4,0,0,1-1.14.25,1.47,1.47,0,0,1-.45,0l.12-.47a1.72,1.72,0,0,0,.41,0,1.15,1.15,0,0,0,.44-.08,1,1,0,0,0,.34-.26,2.06,2.06,0,0,0,.29-.5,4.84,4.84,0,0,0,.24-.84l.88-3.89a4.33,4.33,0,0,0,.12-.81.42.42,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.31Zm1.63-7.37-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(160.09 239.43)">=</text></g><path class="cls-26" d="M177,569.27l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,177,569.27Zm31.28,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M178.67,573.88a3.62,3.62,0,0,0,.13-.81.39.39,0,0,0-.12-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.47a3.87,3.87,0,0,0-.12.79.63.63,0,0,0,.07.32.28.28,0,0,0,.23.1.6.6,0,0,0,.34-.14,3.64,3.64,0,0,0,.52-.5l.31.3a5.37,5.37,0,0,1-.86.76,1.44,1.44,0,0,1-.74.2.68.68,0,0,1-.56-.26,1,1,0,0,1-.21-.66,4.7,4.7,0,0,1,.15-1Zm1.74-3.64-.25,1h-.95l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(182.77 239.43)">=</text></g><g class="cls-23"><text class="cls-33" transform="translate(191.05 239.43)">=</text></g><g class="cls-23"><path d="M205.73,577.61a4.12,4.12,0,0,1-.47,1.32,1.85,1.85,0,0,1-.77.75,2.4,2.4,0,0,1-1.14.25,1.47,1.47,0,0,1-.45,0l.12-.47a1.72,1.72,0,0,0,.41,0,1.15,1.15,0,0,0,.44-.08,1,1,0,0,0,.34-.26,2.06,2.06,0,0,0,.29-.5,4.84,4.84,0,0,0,.24-.84l.88-3.89a4.33,4.33,0,0,0,.12-.81.42.42,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.31Zm1.63-7.37-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(213.49 239.43)">?</text></g><g class="cls-23"><text class="cls-33" transform="translate(219.85 239.43)">1</text></g><g class="cls-23"><text class="cls-33" transform="translate(225.97 239.43)">:</text></g><g class="cls-32"><text class="cls-33" transform="translate(230.77 239.43)">0</text></g></g></g></svg>
\ No newline at end of file
diff --git a/doc/img/tx_cands_large.svg b/doc/img/tx_cands_large.svg
new file mode 100644
index 0000000..fb4f5f4
--- /dev/null
+++ b/doc/img/tx_cands_large.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 324.79 73.56"><defs><style>.cls-1,.cls-22{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-16,.cls-17,.cls-4{font-size:12px;fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0.01em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0.01em;}.cls-14{letter-spacing:0em;}.cls-15{clip-path:url(#clip-path-4);}.cls-16{font-family:Calibri, Calibri;}.cls-17{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-18{letter-spacing:0em;}.cls-19{letter-spacing:0em;}.cls-20{clip-path:url(#clip-path-7);}.cls-21{clip-path:url(#clip-path-10);}.cls-22{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="31.68" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="55.56" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-7" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="79.44" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-10" transform="translate(-53.04 -30.24)"><rect class="cls-1" width="380.26" height="105.36"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" y="0.96" width="324.79" height="24"/><g class="cls-3"><text class="cls-4" transform="translate(11.3 16.92)"><tspan class="cls-5">M</tspan><tspan class="cls-6" x="10.44" y="0">a</tspan><tspan x="16.33" y="0">x(</tspan><tspan class="cls-7" x="25.58" y="0">w</tspan><tspan class="cls-8" x="34.55" y="0">i</tspan><tspan class="cls-7" x="37.53" y="0">d</tspan><tspan x="43.99" y="0">t</tspan><tspan class="cls-9" x="48.15" y="0">h</tspan><tspan x="54.65" y="0">,</tspan><tspan class="cls-10" x="57.75" y="0"> </tspan><tspan class="cls-7" x="60.51" y="0">h</tspan><tspan class="cls-11" x="66.97" y="0">e</tspan><tspan class="cls-12" x="72.98" y="0">i</tspan><tspan class="cls-5" x="75.96" y="0">g</tspan><tspan class="cls-7" x="81.6" y="0">h</tspan><tspan x="88.06" y="0">t)</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(158.09 16.92)"><tspan class="cls-7">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-13" x="13.85" y="0">r</tspan><tspan x="18.18" y="0">a</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(261.07 16.92)"><tspan class="cls-14">In</tspan><tspan x="9.69" y="0">ter</tspan></text></g><g class="cls-15"><text class="cls-16" transform="translate(53.18 40.8)"><tspan class="cls-8">3</tspan><tspan x="6.12" y="0">2</tspan></text></g><g class="cls-15"><text class="cls-17" transform="translate(148.13 40.8)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-15"><text class="cls-17" transform="translate(235.75 40.8)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">l</tspan><tspan class="cls-18" x="36.23" y="0">y</tspan><tspan x="41.61" y="0">,</tspan><tspan class="cls-14" x="44.6" y="0" xml:space="preserve">  </tspan><tspan x="50.07" y="0">I</tspan><tspan class="cls-19" x="53.1" y="0">D</tspan><tspan x="60.49" y="0">TX</tspan></text></g><g class="cls-20"><text class="cls-16" transform="translate(53.18 64.68)"><tspan class="cls-8">6</tspan><tspan x="6.12" y="0">4</tspan></text></g><g class="cls-20"><text class="cls-17" transform="translate(148.13 64.68)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-20"><text class="cls-17" transform="translate(251.23 64.68)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-21"><line class="cls-22" x1="118.07" y1="1.98" x2="118.07" y2="23.82"/><rect x="118.01" y="1.92" width="0.96" height="21.96"/><line class="cls-22" x1="221.17" y1="1.98" x2="221.17" y2="23.82"/><rect x="221.11" y="1.92" width="0.96" height="21.96"/><line class="cls-22" x1="118.07" y1="25.86" x2="118.07" y2="71.58"/><rect x="118.01" y="25.8" width="0.96" height="45.84"/><line class="cls-22" x1="221.17" y1="25.86" x2="221.17" y2="71.58"/><rect x="221.11" y="25.8" width="0.96" height="45.84"/><rect width="324.79" height="1.92"/><rect y="23.88" width="324.79" height="1.92"/><rect y="71.64" width="324.79" height="1.92"/></g></g></g></svg>
\ No newline at end of file
diff --git a/doc/img/tx_cands_small.svg b/doc/img/tx_cands_small.svg
new file mode 100644
index 0000000..ddd9a87
--- /dev/null
+++ b/doc/img/tx_cands_small.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 380.5 90.27"><defs><style>.cls-1,.cls-30{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-17,.cls-18,.cls-4{font-size:12px;}.cls-17,.cls-18,.cls-19,.cls-4{fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0.01em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0.01em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{clip-path:url(#clip-path-4);}.cls-17{font-family:Calibri, Calibri;}.cls-18{font-family:Calibri-Italic, Calibri;}.cls-18,.cls-19{font-style:italic;}.cls-19{font-size:11.04px;font-family:SegoeUI-Italic, Segoe UI;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0em;}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:0em;}.cls-25{clip-path:url(#clip-path-8);}.cls-26{clip-path:url(#clip-path-12);}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:0em;}.cls-29{clip-path:url(#clip-path-17);}.cls-30{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="31.68" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="55.56" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-8" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="79.44" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-12" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="103.33" width="431.98" height="15.74"/></clipPath><clipPath id="clip-path-17" transform="translate(-53.04 -30.24)"><rect class="cls-1" width="435.94" height="673.9"/></clipPath></defs><title>tx_cands_smallAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" y="0.96" width="380.5" height="24"/><g class="cls-3"><text class="cls-4" transform="translate(8.66 16.92)"><tspan class="cls-5">M</tspan><tspan class="cls-6" x="10.44" y="0">i</tspan><tspan class="cls-7" x="13.42" y="0">n</tspan><tspan x="19.89" y="0">(w</tspan><tspan class="cls-8" x="32.57" y="0">i</tspan><tspan class="cls-9" x="35.59" y="0">d</tspan><tspan x="42.05" y="0">t</tspan><tspan class="cls-10" x="46.21" y="0">h</tspan><tspan x="52.71" y="0">,</tspan><tspan class="cls-11" x="55.8" y="0"> </tspan><tspan class="cls-7" x="58.56" y="0">h</tspan><tspan class="cls-12" x="65.03" y="0">e</tspan><tspan class="cls-13" x="71.03" y="0">i</tspan><tspan class="cls-14" x="74.02" y="0">g</tspan><tspan class="cls-9" x="79.66" y="0">h</tspan><tspan x="86.12" y="0">t)</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(159.77 16.92)"><tspan class="cls-9">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-15" x="13.85" y="0">r</tspan><tspan x="18.18" y="0">a</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(294.19 16.92)"><tspan class="cls-7">In</tspan><tspan x="9.69" y="0">ter</tspan></text></g><g class="cls-16"><text class="cls-17" transform="translate(52.7 40.8)">4</text></g><g class="cls-16"><text class="cls-18" transform="translate(122.57 40.8)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-16"><text class="cls-19" transform="translate(155.93 40.8)"><tspan class="cls-20">I</tspan><tspan class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-23" x="25.36" y="0"> </tspan><tspan class="cls-24" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-16"><text class="cls-18" transform="translate(290.59 40.8)">ALL<tspan class="cls-13" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-25"><text class="cls-17" transform="translate(52.7 64.68)">8</text></g><g class="cls-25"><text class="cls-18" transform="translate(122.57 64.68)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(155.93 64.68)"><tspan class="cls-20">I</tspan><tspan class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-23" x="25.36" y="0"> </tspan><tspan class="cls-24" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-18" transform="translate(290.59 64.68)">ALL<tspan class="cls-13" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-26"><text class="cls-17" transform="translate(49.58 84.99)"><tspan class="cls-13">1</tspan><tspan x="6.12" y="0">6</tspan></text></g><g class="cls-26"><text class="cls-18" transform="translate(142.49 84.99)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(175.85 84.99)"><tspan class="cls-20">IDT</tspan><tspan x="16.44" y="0">X</tspan></text></g><g class="cls-26"><text class="cls-18" transform="translate(257.11 84.99)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">9, </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(290.47 84.99)"><tspan class="cls-27">I</tspan><tspan class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-22" x="25.36" y="0"> </tspan><tspan class="cls-28" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-29"><line class="cls-30" x1="110.84" y1="1.98" x2="110.84" y2="23.82"/><rect x="110.78" y="1.92" width="0.96" height="21.96"/><line class="cls-30" x1="231.73" y1="1.98" x2="231.73" y2="23.82"/><rect x="231.67" y="1.92" width="0.96" height="21.96"/><line class="cls-30" x1="110.84" y1="25.86" x2="110.84" y2="88.29"/><rect x="110.78" y="25.81" width="0.96" height="62.54"/><line class="cls-30" x1="231.73" y1="25.86" x2="231.73" y2="88.29"/><rect x="231.67" y="25.81" width="0.96" height="62.54"/><rect width="380.5" height="1.92"/><rect y="23.88" width="380.5" height="1.92"/><rect y="88.35" width="380.5" height="1.92"/></g></g></g></svg>
\ No newline at end of file
diff --git a/doc/img/tx_chroma.svg b/doc/img/tx_chroma.svg
new file mode 100644
index 0000000..a0915e0
--- /dev/null
+++ b/doc/img/tx_chroma.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 380.5 244.23"><defs><style>.cls-1,.cls-41{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-4{font-size:12px;font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-19,.cls-4{fill:#333;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0.01em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{clip-path:url(#clip-path-3);}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0em;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0.01em;}.cls-18{clip-path:url(#clip-path-5);}.cls-19{font-size:11.04px;font-family:SegoeUI-Italic, Segoe UI;font-style:italic;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{clip-path:url(#clip-path-8);}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:-0.01em;}.cls-25{clip-path:url(#clip-path-11);}.cls-26{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{clip-path:url(#clip-path-14);}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{clip-path:url(#clip-path-17);}.cls-32{clip-path:url(#clip-path-20);}.cls-33{clip-path:url(#clip-path-23);}.cls-34{clip-path:url(#clip-path-26);}.cls-35{clip-path:url(#clip-path-29);}.cls-36{clip-path:url(#clip-path-32);}.cls-37{clip-path:url(#clip-path-35);}.cls-38{clip-path:url(#clip-path-38);}.cls-39{clip-path:url(#clip-path-41);}.cls-40{clip-path:url(#clip-path-44);}.cls-41{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="53.52" y="17.15" width="110.3" height="30.24"/></clipPath><clipPath id="clip-path-3" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="17.15" width="431.98" height="30.24"/></clipPath><clipPath id="clip-path-5" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="48.35" width="431.98" height="15"/></clipPath><clipPath id="clip-path-8" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="64.31" width="431.98" height="15"/></clipPath><clipPath id="clip-path-11" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="80.27" width="431.98" height="15"/></clipPath><clipPath id="clip-path-14" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="96.23" width="431.98" height="15"/></clipPath><clipPath id="clip-path-17" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="112.2" width="431.98" height="15.02"/></clipPath><clipPath id="clip-path-20" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="128.18" width="431.98" height="15"/></clipPath><clipPath id="clip-path-23" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="144.14" width="431.98" height="15"/></clipPath><clipPath id="clip-path-26" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="160.1" width="431.98" height="15"/></clipPath><clipPath id="clip-path-29" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="176.06" width="431.98" height="15"/></clipPath><clipPath id="clip-path-32" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="192.02" width="431.98" height="15"/></clipPath><clipPath id="clip-path-35" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="207.98" width="431.98" height="15"/></clipPath><clipPath id="clip-path-38" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="223.94" width="431.98" height="17.88"/></clipPath><clipPath id="clip-path-41" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="242.78" width="431.98" height="15.72"/></clipPath><clipPath id="clip-path-44" transform="translate(-53.04 -15.71)"><rect class="cls-1" width="435.94" height="567.07"/></clipPath></defs><title>tx_chromaAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" y="0.96" width="380.5" height="31.32"/><g class="cls-3"><text class="cls-4" transform="translate(16.58 12.84)"><tspan class="cls-5">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-6" x="13.85" y="0">r</tspan><tspan class="cls-7" x="18.18" y="0">a</tspan><tspan class="cls-5" x="24.07" y="0"> </tspan><tspan x="26.81" y="0">P</tspan><tspan class="cls-8" x="33.2" y="0">r</tspan><tspan class="cls-7" x="37.47" y="0">e</tspan><tspan class="cls-5" x="43.48" y="0">d</tspan><tspan class="cls-9" x="49.94" y="0">i</tspan><tspan x="52.93" y="0">c</tspan><tspan class="cls-9" x="57.95" y="0">ti</tspan><tspan x="65.01" y="0">o</tspan><tspan class="cls-10" x="71.46" y="0">n</tspan><tspan x="77.95" y="0"> </tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(41.06 28.44)"><tspan class="cls-11">M</tspan><tspan x="10.44" y="0">o</tspan><tspan class="cls-10" x="16.89" y="0">d</tspan><tspan x="23.38" y="0">e</tspan></text></g><g class="cls-12"><text class="cls-4" transform="translate(126.17 20.64)">V<tspan class="cls-11" x="7.1" y="0">e</tspan><tspan class="cls-13" x="13.09" y="0">r</tspan><tspan class="cls-6" x="17.39" y="0">ti</tspan><tspan x="24.49" y="0">cal</tspan><tspan class="cls-10" x="38.38" y="0"> </tspan><tspan x="41.14" y="0">t</tspan><tspan class="cls-6" x="45.3" y="0">r</tspan><tspan class="cls-7" x="49.64" y="0">a</tspan><tspan class="cls-5" x="55.53" y="0">n</tspan><tspan x="61.99" y="0">s</tspan><tspan class="cls-9" x="66.78" y="0">f</tspan><tspan x="70.61" y="0">o</tspan><tspan class="cls-14" x="77.06" y="0">r</tspan><tspan x="81.39" y="0">m</tspan></text></g><g class="cls-12"><text class="cls-4" transform="translate(253.87 20.64)">Ho<tspan class="cls-10" x="14.02" y="0">r</tspan><tspan class="cls-9" x="18.33" y="0">i</tspan><tspan x="21.32" y="0">z</tspan><tspan class="cls-10" x="26.09" y="0">o</tspan><tspan class="cls-15" x="32.59" y="0">n</tspan><tspan x="39.05" y="0">tal</tspan><tspan class="cls-14" x="52.08" y="0"> </tspan><tspan x="54.85" y="0">t</tspan><tspan class="cls-6" x="59.01" y="0">r</tspan><tspan class="cls-7" x="63.35" y="0">a</tspan><tspan class="cls-15" x="69.24" y="0">n</tspan><tspan x="75.7" y="0">s</tspan><tspan class="cls-16" x="80.49" y="0">f</tspan><tspan x="84.32" y="0">o</tspan><tspan class="cls-17" x="90.77" y="0">r</tspan><tspan x="95.1" y="0">m</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(9.62 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">C_P</tspan><tspan class="cls-21" x="25.8" y="0">R</tspan><tspan x="32.27" y="0">ED</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(160.37 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(294.91 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(9.62 60.24)">V_P<tspan class="cls-23" x="17.86" y="0">R</tspan><tspan x="24.35" y="0">ED</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(157.61 60.24)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(294.91 60.24)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(9.62 76.2)">H<tspan class="cls-26" x="7.82" y="0">_</tspan><tspan x="12.37" y="0">P</tspan><tspan class="cls-21" x="18.72" y="0">R</tspan><tspan x="25.19" y="0">ED</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(160.37 76.2)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(292.15 76.2)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(9.62 92.16)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">45</tspan><tspan x="19.76" y="0">_P</tspan><tspan class="cls-30" x="30.69" y="0">R</tspan><tspan x="37.14" y="0">ED</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(160.37 92.16)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(294.91 92.16)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(9.62 108.15)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">135</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(157.61 108.15)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(292.15 108.15)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(9.62 124.11)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">113</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(157.61 124.11)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(294.91 124.11)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(9.62 140.07)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">157</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(160.37 140.07)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(292.15 140.07)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(9.62 156.03)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">203</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(160.37 156.03)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(292.15 156.03)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(9.62 171.99)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">67</tspan><tspan x="19.76" y="0">_P</tspan><tspan class="cls-30" x="30.69" y="0">R</tspan><tspan x="37.14" y="0">ED</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(157.61 171.99)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(294.91 171.99)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(9.62 187.95)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">P</tspan><tspan class="cls-21" x="55.58" y="0">R</tspan><tspan x="62.05" y="0">ED</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(157.61 187.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(292.15 187.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(9.62 203.91)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">V_P</tspan><tspan class="cls-23" x="67.1" y="0">R</tspan><tspan x="73.58" y="0">ED</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(157.61 203.91)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(294.91 203.91)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(9.62 221.19)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">H</tspan><tspan class="cls-26" x="57.05" y="0">_</tspan><tspan x="61.6" y="0">P</tspan><tspan class="cls-21" x="67.95" y="0">R</tspan><tspan x="74.42" y="0">ED</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(160.37 221.19)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(292.15 221.19)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(9.62 238.95)">PAE<tspan class="cls-26" x="18.83" y="0">T</tspan><tspan x="24.61" y="0">H</tspan><tspan class="cls-26" x="32.42" y="0">_</tspan><tspan x="36.97" y="0">P</tspan><tspan class="cls-21" x="43.32" y="0">R</tspan><tspan x="49.79" y="0">ED</tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(157.61 238.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(292.15 238.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-40"><line class="cls-41" x1="110.84" y1="1.98" x2="110.84" y2="31.14"/><rect x="110.78" y="1.92" width="0.96" height="29.28"/><line class="cls-41" x1="231.73" y1="1.98" x2="231.73" y2="31.14"/><rect x="231.67" y="1.92" width="0.96" height="29.28"/><line class="cls-41" x1="110.84" y1="33.18" x2="110.84" y2="242.25"/><rect x="110.78" y="33.13" width="0.96" height="209.18"/><line class="cls-41" x1="231.73" y1="33.18" x2="231.73" y2="242.25"/><rect x="231.67" y="33.13" width="0.96" height="209.18"/><rect width="380.5" height="1.92"/><rect y="31.2" width="380.5" height="1.92"/><rect y="242.31" width="380.5" height="1.92"/></g></g></g></svg>
\ No newline at end of file
diff --git a/doc/img/tx_partition.svg b/doc/img/tx_partition.svg
new file mode 100644
index 0000000..e0ce50c
--- /dev/null
+++ b/doc/img/tx_partition.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 172.61 310.73"><defs><style>.cls-1,.cls-38{fill:none;}.cls-2{clip-path:url(#clip-path);}.cls-3{fill:#ddebf7;}.cls-4{clip-path:url(#clip-path-2);}.cls-5{font-size:11.04px;font-family:Calibri, Calibri;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{clip-path:url(#clip-path-4);}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{letter-spacing:0em;}.cls-17{clip-path:url(#clip-path-8);}.cls-18{clip-path:url(#clip-path-10);}.cls-19{letter-spacing:0.01em;}.cls-20{clip-path:url(#clip-path-12);}.cls-21{clip-path:url(#clip-path-14);}.cls-22{clip-path:url(#clip-path-16);}.cls-23{clip-path:url(#clip-path-18);}.cls-24{clip-path:url(#clip-path-20);}.cls-25{letter-spacing:0.01em;}.cls-26{clip-path:url(#clip-path-22);}.cls-27{clip-path:url(#clip-path-24);}.cls-28{clip-path:url(#clip-path-26);}.cls-29{clip-path:url(#clip-path-28);}.cls-30{clip-path:url(#clip-path-30);}.cls-31{clip-path:url(#clip-path-32);}.cls-32{clip-path:url(#clip-path-34);}.cls-33{clip-path:url(#clip-path-36);}.cls-34{clip-path:url(#clip-path-38);}.cls-35{clip-path:url(#clip-path-40);}.cls-36{clip-path:url(#clip-path-42);}.cls-37{clip-path:url(#clip-path-44);}.cls-38{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="1.92" width="172.49" height="323.09"/></clipPath><clipPath id="clip-path-2" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="17.16" width="86.9" height="29.52"/></clipPath><clipPath id="clip-path-4" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="89.78" y="17.16" width="83.66" height="29.52"/></clipPath><clipPath id="clip-path-8" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="62.88" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-10" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="77.4" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-12" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="91.92" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-14" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="106.45" width="172.49" height="13.58"/></clipPath><clipPath id="clip-path-16" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="120.99" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-18" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="135.51" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-20" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="150.03" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-22" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="164.55" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-24" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="179.07" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-26" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="193.59" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-28" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="208.11" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-30" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="222.63" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-32" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="237.15" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-34" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="251.67" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-36" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="266.19" width="172.49" height="13.58"/></clipPath><clipPath id="clip-path-38" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="280.73" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-40" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="295.25" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-42" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="309.77" width="172.49" height="14.28"/></clipPath><clipPath id="clip-path-44" transform="translate(-1.44 -15.24)"><rect class="cls-1" width="176.45" height="327.05"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><g class="cls-2"><rect class="cls-3" y="1.44" width="172.61" height="30.6"/></g><g class="cls-4"><text class="cls-5" transform="translate(5.28 13.2)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan class="cls-8" x="46.53" y="0" xml:space="preserve"> size </tspan><tspan class="cls-10" x="68.22" y="0">o</tspan><tspan x="74.1" y="0">f </tspan></text></g><g class="cls-4"><text class="cls-5" transform="translate(12.96 27.72)">cu<tspan class="cls-11" x="10.47" y="0">r</tspan><tspan x="14.28" y="0">rent depth</tspan></text></g><g class="cls-12"><text class="cls-5" transform="translate(91.46 13.2)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan class="cls-8" x="46.53" y="0" xml:space="preserve"> size </tspan><tspan class="cls-10" x="68.22" y="0">o</tspan><tspan x="74.1" y="0">f </tspan></text></g><g class="cls-12"><text class="cls-5" transform="translate(105.86 27.72)"><tspan class="cls-6">n</tspan><tspan x="5.77" y="0">e</tspan><tspan class="cls-13" x="11.26" y="0">x</tspan><tspan x="16.06" y="0">t</tspan><tspan class="cls-14" x="19.76" y="0"> </tspan><tspan class="cls-6" x="22.28" y="0">d</tspan><tspan x="28.05" y="0">epth</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(27.48 43.32)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(113.78 43.32)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-17"><text class="cls-5" transform="translate(27.48 58.2)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-17"><text class="cls-5" transform="translate(113.78 58.2)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-18"><text class="cls-5" transform="translate(21.84 72.72)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-18"><text class="cls-5" transform="translate(113.78 72.72)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-20"><text class="cls-5" transform="translate(21.84 87.24)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-20"><text class="cls-5" transform="translate(108.14 87.24)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-21"><text class="cls-5" transform="translate(21.84 101.79)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-21"><text class="cls-5" transform="translate(108.14 101.79)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-22"><text class="cls-5" transform="translate(27.48 116.31)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-22"><text class="cls-5" transform="translate(113.78 116.31)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-23"><text class="cls-5" transform="translate(27.48 130.83)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-23"><text class="cls-5" transform="translate(113.78 130.83)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-24"><text class="cls-5" transform="translate(24.72 145.35)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-24"><text class="cls-5" transform="translate(113.78 145.35)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-26"><text class="cls-5" transform="translate(24.72 159.87)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-26"><text class="cls-5" transform="translate(113.78 159.87)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-27"><text class="cls-5" transform="translate(21.84 174.39)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-27"><text class="cls-5" transform="translate(108.14 174.39)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-28"><text class="cls-5" transform="translate(21.84 188.91)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-28"><text class="cls-5" transform="translate(108.14 188.91)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-29"><text class="cls-5" transform="translate(21.84 203.43)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-29"><text class="cls-5" transform="translate(108.14 203.43)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-30"><text class="cls-5" transform="translate(21.84 217.95)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-30"><text class="cls-5" transform="translate(108.14 217.95)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-31"><text class="cls-5" transform="translate(24.72 232.47)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-31"><text class="cls-5" transform="translate(113.78 232.47)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-32"><text class="cls-5" transform="translate(24.72 246.99)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X4</tspan></text></g><g class="cls-32"><text class="cls-5" transform="translate(113.78 246.99)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-33"><text class="cls-5" transform="translate(24.72 261.53)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">3</tspan><tspan x="33.68" y="0">2</tspan></text></g><g class="cls-33"><text class="cls-5" transform="translate(110.9 261.53)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-34"><text class="cls-5" transform="translate(24.72 276.05)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-34"><text class="cls-5" transform="translate(110.9 276.05)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-35"><text class="cls-5" transform="translate(21.84 290.57)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-35"><text class="cls-5" transform="translate(108.14 290.57)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-36"><text class="cls-5" transform="translate(21.84 305.45)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-36"><text class="cls-5" transform="translate(108.14 305.45)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-37"><line class="cls-38" x1="87.44" y1="2.94" x2="87.44" y2="30.42"/><rect x="87.38" y="2.88" width="0.96" height="27.6"/><line class="cls-38" x1="87.44" y1="33.42" x2="87.44" y2="307.79"/><rect x="87.38" y="33.36" width="0.96" height="274.49"/><rect width="172.61" height="2.88"/><rect y="30.48" width="172.61" height="0.96"/><rect y="32.4" width="172.61" height="0.96"/><rect y="307.85" width="172.61" height="2.88"/></g></g></g></svg>
\ No newline at end of file
diff --git a/doc/img/tx_set.svg b/doc/img/tx_set.svg
new file mode 100644
index 0000000..dee10d4
--- /dev/null
+++ b/doc/img/tx_set.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 347.4 549.8"><defs><style>.cls-1,.cls-60{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-19,.cls-4{font-size:12px;fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0.01em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{clip-path:url(#clip-path-4);}.cls-19{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-20{clip-path:url(#clip-path-7);}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0em;}.cls-23{clip-path:url(#clip-path-10);}.cls-24{clip-path:url(#clip-path-12);}.cls-25{clip-path:url(#clip-path-14);}.cls-26{clip-path:url(#clip-path-16);}.cls-27{clip-path:url(#clip-path-18);}.cls-28{clip-path:url(#clip-path-20);}.cls-29{clip-path:url(#clip-path-22);}.cls-30{clip-path:url(#clip-path-24);}.cls-31{clip-path:url(#clip-path-26);}.cls-32{clip-path:url(#clip-path-28);}.cls-33{clip-path:url(#clip-path-30);}.cls-34{clip-path:url(#clip-path-32);}.cls-35{clip-path:url(#clip-path-34);}.cls-36{clip-path:url(#clip-path-36);}.cls-37{clip-path:url(#clip-path-38);}.cls-38{clip-path:url(#clip-path-40);}.cls-39{clip-path:url(#clip-path-42);}.cls-40{clip-path:url(#clip-path-44);}.cls-41{clip-path:url(#clip-path-46);}.cls-42{clip-path:url(#clip-path-48);}.cls-43{clip-path:url(#clip-path-50);}.cls-44{clip-path:url(#clip-path-52);}.cls-45{clip-path:url(#clip-path-54);}.cls-46{clip-path:url(#clip-path-56);}.cls-47{clip-path:url(#clip-path-58);}.cls-48{clip-path:url(#clip-path-60);}.cls-49{clip-path:url(#clip-path-62);}.cls-50{clip-path:url(#clip-path-64);}.cls-51{clip-path:url(#clip-path-66);}.cls-52{clip-path:url(#clip-path-68);}.cls-53{clip-path:url(#clip-path-70);}.cls-54{clip-path:url(#clip-path-72);}.cls-55{letter-spacing:0.01em;}.cls-56{clip-path:url(#clip-path-73);}.cls-57{clip-path:url(#clip-path-74);}.cls-58{clip-path:url(#clip-path-75);}.cls-59{clip-path:url(#clip-path-76);}.cls-60{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="17.15" width="502.08" height="30.24"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="48.35" width="502.08" height="15"/></clipPath><clipPath id="clip-path-7" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="64.31" width="502.08" height="15"/></clipPath><clipPath id="clip-path-10" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="80.27" width="502.08" height="15"/></clipPath><clipPath id="clip-path-12" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="96.23" width="502.08" height="15"/></clipPath><clipPath id="clip-path-14" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="112.2" width="502.08" height="15.02"/></clipPath><clipPath id="clip-path-16" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="128.18" width="502.08" height="15"/></clipPath><clipPath id="clip-path-18" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="144.14" width="502.08" height="15"/></clipPath><clipPath id="clip-path-20" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="160.1" width="502.08" height="15"/></clipPath><clipPath id="clip-path-22" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="176.06" width="502.08" height="15"/></clipPath><clipPath id="clip-path-24" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="192.02" width="502.08" height="15"/></clipPath><clipPath id="clip-path-26" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="207.98" width="502.08" height="15"/></clipPath><clipPath id="clip-path-28" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="223.94" width="502.08" height="17.88"/></clipPath><clipPath id="clip-path-30" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="242.78" width="502.08" height="15.72"/></clipPath><clipPath id="clip-path-32" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="259.46" width="502.08" height="14.3"/></clipPath><clipPath id="clip-path-34" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="274.72" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-36" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="289.96" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-38" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="305.2" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-40" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="320.44" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-42" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="335.68" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-44" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="350.92" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-46" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="366.16" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-48" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="381.4" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-50" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="396.64" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-52" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="411.88" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-54" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="427.13" width="502.08" height="14.3"/></clipPath><clipPath id="clip-path-56" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="442.39" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-58" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="457.63" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-60" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="472.87" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-62" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="488.11" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-64" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="503.35" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-66" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="518.59" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-68" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="533.83" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-70" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="549.07" width="502.08" height="15"/></clipPath><clipPath id="clip-path-72" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="79.79" width="118.49" height="31.92"/></clipPath><clipPath id="clip-path-73" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="111.72" width="118.49" height="63.86"/></clipPath><clipPath id="clip-path-74" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="175.58" width="118.49" height="144.38"/></clipPath><clipPath id="clip-path-75" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="319.97" width="118.49" height="244.58"/></clipPath><clipPath id="clip-path-76" transform="translate(-53.03 -15.71)"><rect class="cls-1" width="506.04" height="567.07"/></clipPath></defs><title>tx_setAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" x="0.01" y="0.96" width="347.38" height="31.32"/><g class="cls-3"><text class="cls-4" transform="translate(24.51 20.64)"><tspan class="cls-5">Tr</tspan><tspan class="cls-6" x="10.28" y="0">a</tspan><tspan class="cls-7" x="16.17" y="0">n</tspan><tspan x="22.63" y="0">s</tspan><tspan class="cls-5" x="27.42" y="0">f</tspan><tspan x="31.25" y="0">o</tspan><tspan class="cls-8" x="37.7" y="0">r</tspan><tspan class="cls-6" x="42.03" y="0">m</tspan><tspan class="cls-7" x="51.75" y="0"> </tspan><tspan x="54.49" y="0">set</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(127.26 20.64)">V<tspan class="cls-9" x="7.1" y="0">e</tspan><tspan class="cls-10" x="13.09" y="0">r</tspan><tspan class="cls-11" x="17.39" y="0">ti</tspan><tspan x="24.49" y="0">cal</tspan><tspan class="cls-12" x="38.38" y="0"> </tspan><tspan x="41.14" y="0">t</tspan><tspan class="cls-11" x="45.3" y="0">r</tspan><tspan class="cls-13" x="49.64" y="0">a</tspan><tspan class="cls-7" x="55.53" y="0">n</tspan><tspan x="61.99" y="0">s</tspan><tspan class="cls-5" x="66.78" y="0">f</tspan><tspan class="cls-14" x="70.61" y="0">o</tspan><tspan class="cls-15" x="77.06" y="0">r</tspan><tspan x="81.39" y="0">m</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(234.8 20.64)">Ho<tspan class="cls-12" x="14.02" y="0">r</tspan><tspan class="cls-5" x="18.33" y="0">i</tspan><tspan x="21.32" y="0">z</tspan><tspan class="cls-12" x="26.09" y="0">o</tspan><tspan class="cls-16" x="32.59" y="0">n</tspan><tspan x="39.05" y="0">tal</tspan><tspan class="cls-8" x="52.08" y="0"> </tspan><tspan x="54.85" y="0">t</tspan><tspan class="cls-11" x="59.01" y="0">r</tspan><tspan class="cls-13" x="63.35" y="0">a</tspan><tspan class="cls-16" x="69.24" y="0">n</tspan><tspan x="75.7" y="0">s</tspan><tspan class="cls-17" x="80.49" y="0">f</tspan><tspan x="84.32" y="0">o</tspan><tspan class="cls-15" x="90.77" y="0">r</tspan><tspan x="95.1" y="0">m</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(37.35 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-13" x="19.54" y="0">O</tspan><tspan class="cls-9" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(162.06 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(276.44 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(46.95 60.12)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">TX</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(163.62 60.12)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(278 60.12)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-23"><text class="cls-19" transform="translate(160.62 76.08)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-23"><text class="cls-19" transform="translate(276.68 76.08)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-24"><text class="cls-19" transform="translate(163.62 92.04)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-24"><text class="cls-19" transform="translate(276.44 92.04)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(157.62 108.03)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(272 108.03)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(157.62 123.99)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(275 123.99)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-27"><text class="cls-19" transform="translate(160.62 139.95)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-27"><text class="cls-19" transform="translate(272 139.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(162.06 155.91)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(276.44 155.91)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-29"><text class="cls-19" transform="translate(160.62 171.87)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-29"><text class="cls-19" transform="translate(275 171.87)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-30"><text class="cls-19" transform="translate(160.62 187.83)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-30"><text class="cls-19" transform="translate(272 187.83)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(160.62 203.79)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(253.04 203.79)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(158.94 221.19)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(276.44 221.19)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(157.62 238.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(272 238.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(157.62 255.03)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(253.04 255.03)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(138.66 270.29)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(275 270.29)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(138.66 285.53)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(272 285.53)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(139.98 300.77)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(253.04 300.77)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(160.62 316.01)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(275 316.01)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(160.62 331.25)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(272 331.25)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-40"><text class="cls-19" transform="translate(160.62 346.49)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-40"><text class="cls-19" transform="translate(253.04 346.49)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-41"><text class="cls-19" transform="translate(160.62 361.73)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-41"><text class="cls-19" transform="translate(276.68 361.73)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-42"><text class="cls-19" transform="translate(157.62 376.97)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-42"><text class="cls-19" transform="translate(275 376.97)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-43"><text class="cls-19" transform="translate(157.62 392.21)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-43"><text class="cls-19" transform="translate(272 392.21)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-44"><text class="cls-19" transform="translate(157.62 407.45)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-44"><text class="cls-19" transform="translate(253.04 407.45)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-45"><text class="cls-19" transform="translate(157.62 422.72)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-45"><text class="cls-19" transform="translate(276.68 422.72)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-46"><text class="cls-19" transform="translate(138.66 437.96)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-46"><text class="cls-19" transform="translate(275 437.96)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-47"><text class="cls-19" transform="translate(138.66 453.2)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-47"><text class="cls-19" transform="translate(272 453.2)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-48"><text class="cls-19" transform="translate(138.66 468.44)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-48"><text class="cls-19" transform="translate(253.04 468.44)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-49"><text class="cls-19" transform="translate(138.66 483.68)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-49"><text class="cls-19" transform="translate(276.68 483.68)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-50"><text class="cls-19" transform="translate(162.3 498.92)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-50"><text class="cls-19" transform="translate(275 498.92)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-51"><text class="cls-19" transform="translate(162.3 514.16)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-51"><text class="cls-19" transform="translate(272 514.16)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-52"><text class="cls-19" transform="translate(162.3 529.4)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-52"><text class="cls-19" transform="translate(253.04 529.4)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-53"><text class="cls-19" transform="translate(162.3 544.88)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-53"><text class="cls-19" transform="translate(276.68 544.88)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-54"><text class="cls-19" transform="translate(41.67 83.64)">1<tspan class="cls-55" x="6.08" y="0">D</tspan><tspan class="cls-5" x="13.54" y="0">D</tspan><tspan x="20.96" y="0">CT</tspan></text></g><g class="cls-56"><text class="cls-19" transform="translate(45.51 131.55)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-12" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4</tspan></text></g><g class="cls-57"><text class="cls-19" transform="translate(45.51 235.59)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-12" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">9</tspan></text></g><g class="cls-58"><text class="cls-19" transform="translate(43.59 430.16)">ALL<tspan class="cls-5" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-59"><line class="cls-60" x1="118.08" y1="1.98" x2="118.08" y2="31.14"/><rect x="118.02" y="1.92" width="0.96" height="29.28"/><line class="cls-60" x1="226.82" y1="1.98" x2="226.82" y2="31.14"/><rect x="226.76" y="1.92" width="0.96" height="29.28"/><line class="cls-60" x1="118.08" y1="33.18" x2="118.08" y2="547.82"/><rect x="118.02" y="33.13" width="0.96" height="514.75"/><line class="cls-60" x1="226.82" y1="33.18" x2="226.82" y2="547.82"/><rect x="226.76" y="33.13" width="0.96" height="514.75"/><rect x="0.01" width="347.38" height="1.92"/><rect x="0.01" y="31.2" width="347.38" height="1.92"/><line class="cls-60" x1="0.07" y1="47.7" x2="347.33" y2="47.7"/><rect x="0.01" y="47.64" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="63.66" x2="347.33" y2="63.66"/><rect x="0.01" y="63.6" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="95.58" x2="347.33" y2="95.58"/><rect x="0.01" y="95.52" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="159.45" x2="347.33" y2="159.45"/><rect x="0.01" y="159.39" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="303.83" x2="347.33" y2="303.83"/><rect x="0.01" y="303.77" width="347.38" height="0.96"/><rect x="0.01" y="547.88" width="347.38" height="1.92"/></g></g></g></svg>
\ No newline at end of file
diff --git a/docs.cmake b/docs.cmake
index 28ca5c0..24299ec 100644
--- a/docs.cmake
+++ b/docs.cmake
@@ -20,14 +20,23 @@
 set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox")
 set(AOM_DOXYGEN_SECTIONS "av1")
 
-set(AOM_DOXYGEN_SOURCES "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h"
-                        "${AOM_ROOT}/aom/aom_decoder.h"
-                        "${AOM_ROOT}/aom/aom_encoder.h"
-                        "${AOM_ROOT}/aom/aom_frame_buffer.h"
-                        "${AOM_ROOT}/aom/aom_image.h"
-                        "${AOM_ROOT}/aom/aom_integer.h"
-                        "${AOM_ROOT}/keywords.dox" "${AOM_ROOT}/mainpage.dox"
-                        "${AOM_ROOT}/usage.dox")
+set(AOM_DOXYGEN_SOURCES
+    "${AOM_ROOT}/aom/aom.h"
+    "${AOM_ROOT}/aom/aom_codec.h"
+    "${AOM_ROOT}/aom/aom_decoder.h"
+    "${AOM_ROOT}/aom/aom_encoder.h"
+    "${AOM_ROOT}/aom/aom_frame_buffer.h"
+    "${AOM_ROOT}/aom/aom_image.h"
+    "${AOM_ROOT}/aom/aom_integer.h"
+    "${AOM_ROOT}/av1/common/av1_common_int.h"
+    "${AOM_ROOT}/av1/common/av1_loopfilter.h"
+    "${AOM_ROOT}/av1/common/blockd.h"
+    "${AOM_ROOT}/av1/common/cdef.h"
+    "${AOM_ROOT}/av1/common/enums.h"
+    "${AOM_ROOT}/av1/common/restoration.h"
+    "${AOM_ROOT}/keywords.dox"
+    "${AOM_ROOT}/mainpage.dox"
+    "${AOM_ROOT}/usage.dox")
 
 if(CONFIG_AV1_DECODER)
   set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
@@ -45,7 +54,8 @@
   set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder")
 
   set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h"
-                          "${AOM_ROOT}/usage_dx.dox")
+                          "${AOM_ROOT}/usage_dx.dox"
+                          "${AOM_ROOT}/av1/decoder/decoder.h")
 
   if(CONFIG_ANALYZER)
     set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
@@ -62,6 +72,9 @@
     set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
                                          "Bitstream inspector.")
   endif()
+
+  set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
+                          "${AOM_ROOT}/doc/dev_guide/av1_decoder.dox")
 endif()
 
 if(CONFIG_AV1_ENCODER)
@@ -95,6 +108,50 @@
 
   set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h"
                           "${AOM_ROOT}/usage_cx.dox")
+  set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
+                          "${AOM_ROOT}/doc/dev_guide/av1_encoder.dox")
+  set(AOM_DOXYGEN_SOURCES
+      ${AOM_DOXYGEN_SOURCES}
+      "${AOM_ROOT}/aom_scale/yv12config.h"
+      "${AOM_ROOT}/av1/encoder/bitstream.h"
+      "${AOM_ROOT}/av1/encoder/block.h"
+      "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+      "${AOM_ROOT}/av1/encoder/encode_strategy.c"
+      "${AOM_ROOT}/av1/encoder/encode_strategy.h"
+      "${AOM_ROOT}/av1/encoder/encodeframe.c"
+      "${AOM_ROOT}/av1/encoder/encoder.c"
+      "${AOM_ROOT}/av1/encoder/encoder.h"
+      "${AOM_ROOT}/av1/encoder/encodetxb.h"
+      "${AOM_ROOT}/av1/encoder/firstpass.h"
+      "${AOM_ROOT}/av1/encoder/gop_structure.h"
+      "${AOM_ROOT}/av1/encoder/interp_search.c"
+      "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
+      "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
+      "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h"
+      "${AOM_ROOT}/av1/encoder/lookahead.h"
+      "${AOM_ROOT}/av1/encoder/palette.h"
+      "${AOM_ROOT}/av1/encoder/palette.c"
+      "${AOM_ROOT}/av1/encoder/partition_search.h"
+      "${AOM_ROOT}/av1/encoder/partition_search.c"
+      "${AOM_ROOT}/av1/encoder/pass2_strategy.h"
+      "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
+      "${AOM_ROOT}/av1/encoder/pickcdef.h"
+      "${AOM_ROOT}/av1/encoder/picklpf.h"
+      "${AOM_ROOT}/av1/encoder/pickrst.h"
+      "${AOM_ROOT}/av1/encoder/ratectrl.c"
+      "${AOM_ROOT}/av1/encoder/ratectrl.h"
+      "${AOM_ROOT}/av1/encoder/rc_utils.h"
+      "${AOM_ROOT}/av1/encoder/rdopt.h"
+      "${AOM_ROOT}/av1/encoder/rdopt.c"
+      "${AOM_ROOT}/av1/encoder/speed_features.h"
+      "${AOM_ROOT}/av1/encoder/svc_layercontext.c"
+      "${AOM_ROOT}/av1/encoder/svc_layercontext.h"
+      "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+      "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+      "${AOM_ROOT}/av1/encoder/tpl_model.h"
+      "${AOM_ROOT}/av1/encoder/tx_search.h"
+      "${AOM_ROOT}/av1/encoder/var_based_part.h"
+      "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c")
 endif()
 
 if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
@@ -231,6 +288,16 @@
   get_filename_component(samples_dox ${samples_dox} NAME)
   set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} ${samples_dox})
 
+  # There are issues to show Markdown file for old Doxygen version. Here, only
+  # enable Markdown support for 1.8.16 or newer.
+  if(${DOXYGEN_VERSION_VALUE} GREATER_EQUAL 1008016)
+    set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_md_support")
+    set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/README.md")
+    # Uncomment and add AlgorithmDescription.md in result page when it is done.
+    # set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
+    # "${AOM_ROOT}/doc/AlgorithmDescription.md")
+  endif()
+
   # Generate libaom's doxyfile.
   file(WRITE "${AOM_DOXYFILE}" "##\n## GENERATED FILE. DO NOT EDIT\n##\n")
   file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data)
@@ -245,6 +312,24 @@
   write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS"
                                          "AOM_DOXYGEN_SECTIONS")
 
+  # Add AOMedia logo.
+  set(aom_logo "aomedia_logo_200.png")
+  configure_file(${AOM_ROOT}/${aom_logo} ${AOM_CONFIG_DIR}/${aom_logo} COPYONLY)
+  file(APPEND "${AOM_DOXYFILE}"
+       "PROJECT_LOGO = ${AOM_CONFIG_DIR}/${aom_logo}\n")
+
+  # Only set HAVE_DOT to YES if dot tool is found.
+  if(DOXYGEN_DOT_FOUND)
+    file(APPEND "${AOM_DOXYFILE}" "HAVE_DOT = YES\n")
+    file(APPEND "${AOM_DOXYFILE}" "DOT_GRAPH_MAX_NODES = 10000\n")
+  endif()
+
+  # Add image path.
+  file(APPEND "${AOM_DOXYFILE}" "IMAGE_PATH += ${AOM_ROOT}/doc/dev_guide\n")
+
+  # Allow banner style comments
+  file(APPEND "${AOM_DOXYFILE}" "JAVADOC_BANNER = YES")
+
   # Add the doxygen generation rule.
   add_custom_target(docs ALL
                     COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}"
diff --git a/examples/analyzer.cc b/examples/analyzer.cc
index 3598821..501f502 100644
--- a/examples/analyzer.cc
+++ b/examples/analyzer.cc
@@ -39,7 +39,6 @@
 
   AvxVideoReader *reader;
   const AvxVideoInfo *info;
-  const AvxInterface *decoder;
 
   insp_frame_data frame_data;
 
@@ -92,8 +91,8 @@
     fprintf(stderr, "Unknown input codec.");
     return false;
   }
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) {
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0)) {
     fprintf(stderr, "Failed to initialize decoder.");
     return false;
   }
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index 2f4f658..3aea2cf 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c
@@ -108,7 +108,7 @@
     }
 
     printf(
-        "Encode/decode mismatch on frame %d at"
+        "Encode/decode mismatch on frame %u at"
         " Y[%d, %d] {%d/%d},"
         " U[%d, %d] {%d/%d},"
         " V[%d, %d] {%d/%d}",
@@ -186,7 +186,6 @@
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *encoder = NULL;
   int flags = 0;
   int allocated_raw_shift = 0;
   aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420;
@@ -229,7 +228,7 @@
   outfile_arg = argv[5];
   update_frame_num_arg = argv[6];
 
-  encoder = get_aom_encoder_by_name(codec_arg);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
   update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0);
@@ -246,7 +245,7 @@
       die("Update frame number couldn't larger than limit\n");
   }
 
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(width_arg, NULL, 0);
   info.frame_height = (int)strtol(height_arg, NULL, 0);
   info.time_base.numerator = 1;
@@ -266,13 +265,13 @@
   // Allocate memory with the border so that it can be used as a reference.
   if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width,
                                  info.frame_height, 32, 8,
-                                 AOM_BORDER_IN_PIXELS)) {
+                                 AOM_DEC_BORDER_IN_PIXELS)) {
     die("Failed to allocate image.");
   }
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&ecodec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -293,17 +292,17 @@
   if (!(infile = fopen(infile_arg, "rb")))
     die("Failed to open %s for reading.", infile_arg);
 
-  if (aom_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, flags))
-    die_codec(&ecodec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&ecodec, encoder, &cfg, flags))
+    die("Failed to initialize encoder");
 
   // Disable alt_ref.
   if (aom_codec_control(&ecodec, AOME_SET_ENABLEAUTOALTREF, 0))
     die_codec(&ecodec, "Failed to set enable auto alt ref");
 
   if (test_decode) {
-    const AvxInterface *decoder = get_aom_decoder_by_name(codec_arg);
-    if (aom_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0))
-      die_codec(&dcodec, "Failed to initialize decoder.");
+    aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(codec_arg);
+    if (aom_codec_dec_init(&dcodec, decoder, NULL, 0))
+      die("Failed to initialize decoder.");
   }
 
   // Encode frames.
@@ -358,7 +357,7 @@
 
   printf("\n");
   fclose(infile);
-  printf("Processed %d frames.\n", frame_out);
+  printf("Processed %u frames.\n", frame_out);
 
   if (test_decode) {
     if (!mismatch_seen)
diff --git a/examples/av1_dec_fuzzer.cc b/examples/av1_dec_fuzzer.cc
index 1cddc8c..9b9a0b9 100644
--- a/examples/av1_dec_fuzzer.cc
+++ b/examples/av1_dec_fuzzer.cc
@@ -34,7 +34,7 @@
     return 0;
   }
 
-  const aom_codec_iface_t *codec_interface = aom_codec_av1_dx();
+  aom_codec_iface_t *codec_interface = aom_codec_av1_dx();
   aom_codec_ctx_t codec;
   // Set thread count in the range [1, 64].
   const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1;
diff --git a/examples/build_av1_dec_fuzzer.sh b/examples/build_av1_dec_fuzzer.sh
index 0dcb254..2ceb652 100755
--- a/examples/build_av1_dec_fuzzer.sh
+++ b/examples/build_av1_dec_fuzzer.sh
@@ -33,11 +33,11 @@
   echo "  git clone https://aomedia.googlesource.com/aom"
   exit 2
 fi
-if [[ -z "$CC" ]]; then
+if [[ -z "${CC:-}" ]]; then
   echo "Set the CC environment variable to point to your C compiler."
   exit 2
 fi
-if [[ -z "$CXX" ]]; then
+if [[ -z "${CXX:-}" ]]; then
   echo "Set the CXX environment variable to point to your C++ compiler."
   exit 2
 fi
@@ -47,7 +47,7 @@
 # Run CMake with address sanitizer enabled and build the codec.
 # Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows
 # in the transform functions. Also set memory limits.
-EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
+EXTRA_C_FLAGS='-UNDEBUG -DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
 cd "${BUILD_DIR}"
 cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
   -DCONFIG_SCALABILITY=0 -DFORCE_HIGHBITDEPTH_DECODING=0 \
@@ -61,7 +61,7 @@
 
 # Build the av1 fuzzer
 $CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \
-    -fsanitize=fuzzer,address -Wl,--start-group \
+    -g -fsanitize=fuzzer,address -Wl,--start-group \
     ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \
     ${BUILD_DIR}/libaom.a -Wl,--end-group
 
diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c
index bc127b7..07f788f 100644
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -77,10 +77,8 @@
 int main(int argc, char **argv) {
   int frame_cnt = 0;
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
   const AvxVideoInfo *info = NULL;
-  const AvxInterface *decoder = NULL;
 
   exec_name = argv[0];
 
@@ -94,13 +92,14 @@
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
 
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder");
 
   while (aom_video_reader_read_frame(reader)) {
     aom_codec_iter_t iter = NULL;
@@ -116,7 +115,7 @@
 
       get_image_md5(img, digest);
       print_md5(outfile, digest);
-      fprintf(outfile, "  img-%dx%d-%04d.i420\n", img->d_w, img->d_h,
+      fprintf(outfile, "  img-%ux%u-%04d.i420\n", img->d_w, img->d_h,
               ++frame_cnt);
     }
   }
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c
index 2144019..9bec6ee 100644
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -72,8 +72,6 @@
 int main(int argc, char **argv) {
   int frame_cnt = 0;
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
-  const AvxInterface *decoder = NULL;
   AvxVideoReader *reader = NULL;
   const AvxVideoInfo *info = NULL;
   int n = 0;
@@ -99,13 +97,13 @@
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
 
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
-
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   while (aom_video_reader_read_frame(reader)) {
     aom_codec_iter_t iter = NULL;
diff --git a/examples/inspect.c b/examples/inspect.c
index 526bdc1..0a2e962 100644
--- a/examples/inspect.c
+++ b/examples/inspect.c
@@ -267,7 +267,7 @@
   char offset;
 };
 struct parm_offset parm_offsets[] = {
-  { "blockSize", offsetof(insp_mi_data, sb_type) },
+  { "blockSize", offsetof(insp_mi_data, bsize) },
   { "transformSize", offsetof(insp_mi_data, tx_size) },
   { "transformType", offsetof(insp_mi_data, tx_type) },
   { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) },
@@ -627,7 +627,7 @@
   buf += put_str(buf, "{\n");
   if (layers & BLOCK_SIZE_LAYER) {
     buf += put_block_info(buf, block_size_map, "blockSize",
-                          offsetof(insp_mi_data, sb_type), 0);
+                          offsetof(insp_mi_data, bsize), 0);
   }
   if (layers & TRANSFORM_SIZE_LAYER) {
     buf += put_block_info(buf, tx_size_map, "transformSize",
@@ -755,12 +755,11 @@
   reader = aom_video_reader_open(file);
   if (!reader) die("Failed to open %s for reading.", file);
   info = aom_video_reader_get_info(reader);
-  const AvxInterface *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
-  fprintf(stderr, "Using %s\n",
-          aom_codec_iface_name(decoder->codec_interface()));
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  fprintf(stderr, "Using %s\n", aom_codec_iface_name(decoder));
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
   ifd_init(&frame_data, info->frame_width, info->frame_height);
   ifd_init_cb();
   return EXIT_SUCCESS;
@@ -793,6 +792,7 @@
     }
 
     frame = adr.buf;
+    frame_size = end_frame - frame;
     if (frame == end_frame) have_frame = 0;
   } while (adr.show_existing);
 
diff --git a/examples/lightfield_bitstream_parsing.c b/examples/lightfield_bitstream_parsing.c
index ffcbcb9..30e4fca 100644
--- a/examples/lightfield_bitstream_parsing.c
+++ b/examples/lightfield_bitstream_parsing.c
@@ -192,10 +192,8 @@
 }
 
 int main(int argc, char **argv) {
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
   int num_references;
   int i;
@@ -220,12 +218,13 @@
 
   tile_list_file = argv[4];
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   // Decode anchor frames.
   AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0);
diff --git a/examples/lightfield_decoder.c b/examples/lightfield_decoder.c
index a292e9c..61a54cb 100644
--- a/examples/lightfield_decoder.c
+++ b/examples/lightfield_decoder.c
@@ -156,9 +156,7 @@
 
 int main(int argc, char **argv) {
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
-  const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
   int num_references;
   aom_img_fmt_t ref_fmt = 0;
@@ -189,13 +187,15 @@
 
   info = aom_video_reader_get_info(reader);
 
+  aom_codec_iface_t *decoder;
   if (info->codec_fourcc == LST_FOURCC)
     decoder = get_aom_decoder_by_fourcc(AV1_FOURCC);
   else
     die("Unknown input codec.");
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
     die_codec(&codec, "Failed to initialize decoder.");
 
   if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB,
@@ -240,7 +240,7 @@
     while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
       char name[1024];
       snprintf(name, sizeof(name), "ref_%d.yuv", i);
-      printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
+      printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h);
       FILE *ref_file = fopen(name, "wb");
       aom_img_write(img, ref_file);
       fclose(ref_file);
diff --git a/examples/lightfield_encoder.c b/examples/lightfield_encoder.c
index e80fe24..d24aabd 100644
--- a/examples/lightfield_encoder.c
+++ b/examples/lightfield_encoder.c
@@ -128,7 +128,7 @@
 }
 
 static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
-                             const AvxInterface *encoder,
+                             aom_codec_iface_t *encoder,
                              const aom_codec_enc_cfg_t *cfg, int lf_width,
                              int lf_height, int lf_blocksize, int flags,
                              aom_image_t *raw_shift) {
@@ -140,8 +140,8 @@
   aom_fixed_buf_t stats = { NULL, 0 };
   aom_image_t *frame_to_encode;
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, cfg, flags))
+    die("Failed to initialize encoder");
   if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
     die_codec(&codec, "Failed to turn off auto altref");
   if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
@@ -231,10 +231,10 @@
 }
 
 static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
-                  const AvxInterface *encoder, aom_codec_enc_cfg_t *cfg,
+                  aom_codec_iface_t *encoder, aom_codec_enc_cfg_t *cfg,
                   int lf_width, int lf_height, int lf_blocksize, int flags,
                   aom_image_t *raw_shift) {
-  AvxVideoInfo info = { encoder->fourcc,
+  AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder),
                         cfg->g_w,
                         cfg->g_h,
                         { cfg->g_timebase.num, cfg->g_timebase.den },
@@ -253,15 +253,15 @@
   writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
   if (!writer) die("Failed to open %s for writing", outfile_name);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, cfg, flags))
+    die("Failed to initialize encoder");
   if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
     die_codec(&codec, "Failed to turn off auto altref");
   if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
     die_codec(&codec, "Failed to set frame parallel decoding");
   if (aom_codec_control(&codec, AV1E_ENABLE_EXT_TILE_DEBUG, 1))
     die_codec(&codec, "Failed to enable encoder ext_tile debug");
-  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 1))
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 3))
     die_codec(&codec, "Failed to set cpu-used");
 
   // Note: The superblock is a sequence parameter and has to be the same for 1
@@ -438,7 +438,6 @@
   aom_fixed_buf_t stats;
   int flags = 0;
 
-  const AvxInterface *encoder = NULL;
   const int fps = 30;
   const int bitrate = 200;  // kbit/s
   const char *const width_arg = argv[1];
@@ -452,7 +451,7 @@
 
   if (argc < 8) die("Invalid number of arguments");
 
-  encoder = get_aom_encoder_by_name("av1");
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1");
   if (!encoder) die("Unsupported codec.");
 
   w = (int)strtol(width_arg, NULL, 0);
@@ -478,10 +477,10 @@
                   32);
   }
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
   // Configuration
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = w;
diff --git a/examples/lightfield_tile_list_decoder.c b/examples/lightfield_tile_list_decoder.c
index 3b928df..6811972 100644
--- a/examples/lightfield_tile_list_decoder.c
+++ b/examples/lightfield_tile_list_decoder.c
@@ -98,9 +98,7 @@
 
 int main(int argc, char **argv) {
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
-  const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
   int num_references;
   int num_tile_lists;
@@ -129,16 +127,17 @@
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB,
                                     info->is_annexb)) {
-    die("Failed to set annex b status");
+    die_codec(&codec, "Failed to set annex b status");
   }
 
   // Decode anchor frames.
@@ -179,7 +178,7 @@
     while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
       char name[1024];
       snprintf(name, sizeof(name), "ref_%d.yuv", i);
-      printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
+      printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h);
       FILE *ref_file = fopen(name, "wb");
       aom_img_write(img, ref_file);
       fclose(ref_file);
diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c
index e0253d2..1971b9c 100644
--- a/examples/lossless_encoder.c
+++ b/examples/lossless_encoder.c
@@ -57,14 +57,12 @@
 
 int main(int argc, char **argv) {
   FILE *infile = NULL;
-  aom_codec_ctx_t codec;
   aom_codec_enc_cfg_t cfg;
   int frame_count = 0;
   aom_image_t raw;
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *encoder = NULL;
   const int fps = 30;
 
   exec_name = argv[0];
@@ -75,10 +73,10 @@
 
   if (argc < 5) die("Invalid number of arguments");
 
-  encoder = get_aom_encoder_by_name("av1");
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1");
   if (!encoder) die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(argv[1], NULL, 0);
   info.frame_height = (int)strtol(argv[2], NULL, 0);
   info.time_base.numerator = 1;
@@ -94,9 +92,10 @@
     die("Failed to allocate image.");
   }
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  aom_codec_ctx_t codec;
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -110,8 +109,8 @@
   if (!(infile = fopen(argv[3], "rb")))
     die("Failed to open %s for reading.", argv[3]);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
 
   if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1E_SET_LOSSLESS, 1))
     die_codec(&codec, "Failed to use lossless mode");
diff --git a/examples/scalable_decoder.c b/examples/scalable_decoder.c
index c229242..00fe820 100644
--- a/examples/scalable_decoder.c
+++ b/examples/scalable_decoder.c
@@ -93,8 +93,6 @@
   int frame_cnt = 0;
   FILE *outfile[MAX_LAYERS];
   char filename[80];
-  aom_codec_ctx_t codec;
-  const AvxInterface *decoder = NULL;
   FILE *inputfile = NULL;
   uint8_t *buf = NULL;
   size_t bytes_in_buffer = 0;
@@ -114,11 +112,12 @@
   obu_ctx.avx_ctx->file = inputfile;
   obu_ctx.avx_ctx->filename = argv[1];
 
-  decoder = get_aom_decoder_by_index(0);
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  aom_codec_iface_t *decoder = get_aom_decoder_by_index(0);
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) {
     die_codec(&codec, "Failed to set output_all_layers control.");
@@ -128,7 +127,7 @@
   const size_t ret = fread(tmpbuf, 1, 32, inputfile);
   if (ret != 32) die_codec(&codec, "Input is not a valid obu file");
   si.is_annexb = 0;
-  if (aom_codec_peek_stream_info(decoder->codec_interface(), tmpbuf, 32, &si)) {
+  if (aom_codec_peek_stream_info(decoder, tmpbuf, 32, &si)) {
     die_codec(&codec, "Input is not a valid obu file");
   }
   fseek(inputfile, -32, SEEK_CUR);
@@ -143,7 +142,7 @@
 
   // open any enhancement layer output yuv files
   for (i = 1; i < si.number_spatial_layers; i++) {
-    snprintf(filename, sizeof(filename), "out_lyr%d.yuv", i);
+    snprintf(filename, sizeof(filename), "out_lyr%u.yuv", i);
     if (!(outfile[i] = fopen(filename, "wb")))
       die("Failed to open output for writing.");
   }
diff --git a/examples/scalable_encoder.c b/examples/scalable_encoder.c
index 7af03e2..5bfd184 100644
--- a/examples/scalable_encoder.c
+++ b/examples/scalable_encoder.c
@@ -120,13 +120,11 @@
 int main(int argc, char **argv) {
   FILE *infile0 = NULL;
   FILE *infile1 = NULL;
-  aom_codec_ctx_t codec;
   aom_codec_enc_cfg_t cfg;
   int frame_count = 0;
   aom_image_t raw0, raw1;
   aom_codec_err_t res;
   AvxVideoInfo info;
-  const AvxInterface *encoder = NULL;
   const int fps = 30;
   const int bitrate = 200;
   int keyframe_interval = 0;
@@ -157,10 +155,10 @@
   outfile_arg = argv[6];
   max_frames = (int)strtol(argv[7], NULL, 0);
 
-  encoder = get_aom_encoder_by_name(codec_arg);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(width_arg, NULL, 0);
   info.frame_height = (int)strtol(height_arg, NULL, 0);
   info.time_base.numerator = 1;
@@ -184,9 +182,10 @@
   keyframe_interval = 100;
   if (keyframe_interval < 0) die("Invalid keyframe interval value.");
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  aom_codec_ctx_t codec;
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -207,8 +206,8 @@
   if (!(infile1 = fopen(infile1_arg, "rb")))
     die("Failed to open %s for reading.", infile0_arg);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
   if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8))
     die_codec(&codec, "Failed to set cpu to 8");
 
diff --git a/examples/set_maps.c b/examples/set_maps.c
index 9aeb96e..69b4bcc 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -121,12 +121,11 @@
   aom_codec_ctx_t codec;
   aom_codec_enc_cfg_t cfg;
   int frame_count = 0;
-  const int limit = 15;
+  const int limit = 10;
   aom_image_t raw;
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *encoder = NULL;
   const int fps = 2;  // TODO(dkovalev) add command line argument
   const double bits_per_pixel_per_frame = 0.067;
 
@@ -135,12 +134,12 @@
 
   memset(&info, 0, sizeof(info));
 
-  encoder = get_aom_encoder_by_name(argv[1]);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(argv[1]);
   if (encoder == NULL) {
     die("Unsupported codec.");
   }
   assert(encoder != NULL);
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(argv[2], NULL, 0);
   info.frame_height = (int)strtol(argv[3], NULL, 0);
   info.time_base.numerator = 1;
@@ -156,9 +155,9 @@
     die("Failed to allocate image.");
   }
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -175,8 +174,11 @@
   if (!(infile = fopen(argv[4], "rb")))
     die("Failed to open %s for reading.", argv[4]);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2))
+    die_codec(&codec, "Failed to set cpu-used");
 
   // Encode frames.
   while (aom_img_read(&raw, infile) && frame_count < limit) {
@@ -184,7 +186,7 @@
 
     if (frame_count == 5) {
       set_active_map(&cfg, &codec);
-    } else if (frame_count == 11) {
+    } else if (frame_count == 9) {
       unset_active_map(&cfg, &codec);
     }
 
diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c
index d098d1e..b6891dc 100644
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c
@@ -92,9 +92,7 @@
 int main(int argc, char **argv) {
   int frame_cnt = 0;
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
-  const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
 
   exec_name = argv[0];
@@ -109,13 +107,14 @@
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
 
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   while (aom_video_reader_read_frame(reader)) {
     aom_codec_iter_t iter = NULL;
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 01a37cf..682fe98 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -100,6 +100,7 @@
 #include <string.h>
 
 #include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
 #include "common/tools_common.h"
 #include "common/video_writer.h"
 
@@ -151,7 +152,6 @@
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *encoder = NULL;
   const int fps = 30;
   const int bitrate = 200;
   int keyframe_interval = 0;
@@ -180,10 +180,10 @@
   keyframe_interval_arg = argv[6];
   max_frames = (int)strtol(argv[8], NULL, 0);
 
-  encoder = get_aom_encoder_by_name(codec_arg);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(width_arg, NULL, 0);
   info.frame_height = (int)strtol(height_arg, NULL, 0);
   info.time_base.numerator = 1;
@@ -202,9 +202,9 @@
   keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
   if (keyframe_interval < 0) die("Invalid keyframe interval value.");
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -220,8 +220,11 @@
   if (!(infile = fopen(infile_arg, "rb")))
     die("Failed to open %s for reading.", infile_arg);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2))
+    die_codec(&codec, "Failed to set cpu-used");
 
   // Encode frames.
   while (aom_img_read(&raw, infile)) {
diff --git a/examples/svc_encoder_rtc.c b/examples/svc_encoder_rtc.c
index 1316c6c..0ae235a 100644
--- a/examples/svc_encoder_rtc.c
+++ b/examples/svc_encoder_rtc.c
@@ -20,67 +20,105 @@
 #include "aom/aom_encoder.h"
 #include "aom/aomcx.h"
 #include "av1/common/enums.h"
+#include "av1/encoder/encoder.h"
+#include "common/args.h"
 #include "common/tools_common.h"
 #include "common/video_writer.h"
 #include "aom_ports/aom_timer.h"
 
+#define OPTION_BUFFER_SIZE 1024
+
+typedef struct {
+  const char *output_filename;
+  char options[OPTION_BUFFER_SIZE];
+  struct AvxInputContext input_ctx;
+  int speed;
+  int aq_mode;
+  int layering_mode;
+} AppInput;
+
+typedef enum {
+  QUANTIZER = 0,
+  BITRATE,
+  SCALE_FACTOR,
+  AUTO_ALT_REF,
+  ALL_OPTION_TYPES
+} LAYER_OPTION_TYPE;
+
+static const arg_def_t outputfile =
+    ARG_DEF("o", "output", 1, "Output filename");
+static const arg_def_t frames_arg =
+    ARG_DEF("f", "frames", 1, "Number of frames to encode");
+static const arg_def_t threads_arg =
+    ARG_DEF("th", "threads", 1, "Number of threads to use");
+static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "Source width");
+static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "Source height");
+static const arg_def_t timebase_arg =
+    ARG_DEF("t", "timebase", 1, "Timebase (num/den)");
+static const arg_def_t bitrate_arg = ARG_DEF(
+    "b", "target-bitrate", 1, "Encoding bitrate, in kilobits per second");
+static const arg_def_t spatial_layers_arg =
+    ARG_DEF("sl", "spatial-layers", 1, "Number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+    ARG_DEF("tl", "temporal-layers", 1, "Number of temporal SVC layers");
+static const arg_def_t layering_mode_arg =
+    ARG_DEF("lm", "layering-mode", 1, "Temporal layering scheme.");
+static const arg_def_t kf_dist_arg =
+    ARG_DEF("k", "kf-dist", 1, "Number of frames between keyframes");
+static const arg_def_t scale_factors_arg =
+    ARG_DEF("r", "scale-factors", 1, "Scale factors (lowest to highest layer)");
+static const arg_def_t min_q_arg =
+    ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
+static const arg_def_t max_q_arg =
+    ARG_DEF(NULL, "max-q", 1, "Maximum quantizer");
+static const arg_def_t speed_arg =
+    ARG_DEF("sp", "speed", 1, "Speed configuration");
+static const arg_def_t aqmode_arg =
+    ARG_DEF("aq", "aqmode", 1, "AQ mode off/on");
+static const arg_def_t bitrates_arg =
+    ARG_DEF("bl", "bitrates", 1,
+            "Bitrates[spatial_layer * num_temporal_layer + temporal_layer]");
+static const arg_def_t dropframe_thresh_arg =
+    ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t error_resilient_arg =
+    ARG_DEF(NULL, "error-resilient", 1, "Error resilient flag");
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+  { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 }
+};
+
+static const arg_def_t bitdepth_arg = ARG_DEF_ENUM(
+    "d", "bit-depth", 1, "Bit depth for codec 8, 10 or 12. ", bitdepth_enum);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static const arg_def_t *svc_args[] = {
+  &frames_arg,          &outputfile,   &width_arg,
+  &height_arg,          &timebase_arg, &bitrate_arg,
+  &spatial_layers_arg,  &kf_dist_arg,  &scale_factors_arg,
+  &min_q_arg,           &max_q_arg,    &temporal_layers_arg,
+  &layering_mode_arg,   &threads_arg,  &aqmode_arg,
+#if CONFIG_AV1_HIGHBITDEPTH
+  &bitdepth_arg,
+#endif
+  &speed_arg,           &bitrates_arg, &dropframe_thresh_arg,
+  &error_resilient_arg, NULL
+};
+
 #define zero(Dest) memset(&(Dest), 0, sizeof(Dest));
 
 static const char *exec_name;
 
-void usage_exit(void) { exit(EXIT_FAILURE); }
-
-static int mode_to_num_temporal_layers[10] = { 1, 2, 3, 3, 2, 1, 1, 3, 3, 3 };
-static int mode_to_num_spatial_layers[10] = { 1, 1, 1, 1, 1, 2, 3, 3, 3, 3 };
-static int mode_to_num_layers[10] = { 1, 2, 3, 3, 2, 2, 3, 9, 9, 9 };
-
-// For rate control encoding stats.
-struct RateControlMetrics {
-  // Number of input frames per layer.
-  int layer_input_frames[AOM_MAX_TS_LAYERS];
-  // Number of encoded non-key frames per layer.
-  int layer_enc_frames[AOM_MAX_TS_LAYERS];
-  // Framerate per layer layer (cumulative).
-  double layer_framerate[AOM_MAX_TS_LAYERS];
-  // Target average frame size per layer (per-frame-bandwidth per layer).
-  double layer_pfb[AOM_MAX_LAYERS];
-  // Actual average frame size per layer.
-  double layer_avg_frame_size[AOM_MAX_LAYERS];
-  // Average rate mismatch per layer (|target - actual| / target).
-  double layer_avg_rate_mismatch[AOM_MAX_LAYERS];
-  // Actual encoding bitrate per layer (cumulative across temporal layers).
-  double layer_encoding_bitrate[AOM_MAX_LAYERS];
-  // Average of the short-time encoder actual bitrate.
-  // TODO(marpan): Should we add these short-time stats for each layer?
-  double avg_st_encoding_bitrate;
-  // Variance of the short-time encoder actual bitrate.
-  double variance_st_encoding_bitrate;
-  // Window (number of frames) for computing short-timee encoding bitrate.
-  int window_size;
-  // Number of window measurements.
-  int window_count;
-  int layer_target_bitrate[AOM_MAX_LAYERS];
-};
-
-static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) {
-  FILE *f = input_ctx->file;
-  y4m_input *y4m = &input_ctx->y4m;
-  int shortread = 0;
-
-  if (input_ctx->file_type == FILE_TYPE_Y4M) {
-    if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
-  } else {
-    shortread = read_yuv_frame(input_ctx, img);
-  }
-
-  return !shortread;
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <options> input_filename -o output_filename\n",
+          exec_name);
+  fprintf(stderr, "Options:\n");
+  arg_show_usage(stderr, svc_args);
+  exit(EXIT_FAILURE);
 }
 
 static int file_is_y4m(const char detect[4]) {
-  if (memcmp(detect, "YUV4", 4) == 0) {
-    return 1;
-  }
-  return 0;
+  return memcmp(detect, "YUV4", 4) == 0;
 }
 
 static int fourcc_is_ivf(const char detect[4]) {
@@ -90,10 +128,10 @@
   return 0;
 }
 
-static void close_input_file(struct AvxInputContext *input) {
-  fclose(input->file);
-  if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
-}
+static const int option_max_values[ALL_OPTION_TYPES] = { 63, INT_MAX, INT_MAX,
+                                                         1 };
+
+static const int option_min_values[ALL_OPTION_TYPES] = { 0, 0, 1, 0 };
 
 static void open_input_file(struct AvxInputContext *input,
                             aom_chroma_sample_position_t csp) {
@@ -143,6 +181,268 @@
   }
 }
 
+static aom_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input,
+                                      int *value0, int *value1) {
+  if (type == SCALE_FACTOR) {
+    *value0 = (int)strtol(input, &input, 10);
+    if (*input++ != '/') return AOM_CODEC_INVALID_PARAM;
+    *value1 = (int)strtol(input, &input, 10);
+
+    if (*value0 < option_min_values[SCALE_FACTOR] ||
+        *value1 < option_min_values[SCALE_FACTOR] ||
+        *value0 > option_max_values[SCALE_FACTOR] ||
+        *value1 > option_max_values[SCALE_FACTOR] ||
+        *value0 > *value1)  // num shouldn't be greater than den
+      return AOM_CODEC_INVALID_PARAM;
+  } else {
+    *value0 = atoi(input);
+    if (*value0 < option_min_values[type] || *value0 > option_max_values[type])
+      return AOM_CODEC_INVALID_PARAM;
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_layer_options_from_string(
+    aom_svc_params_t *svc_params, LAYER_OPTION_TYPE type, const char *input,
+    int *option0, int *option1) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+  char *input_string;
+  char *token;
+  const char *delim = ",";
+  int num_layers = svc_params->number_spatial_layers;
+  int i = 0;
+
+  if (type == BITRATE)
+    num_layers =
+        svc_params->number_spatial_layers * svc_params->number_temporal_layers;
+
+  if (input == NULL || option0 == NULL ||
+      (option1 == NULL && type == SCALE_FACTOR))
+    return AOM_CODEC_INVALID_PARAM;
+
+  input_string = malloc(strlen(input));
+  memcpy(input_string, input, strlen(input));
+  if (input_string == NULL) return AOM_CODEC_MEM_ERROR;
+  token = strtok(input_string, delim);  // NOLINT
+  for (i = 0; i < num_layers; ++i) {
+    if (token != NULL) {
+      res = extract_option(type, token, option0 + i, option1 + i);
+      if (res != AOM_CODEC_OK) break;
+      token = strtok(NULL, delim);  // NOLINT
+    } else {
+      break;
+    }
+  }
+  if (res == AOM_CODEC_OK && i != num_layers) {
+    res = AOM_CODEC_INVALID_PARAM;
+  }
+  free(input_string);
+  return res;
+}
+
+static void parse_command_line(int argc, const char **argv_,
+                               AppInput *app_input,
+                               aom_svc_params_t *svc_params,
+                               aom_codec_enc_cfg_t *enc_cfg) {
+  struct arg arg;
+  char **argv = NULL;
+  char **argi = NULL;
+  char **argj = NULL;
+  char string_options[1024] = { 0 };
+
+  // Default settings
+  svc_params->number_spatial_layers = 1;
+  svc_params->number_temporal_layers = 1;
+  app_input->layering_mode = 0;
+  enc_cfg->g_threads = 1;
+  enc_cfg->rc_end_usage = AOM_CBR;
+
+  // process command line options
+  argv = argv_dup(argc - 1, argv_ + 1);
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &outputfile, argi)) {
+      app_input->output_filename = arg.val;
+    } else if (arg_match(&arg, &width_arg, argi)) {
+      enc_cfg->g_w = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &height_arg, argi)) {
+      enc_cfg->g_h = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &timebase_arg, argi)) {
+      enc_cfg->g_timebase = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &bitrate_arg, argi)) {
+      enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &spatial_layers_arg, argi)) {
+      svc_params->number_spatial_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
+      svc_params->number_temporal_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &speed_arg, argi)) {
+      app_input->speed = arg_parse_uint(&arg);
+      if (app_input->speed > 9) {
+        warn("Mapping speed %d to speed 9.\n", app_input->speed);
+      }
+    } else if (arg_match(&arg, &aqmode_arg, argi)) {
+      app_input->aq_mode = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &threads_arg, argi)) {
+      enc_cfg->g_threads = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &layering_mode_arg, argi)) {
+      app_input->layering_mode = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &kf_dist_arg, argi)) {
+      enc_cfg->kf_min_dist = arg_parse_uint(&arg);
+      enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
+    } else if (arg_match(&arg, &scale_factors_arg, argi)) {
+      parse_layer_options_from_string(svc_params, SCALE_FACTOR, arg.val,
+                                      svc_params->scaling_factor_num,
+                                      svc_params->scaling_factor_den);
+    } else if (arg_match(&arg, &min_q_arg, argi)) {
+      enc_cfg->rc_min_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &max_q_arg, argi)) {
+      enc_cfg->rc_max_quantizer = arg_parse_uint(&arg);
+#if CONFIG_AV1_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdepth_arg, argi)) {
+      enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg);
+      switch (enc_cfg->g_bit_depth) {
+        case AOM_BITS_8:
+          enc_cfg->g_input_bit_depth = 8;
+          enc_cfg->g_profile = 0;
+          break;
+        case AOM_BITS_10:
+          enc_cfg->g_input_bit_depth = 10;
+          enc_cfg->g_profile = 2;
+          break;
+        case AOM_BITS_12:
+          enc_cfg->g_input_bit_depth = 12;
+          enc_cfg->g_profile = 2;
+          break;
+        default:
+          die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
+          break;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) {
+      enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &error_resilient_arg, argi)) {
+      enc_cfg->g_error_resilient = arg_parse_uint(&arg);
+      if (enc_cfg->g_error_resilient != 0 && enc_cfg->g_error_resilient != 1)
+        die("Invalid value for error resilient (0, 1): %d.",
+            enc_cfg->g_error_resilient);
+    } else {
+      ++argj;
+    }
+  }
+
+  // Total bitrate needs to be parsed after the number of layers.
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+    if (arg_match(&arg, &bitrates_arg, argi)) {
+      parse_layer_options_from_string(svc_params, BITRATE, arg.val,
+                                      svc_params->layer_target_bitrate, NULL);
+    } else {
+      ++argj;
+    }
+  }
+
+  // There will be a space in front of the string options
+  if (strlen(string_options) > 0)
+    strncpy(app_input->options, string_options, OPTION_BUFFER_SIZE);
+
+  // Check for unrecognized options
+  for (argi = argv; *argi; ++argi)
+    if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+      die("Error: Unrecognized option %s\n", *argi);
+
+  if (argv[0] == NULL) {
+    usage_exit();
+  }
+
+  app_input->input_ctx.filename = argv[0];
+  free(argv);
+
+  open_input_file(&app_input->input_ctx, 0);
+  if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) {
+    enc_cfg->g_w = app_input->input_ctx.width;
+    enc_cfg->g_h = app_input->input_ctx.height;
+  }
+
+  if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
+      enc_cfg->g_h % 2)
+    die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h);
+
+  printf(
+      "Codec %s\n"
+      "layers: %d\n"
+      "width %u, height: %u\n"
+      "num: %d, den: %d, bitrate: %u\n"
+      "gop size: %u\n",
+      aom_codec_iface_name(aom_codec_av1_cx()),
+      svc_params->number_spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
+      enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
+      enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
+}
+
+static unsigned int mode_to_num_temporal_layers[10] = { 1, 2, 3, 3, 2,
+                                                        1, 1, 3, 3, 3 };
+static unsigned int mode_to_num_spatial_layers[10] = { 1, 1, 1, 1, 1,
+                                                       2, 3, 3, 3, 3 };
+
+// For rate control encoding stats.
+struct RateControlMetrics {
+  // Number of input frames per layer.
+  int layer_input_frames[AOM_MAX_TS_LAYERS];
+  // Number of encoded non-key frames per layer.
+  int layer_enc_frames[AOM_MAX_TS_LAYERS];
+  // Framerate per layer layer (cumulative).
+  double layer_framerate[AOM_MAX_TS_LAYERS];
+  // Target average frame size per layer (per-frame-bandwidth per layer).
+  double layer_pfb[AOM_MAX_LAYERS];
+  // Actual average frame size per layer.
+  double layer_avg_frame_size[AOM_MAX_LAYERS];
+  // Average rate mismatch per layer (|target - actual| / target).
+  double layer_avg_rate_mismatch[AOM_MAX_LAYERS];
+  // Actual encoding bitrate per layer (cumulative across temporal layers).
+  double layer_encoding_bitrate[AOM_MAX_LAYERS];
+  // Average of the short-time encoder actual bitrate.
+  // TODO(marpan): Should we add these short-time stats for each layer?
+  double avg_st_encoding_bitrate;
+  // Variance of the short-time encoder actual bitrate.
+  double variance_st_encoding_bitrate;
+  // Window (number of frames) for computing short-timee encoding bitrate.
+  int window_size;
+  // Number of window measurements.
+  int window_count;
+  int layer_target_bitrate[AOM_MAX_LAYERS];
+};
+
+// Reference frames used in this example encoder.
+enum {
+  SVC_LAST_FRAME = 0,
+  SVC_LAST2_FRAME,
+  SVC_LAST3_FRAME,
+  SVC_GOLDEN_FRAME,
+  SVC_BWDREF_FRAME,
+  SVC_ALTREF2_FRAME,
+  SVC_ALTREF_FRAME
+};
+
+static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) {
+  FILE *f = input_ctx->file;
+  y4m_input *y4m = &input_ctx->y4m;
+  int shortread = 0;
+
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
+    if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
+  } else {
+    shortread = read_yuv_frame(input_ctx, img);
+  }
+
+  return !shortread;
+}
+
+static void close_input_file(struct AvxInputContext *input) {
+  fclose(input->file);
+  if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
+}
+
 // Note: these rate control metrics assume only 1 key frame in the
 // sequence (i.e., first frame only). So for temporal pattern# 7
 // (which has key frame for every frame on base layer), the metrics
@@ -200,7 +500,7 @@
   int tot_num_frames = 0;
   double perc_fluctuation = 0.0;
   printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
-  printf("Rate control layer stats for %d layer(s):\n\n", ts_number_layers);
+  printf("Rate control layer stats for %u layer(s):\n\n", ts_number_layers);
   for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
     tot_num_frames = 0;
     for (unsigned int tl = 0; tl < ts_number_layers; ++tl) {
@@ -216,7 +516,7 @@
           rc->layer_avg_frame_size[i] / rc->layer_enc_frames[tl];
       rc->layer_avg_rate_mismatch[i] =
           100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[tl];
-      printf("For layer#: %d %d \n", sl, tl);
+      printf("For layer#: %u %u \n", sl, tl);
       printf("Bitrate (target vs actual): %d %f\n", rc->layer_target_bitrate[i],
              rc->layer_encoding_bitrate[i]);
       printf("Average frame size (target vs actual): %f %f\n", rc->layer_pfb[i],
@@ -245,38 +545,38 @@
 }
 
 // Layer pattern configuration.
-static int set_layer_pattern(int layering_mode, int superframe_cnt,
-                             aom_svc_layer_id_t *layer_id,
-                             aom_svc_ref_frame_config_t *ref_frame_config,
-                             int *use_svc_control, int spatial_layer_id,
-                             int is_key_frame, int ksvc_mode) {
+static void set_layer_pattern(int layering_mode, int superframe_cnt,
+                              aom_svc_layer_id_t *layer_id,
+                              aom_svc_ref_frame_config_t *ref_frame_config,
+                              int *use_svc_control, int spatial_layer_id,
+                              int is_key_frame, int ksvc_mode) {
   int i;
+  int enable_longterm_temporal_ref = 1;
   int shift = (layering_mode == 7) ? 2 : 0;
   *use_svc_control = 1;
   layer_id->spatial_layer_id = spatial_layer_id;
+  int lag_index = 0;
+  int base_count = superframe_cnt >> 2;
   // Set the referende map buffer idx for the 7 references:
   // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
   // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
   for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i;
   for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0;
   for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
-  // Note for this layered patterns only use LAST and GF for prediction in
-  // non-rd mode (speed >= 7).
-  int layer_flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
-                    AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
-                    AOM_EFLAG_NO_REF_ARF2;
+
   if (ksvc_mode) {
     // Same pattern as case 8.
     layering_mode = 8;
     if (!is_key_frame)
       // No inter-layer prediction on inter-frames.
-      layer_flags |= AOM_EFLAG_NO_REF_GF;
+      ref_frame_config->reference[SVC_LAST_FRAME] = 1;
   }
   switch (layering_mode) {
     case 0:
-      // 1-layer: update LAST on every frame, reference LAST and GF.
+      // 1-layer: update LAST on every frame, reference LAST.
       layer_id->temporal_layer_id = 0;
       ref_frame_config->refresh[0] = 1;
+      ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       break;
     case 1:
       // 2-temporal layer.
@@ -284,12 +584,13 @@
       //  0    2    4
       if (superframe_cnt % 2 == 0) {
         layer_id->temporal_layer_id = 0;
-        // Update LAST on layer 0, reference LAST and GF.
+        // Update LAST on layer 0, reference LAST.
         ref_frame_config->refresh[0] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else {
         layer_id->temporal_layer_id = 1;
         // No updates on layer 1, only reference LAST (TL0).
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       }
       break;
     case 2:
@@ -300,28 +601,74 @@
       if (superframe_cnt % 4 == 0) {
         // Base layer.
         layer_id->temporal_layer_id = 0;
-        // Update LAST on layer 0, reference LAST and GF.
+        // Update LAST on layer 0, reference LAST.
         ref_frame_config->refresh[0] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 1) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // First top layer: no updates, only reference LAST (TL0).
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 2) % 4 == 0) {
         layer_id->temporal_layer_id = 1;
         // Middle layer (TL1): update LAST2, only reference LAST (TL0).
         ref_frame_config->refresh[1] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 3) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // Second top layer: no updates, only reference LAST.
         // Set buffer idx for LAST to slot 1, since that was the slot
         // updated in previous frame. So LAST is TL1 frame.
-        ref_frame_config->ref_idx[0] = 1;
-        ref_frame_config->ref_idx[1] = 0;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+        ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       }
       break;
     case 3:
+      // 3 TL, same as above, except allow for predicting
+      // off 2 more references (GOLDEN and ALTREF), with
+      // GOLDEN updated periodically, and ALTREF lagging from
+      // LAST from ~4 frames. Both GOLDEN and ALTREF
+      // can only be updated on base temporal layer.
+
+      // Keep golden fixed at slot 3.
+      ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+      // Cyclically refresh slots 4, 5, 6, 7, for lag altref.
+      lag_index = 4 + (base_count % 4);
+      // Set the altref slot to lag_index.
+      ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index;
+      if (superframe_cnt % 4 == 0) {
+        // Base layer.
+        layer_id->temporal_layer_id = 0;
+        // Update LAST on layer 0, reference LAST.
+        ref_frame_config->refresh[0] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+        // Refresh GOLDEN every x ~10 base layer frames.
+        if (base_count % 10 == 0) ref_frame_config->refresh[3] = 1;
+        // Refresh lag_index slot, needed for lagging altref.
+        ref_frame_config->refresh[lag_index] = 1;
+      } else if ((superframe_cnt - 1) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // First top layer: no updates, only reference LAST (TL0).
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+      } else if ((superframe_cnt - 2) % 4 == 0) {
+        layer_id->temporal_layer_id = 1;
+        // Middle layer (TL1): update LAST2, only reference LAST (TL0).
+        ref_frame_config->refresh[1] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+      } else if ((superframe_cnt - 3) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // Second top layer: no updates, only reference LAST.
+        // Set buffer idx for LAST to slot 1, since that was the slot
+        // updated in previous frame. So LAST is TL1 frame.
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+        ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+      }
+      // Every frame can reference GOLDEN AND ALTREF.
+      ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+      ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+      break;
+    case 4:
       // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will
       // only reference GF (not LAST). Other frames only reference LAST.
       //   1    3   5    7
@@ -332,37 +679,20 @@
         layer_id->temporal_layer_id = 0;
         // Update LAST on layer 0, only reference LAST.
         ref_frame_config->refresh[0] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 1) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // First top layer: no updates, only reference LAST (TL0).
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 2) % 4 == 0) {
         layer_id->temporal_layer_id = 1;
         // Middle layer (TL1): update GF, only reference LAST (TL0).
         ref_frame_config->refresh[3] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 3) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // Second top layer: no updates, only reference GF.
-        layer_flags |= AOM_EFLAG_NO_REF_LAST;
-      }
-      break;
-    case 4:
-      // 2-temporal layer with the old update flags, not with the new
-      // SVC control.
-      *use_svc_control = 0;
-      //    1    3    5
-      //  0    2    4
-      if (superframe_cnt % 2 == 0) {
-        layer_id->temporal_layer_id = 0;
-        // Update LAST on layer 0, reference LAST and GF.
-        layer_flags |= AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
-      } else {
-        layer_id->temporal_layer_id = 1;
-        // No updates on layer 1, only reference LAST (TL0).
-        layer_flags |= AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
-                       AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
       }
       break;
     case 5:
@@ -371,13 +701,15 @@
       if (layer_id->spatial_layer_id == 0) {
         // Reference LAST, update LAST.
         ref_frame_config->refresh[0] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if (layer_id->spatial_layer_id == 1) {
         // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
         // and GOLDEN to slot 0. Update slot 1 (LAST).
-        ref_frame_config->ref_idx[0] = 1;
-        ref_frame_config->ref_idx[3] = 0;
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+        ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 0;
         ref_frame_config->refresh[1] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
       }
       break;
     case 6:
@@ -392,23 +724,35 @@
         for (i = 0; i < INTER_REFS_PER_FRAME; i++)
           ref_frame_config->ref_idx[i] = 0;
         ref_frame_config->refresh[0] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if (layer_id->spatial_layer_id == 1) {
         // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
         // and GOLDEN (and all other refs) to slot 0.
         // Update slot 1 (LAST).
         for (i = 0; i < INTER_REFS_PER_FRAME; i++)
           ref_frame_config->ref_idx[i] = 0;
-        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
         ref_frame_config->refresh[1] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
       } else if (layer_id->spatial_layer_id == 2) {
         // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2
         // and GOLDEN (and all other refs) to slot 1.
         // Update slot 2 (LAST).
         for (i = 0; i < INTER_REFS_PER_FRAME; i++)
           ref_frame_config->ref_idx[i] = 1;
-        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
         ref_frame_config->refresh[2] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+        // For 3 spatial layer case: allow for top spatial layer to use
+        // additional temporal reference. Update every 10 frames.
+        if (enable_longterm_temporal_ref) {
+          ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1;
+          ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+          if (base_count % 10 == 0)
+            ref_frame_config->refresh[REF_FRAMES - 1] = 1;
+        }
       }
       break;
     case 7:
@@ -423,9 +767,8 @@
       // No overlap in buffer updates between TL2 and TL1.
       // TL2 updates slot 3 and 4, TL1 updates 5, 6, 7.
       // Set the references via the svc_ref_frame_config control.
-      layer_flags = 0;
       // Always reference LAST.
-      ref_frame_config->reference[0] = 1;
+      ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       if (superframe_cnt % 4 == 0) {
         // Base temporal layer.
         layer_id->temporal_layer_id = 0;
@@ -441,7 +784,7 @@
           // Update slot 1 (LAST).
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 1;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
           ref_frame_config->refresh[1] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
@@ -449,7 +792,7 @@
           // Update slot 2 (LAST).
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 1;
-          ref_frame_config->ref_idx[0] = 2;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
           ref_frame_config->refresh[2] = 1;
         }
       } else if ((superframe_cnt - 1) % 4 == 0) {
@@ -461,7 +804,7 @@
           // Set all other buffer_idx to slot 0.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
           ref_frame_config->refresh[3] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
@@ -469,8 +812,8 @@
           // Set LAST2 to slot 4 and Update slot 4.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 3;
-          ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->ref_idx[1] = 4;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+          ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4;
           ref_frame_config->refresh[4] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
@@ -478,7 +821,7 @@
           // No update.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 4;
-          ref_frame_config->ref_idx[0] = 2;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
         }
       } else if ((superframe_cnt - 2) % 4 == 0) {
         // Middle temporal enhancement layer.
@@ -489,25 +832,25 @@
           // Set GOLDEN to slot 5 and update slot 5.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[3] = 5 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift;
           ref_frame_config->refresh[5 - shift] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
           // GOLDEN (and all other refs) to slot 5.
-          // Set LAST2 to slot 6 and update slot 6.
+          // Set LAST3 to slot 6 and update slot 6.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 5 - shift;
-          ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->ref_idx[2] = 6 - shift;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+          ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift;
           ref_frame_config->refresh[6 - shift] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
           // GOLDEN (and all other refs) to slot 6.
-          // Set LAST2 to slot 6 and update slot 7.
+          // Set LAST3 to slot 7 and update slot 7.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 6 - shift;
-          ref_frame_config->ref_idx[0] = 2;
-          ref_frame_config->ref_idx[2] = 7 - shift;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+          ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 7 - shift;
           ref_frame_config->refresh[7 - shift] = 1;
         }
       } else if ((superframe_cnt - 3) % 4 == 0) {
@@ -519,69 +862,73 @@
           // Set all other buffer_idx to 0.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 5 - shift;
-          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
           ref_frame_config->refresh[3] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
           // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 6 - shift;
-          ref_frame_config->ref_idx[3] = 3;
-          ref_frame_config->ref_idx[1] = 4;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+          ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4;
           ref_frame_config->refresh[4] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
           // GOLDEN to slot 4. No update.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 7 - shift;
-          ref_frame_config->ref_idx[3] = 4;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 7 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 4;
         }
       }
       if (layer_id->spatial_layer_id > 0)
-        ref_frame_config->reference[3] = 1;  // Reference GOLDEN.
+        // Reference GOLDEN.
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+      // For 3 spatial layer case 7 (where there is free buffer slot):
+      // allow for top spatial layer to use additional temporal reference.
+      // Additional reference is only updated on base temporal layer, every
+      // 10 TL0 frames here.
+      if (enable_longterm_temporal_ref && layer_id->spatial_layer_id == 2 &&
+          layering_mode == 7) {
+        ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1;
+        ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+        if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
+          ref_frame_config->refresh[REF_FRAMES - 1] = 1;
+      }
       break;
     default: assert(0); die("Error: Unsupported temporal layering mode!\n");
   }
-  return layer_flags;
 }
 
-int main(int argc, char **argv) {
+int main(int argc, const char **argv) {
+  AppInput app_input;
   AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL };
-  aom_codec_ctx_t codec;
+  AvxVideoWriter *total_layer_file = NULL;
   aom_codec_enc_cfg_t cfg;
   int frame_cnt = 0;
   aom_image_t raw;
-  aom_codec_err_t res;
-  unsigned int width;
-  unsigned int height;
-  uint32_t error_resilient = 0;
-  int speed;
   int frame_avail;
   int got_data = 0;
   int flags = 0;
   unsigned i;
   int pts = 0;             // PTS starts at 0.
   int frame_duration = 1;  // 1 timebase tick per frame.
-  int layering_mode = 0;
   aom_svc_layer_id_t layer_id;
   aom_svc_params_t svc_params;
   aom_svc_ref_frame_config_t ref_frame_config;
-  const AvxInterface *encoder = NULL;
-  struct AvxInputContext input_ctx;
+
   struct RateControlMetrics rc;
   int64_t cx_time = 0;
-  const int min_args_base = 13;
-  const int min_args = min_args_base;
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate = 30.0;
   int use_svc_control = 1;
+  int set_err_resil_frame = 0;
   zero(rc.layer_target_bitrate);
   memset(&layer_id, 0, sizeof(aom_svc_layer_id_t));
-  memset(&input_ctx, 0, sizeof(input_ctx));
+  memset(&app_input, 0, sizeof(AppInput));
   memset(&svc_params, 0, sizeof(svc_params));
 
   // Flag to test dynamic scaling of source frames for single
@@ -589,81 +936,70 @@
   const int test_dynamic_scaling_single_layer = 0;
 
   /* Setup default input stream settings */
-  input_ctx.framerate.numerator = 30;
-  input_ctx.framerate.denominator = 1;
-  input_ctx.only_i420 = 1;
-  input_ctx.bit_depth = 0;
-  unsigned int ts_number_layers = 1;
-  unsigned int ss_number_layers = 1;
+  app_input.input_ctx.framerate.numerator = 30;
+  app_input.input_ctx.framerate.denominator = 1;
+  app_input.input_ctx.only_i420 = 1;
+  app_input.input_ctx.bit_depth = 0;
   exec_name = argv[0];
-  // Check usage and arguments.
-  if (argc < min_args) {
-    die("Usage: %s <infile> <outfile> <codec_type(av1)> <width> <height> "
-        "<rate_num> <rate_den> <speed> <frame_drop_threshold> "
-        "<error_resilient> <threads> <mode> "
-        "<Rate_0> ... <Rate_nlayers-1>\n",
-        argv[0]);
+
+  // start with default encoder configuration
+  aom_codec_err_t res =
+      aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 0);
+  if (res) {
+    die("Failed to get config: %s\n", aom_codec_err_to_string(res));
   }
 
-  encoder = get_aom_encoder_by_name(argv[3]);
+  // Real time parameters.
+  cfg.g_usage = AOM_USAGE_REALTIME;
 
-  width = (unsigned int)strtoul(argv[4], NULL, 0);
-  height = (unsigned int)strtoul(argv[5], NULL, 0);
-  if (width < 16 || width % 2 || height < 16 || height % 2) {
-    die("Invalid resolution: %d x %d", width, height);
+  cfg.rc_end_usage = AOM_CBR;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 52;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
+  cfg.rc_buf_initial_sz = 600;
+  cfg.rc_buf_optimal_sz = 600;
+  cfg.rc_buf_sz = 1000;
+  cfg.rc_resize_mode = 0;  // Set to RESIZE_DYNAMIC for dynamic resize.
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = AOM_KF_AUTO;
+
+  parse_command_line(argc, argv, &app_input, &svc_params, &cfg);
+
+  unsigned int ts_number_layers = svc_params.number_temporal_layers;
+  unsigned int ss_number_layers = svc_params.number_spatial_layers;
+
+  unsigned int width = cfg.g_w;
+  unsigned int height = cfg.g_h;
+
+  if (ts_number_layers !=
+          mode_to_num_temporal_layers[app_input.layering_mode] ||
+      ss_number_layers != mode_to_num_spatial_layers[app_input.layering_mode]) {
+    die("Number of layers doesn't match layering mode.");
   }
 
-  layering_mode = (int)strtol(argv[12], NULL, 0);
-  if (layering_mode < 0 || layering_mode > 13) {
-    die("Invalid layering mode (0..12) %s", argv[12]);
-  }
-
-  if (argc != min_args + mode_to_num_layers[layering_mode]) {
-    die("Invalid number of arguments");
-  }
-
-  ts_number_layers = mode_to_num_temporal_layers[layering_mode];
-  ss_number_layers = mode_to_num_spatial_layers[layering_mode];
-
-  input_ctx.filename = argv[1];
-  open_input_file(&input_ctx, 0);
-
   // Y4M reader has its own allocation.
-  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+  if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) {
     if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, width, height, 32)) {
       die("Failed to allocate image", width, height);
     }
   }
 
-  // Populate encoder configuration.
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
-  if (res) {
-    printf("Failed to get config: %s\n", aom_codec_err_to_string(res));
-    return EXIT_FAILURE;
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1");
+
+  memcpy(&rc.layer_target_bitrate[0], &svc_params.layer_target_bitrate[0],
+         sizeof(svc_params.layer_target_bitrate));
+
+  unsigned int total_rate = 0;
+  for (i = 0; i < ss_number_layers; i++) {
+    total_rate +=
+        svc_params
+            .layer_target_bitrate[i * ts_number_layers + ts_number_layers - 1];
   }
-
-  // Update the default configuration with our settings.
-  cfg.g_w = width;
-  cfg.g_h = height;
-
-  // Timebase format e.g. 30fps: numerator=1, demoninator = 30.
-  cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0);
-  cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0);
-
-  speed = (int)strtol(argv[8], NULL, 0);
-  if (speed < 0 || speed > 8) {
-    die("Invalid speed setting: must be positive");
+  if (total_rate != cfg.rc_target_bitrate) {
+    die("Incorrect total target bitrate");
   }
 
-  for (i = min_args_base;
-       (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
-    rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0);
-    svc_params.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
-  }
-
-  cfg.rc_target_bitrate =
-      svc_params.layer_target_bitrate[ss_number_layers * ts_number_layers - 1];
-
   svc_params.framerate_factor[0] = 1;
   if (ts_number_layers == 2) {
     svc_params.framerate_factor[0] = 2;
@@ -674,78 +1010,59 @@
     svc_params.framerate_factor[2] = 1;
   }
 
-  // Real time parameters.
-  cfg.g_usage = AOM_USAGE_REALTIME;
-
-  cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0);
-  cfg.rc_end_usage = AOM_CBR;
-  cfg.rc_min_quantizer = 2;
-  cfg.rc_max_quantizer = 52;
-  cfg.rc_undershoot_pct = 50;
-  cfg.rc_overshoot_pct = 50;
-  cfg.rc_buf_initial_sz = 600;
-  cfg.rc_buf_optimal_sz = 600;
-  cfg.rc_buf_sz = 1000;
-
-  // Use 1 thread as default.
-  cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0);
-
-  error_resilient = (uint32_t)strtoul(argv[10], NULL, 0);
-  if (error_resilient != 0 && error_resilient != 1) {
-    die("Invalid value for error resilient (0, 1): %d.", error_resilient);
-  }
-  // Enable error resilient mode.
-  cfg.g_error_resilient = error_resilient;
-  cfg.g_lag_in_frames = 0;
-  cfg.kf_mode = AOM_KF_AUTO;
-
-  // Disable automatic keyframe placement.
-  cfg.kf_min_dist = cfg.kf_max_dist = 3000;
-
   framerate = cfg.g_timebase.den / cfg.g_timebase.num;
   set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers);
 
-  if (input_ctx.file_type == FILE_TYPE_Y4M) {
-    if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) {
+  if (app_input.input_ctx.file_type == FILE_TYPE_Y4M) {
+    if (app_input.input_ctx.width != cfg.g_w ||
+        app_input.input_ctx.height != cfg.g_h) {
       die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h);
     }
-    if (input_ctx.framerate.numerator != cfg.g_timebase.den ||
-        input_ctx.framerate.denominator != cfg.g_timebase.num) {
+    if (app_input.input_ctx.framerate.numerator != cfg.g_timebase.den ||
+        app_input.input_ctx.framerate.denominator != cfg.g_timebase.num) {
       die("Incorrect framerate: numerator %d denominator %d",
           cfg.g_timebase.num, cfg.g_timebase.den);
     }
   }
 
+  AvxVideoInfo info;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
+  info.frame_width = cfg.g_w;
+  info.frame_height = cfg.g_h;
+  info.time_base.numerator = cfg.g_timebase.num;
+  info.time_base.denominator = cfg.g_timebase.den;
   // Open an output file for each stream.
   for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
     for (unsigned tl = 0; tl < ts_number_layers; ++tl) {
       i = sl * ts_number_layers + tl;
       char file_name[PATH_MAX];
-      AvxVideoInfo info;
-      info.codec_fourcc = encoder->fourcc;
-      info.frame_width = cfg.g_w;
-      info.frame_height = cfg.g_h;
-      info.time_base.numerator = cfg.g_timebase.num;
-      info.time_base.denominator = cfg.g_timebase.den;
 
-      snprintf(file_name, sizeof(file_name), "%s_%d.av1", argv[2], i);
+      snprintf(file_name, sizeof(file_name), "%s_%u.av1",
+               app_input.output_filename, i);
       outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info);
       if (!outfile[i]) die("Failed to open %s for writing", file_name);
-      assert(outfile[i] != NULL);
     }
   }
+  total_layer_file =
+      aom_video_writer_open(app_input.output_filename, kContainerIVF, &info);
+  if (!total_layer_file)
+    die("Failed to open %s for writing", app_input.output_filename);
 
   // Initialize codec.
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  aom_codec_ctx_t codec;
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
 
-  aom_codec_control(&codec, AOME_SET_CPUUSED, speed);
-  aom_codec_control(&codec, AV1E_SET_AQ_MODE, 3);
+  aom_codec_control(&codec, AOME_SET_CPUUSED, app_input.speed);
+  aom_codec_control(&codec, AV1E_SET_AQ_MODE, app_input.aq_mode);
   aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0);
   aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
   aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0);
   aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0);
   aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0);
+  aom_codec_control(&codec, AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+  aom_codec_control(&codec, AV1E_SET_MODE_COST_UPD_FREQ, 2);
+  aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3);
 
   svc_params.number_spatial_layers = ss_number_layers;
   svc_params.number_temporal_layers = ts_number_layers;
@@ -781,7 +1098,7 @@
   frame_avail = 1;
   while (frame_avail || got_data) {
     struct aom_usec_timer timer;
-    frame_avail = read_frame(&input_ctx, &raw);
+    frame_avail = read_frame(&(app_input.input_ctx), &raw);
     int is_key_frame = (frame_cnt % cfg.kf_max_dist) == 0;
     // Loop over spatial layers.
     for (unsigned int slx = 0; slx < ss_number_layers; slx++) {
@@ -791,13 +1108,21 @@
 
       // Set the reference/update flags, layer_id, and reference_map
       // buffer index.
-      flags = set_layer_pattern(layering_mode, frame_cnt, &layer_id,
-                                &ref_frame_config, &use_svc_control, slx,
-                                is_key_frame, (layering_mode == 9));
+      set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id,
+                        &ref_frame_config, &use_svc_control, slx, is_key_frame,
+                        (app_input.layering_mode == 9));
       aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
       if (use_svc_control)
         aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
                           &ref_frame_config);
+      if (set_err_resil_frame) {
+        // Set error_resilient per frame: off/0 for base layer and
+        // on/1 for enhancement layer frames.
+        int err_resil_mode =
+            (layer_id.spatial_layer_id > 0 || layer_id.temporal_layer_id > 0);
+        aom_codec_control(&codec, AV1E_SET_ERROR_RESILIENT_MODE,
+                          err_resil_mode);
+      }
 
       layer = slx * ts_number_layers + layer_id.temporal_layer_id;
       if (frame_avail && slx == 0) ++rc.layer_input_frames[layer];
@@ -835,18 +1160,21 @@
                                              pkt->data.frame.sz, pts);
                 if (sl == (unsigned int)layer_id.spatial_layer_id)
                   rc.layer_encoding_bitrate[j] += 8.0 * pkt->data.frame.sz;
-                // Keep count of rate control stats per layer (for non-key).
-                if (tl == (unsigned int)layer_id.temporal_layer_id &&
-                    sl == (unsigned int)layer_id.spatial_layer_id &&
-                    !(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
-                  rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz;
-                  rc.layer_avg_rate_mismatch[j] +=
-                      fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) /
-                      rc.layer_pfb[j];
-                  if (slx == 0) ++rc.layer_enc_frames[tl];
-                }
               }
             }
+            // Write everything into the top layer.
+            aom_video_writer_write_frame(total_layer_file, pkt->data.frame.buf,
+                                         pkt->data.frame.sz, pts);
+            // Keep count of rate control stats per layer (for non-key).
+            if (!(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
+              unsigned int j = layer_id.spatial_layer_id * ts_number_layers +
+                               layer_id.temporal_layer_id;
+              rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz;
+              rc.layer_avg_rate_mismatch[j] +=
+                  fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) /
+                  rc.layer_pfb[j];
+              if (slx == 0) ++rc.layer_enc_frames[layer_id.temporal_layer_id];
+            }
 
             // Update for short-time encoding bitrate states, for moving window
             // of size rc->window, shifted by rc->window / 2.
@@ -886,7 +1214,7 @@
     ++frame_cnt;
     pts += frame_duration;
   }
-  close_input_file(&input_ctx);
+  close_input_file(&(app_input.input_ctx));
   printout_rate_control_summary(&rc, frame_cnt, ss_number_layers,
                                 ts_number_layers);
   printf("\n");
@@ -899,8 +1227,9 @@
   // Try to rewrite the output file headers with the actual frame count.
   for (i = 0; i < ss_number_layers * ts_number_layers; ++i)
     aom_video_writer_close(outfile[i]);
+  aom_video_writer_close(total_layer_file);
 
-  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+  if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) {
     aom_img_free(&raw);
   }
   return EXIT_SUCCESS;
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index a03bc6c..075eeae 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -52,6 +52,7 @@
 #include <string.h>
 
 #include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
 #include "common/tools_common.h"
 #include "common/video_writer.h"
 
@@ -117,14 +118,14 @@
 }
 
 static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
-                             const AvxInterface *encoder,
+                             aom_codec_iface_t *encoder,
                              const aom_codec_enc_cfg_t *cfg, int limit) {
   aom_codec_ctx_t codec;
   int frame_count = 0;
   aom_fixed_buf_t stats = { NULL, 0 };
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, cfg, 0))
+    die("Failed to initialize encoder");
 
   // Calculate frame statistics.
   while (aom_img_read(raw, infile) && frame_count < limit) {
@@ -143,9 +144,9 @@
 }
 
 static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
-                  const AvxInterface *encoder, const aom_codec_enc_cfg_t *cfg,
+                  aom_codec_iface_t *encoder, const aom_codec_enc_cfg_t *cfg,
                   int limit) {
-  AvxVideoInfo info = { encoder->fourcc,
+  AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder),
                         cfg->g_w,
                         cfg->g_h,
                         { cfg->g_timebase.num, cfg->g_timebase.den },
@@ -157,8 +158,11 @@
   writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
   if (!writer) die("Failed to open %s for writing", outfile_name);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2))
+    die_codec(&codec, "Failed to set cpu-used");
 
   // Encode frames.
   while (aom_img_read(raw, infile) && frame_count < limit) {
@@ -188,7 +192,6 @@
   aom_codec_err_t res;
   aom_fixed_buf_t stats;
 
-  const AvxInterface *encoder = NULL;
   const int fps = 30;       // TODO(dkovalev) add command line argument
   const int bitrate = 200;  // kbit/s TODO(dkovalev) add command line argument
   const char *const codec_arg = argv[1];
@@ -205,7 +208,7 @@
 
   if (limit == 0) limit = 100;
 
-  encoder = get_aom_encoder_by_name(codec_arg);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
   w = (int)strtol(width_arg, NULL, 0);
@@ -217,10 +220,10 @@
   if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1))
     die("Failed to allocate image", w, h);
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
   // Configuration
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = w;
diff --git a/libs.doxy_template b/libs.doxy_template
index c522e21..dab0ba0 100644
--- a/libs.doxy_template
+++ b/libs.doxy_template
@@ -1,4 +1,4 @@
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+## Copyright (c) 2020, Alliance for Open Media. All rights reserved
 ##
 ## This source code is subject to the terms of the BSD 2 Clause License and
 ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -8,92 +8,140 @@
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
 
-# Doxyfile 1.5.4
+# Doxyfile 1.8.16
 
 # This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project
+# doxygen (www.doxygen.org) for a project.
 #
-# All text after a hash (#) is considered a comment and will be ignored
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
 # The format is:
-#       TAG = value [value, ...]
-# For lists items can also be appended using:
-#       TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (" ")
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
 
 #---------------------------------------------------------------------------
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file that
-# follow. The default is UTF-8 which is also the encoding used for all text before
-# the first occurrence of this tag. Doxygen uses libiconv (or the iconv built into
-# libc) for the transcoding. See http://www.gnu.org/software/libiconv for the list of
-# possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
 
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
-# by quotes) that should identify the project.
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
 
-PROJECT_NAME           = "AOMedia Codec SDK"
+PROJECT_NAME           = "AOMedia AV1 Codec"
 
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
-# base path where the generated documentation will be put.
-# If a relative path is entered, it will be relative to the location
-# where doxygen was started. If left blank the current directory will be used.
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
 
 OUTPUT_DIRECTORY       = docs
 
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
-# 4096 sub-directories (in 2 levels) under the output directory of each output
-# format and will distribute the generated files over these directories.
-# Enabling this option can be useful when feeding doxygen a huge amount of
-# source files, where putting all generated files in the same directory would
-# otherwise cause performance problems for the file system.
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
 
 CREATE_SUBDIRS         = NO
 
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
 # The OUTPUT_LANGUAGE tag is used to specify the language in which all
 # documentation generated by doxygen is written. Doxygen will use this
 # information to generate all constant output in the proper language.
-# The default language is English, other supported languages are:
-# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
-# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian,
-# Italian, Japanese, Japanese-en (Japanese with English messages), Korean,
-# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian,
-# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
 
 OUTPUT_LANGUAGE        = English
 
-# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
-# include brief member descriptions after the members that are listed in
-# the file and class documentation (similar to java_doc).
-# Set to NO to disable this.
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
 
 BRIEF_MEMBER_DESC      = YES
 
-# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
-# the brief description of a member or function before the detailed description.
-# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
 # brief descriptions will be completely suppressed.
+# The default value is: YES.
 
 REPEAT_BRIEF           = YES
 
-# This tag implements a quasi-intelligent brief description abbreviator
-# that is used to form the text in various listings. Each string
-# in this list, if found as the leading text of the brief description, will be
-# stripped from the text and the result after processing the whole list, is
-# used as the annotated text. Otherwise, the brief description is used as-is.
-# If left blank, the following values are used ("$name" is automatically
-# replaced with the name of the entity): "The $name class" "The $name widget"
-# "The $name file" "is" "provides" "specifies" "contains"
-# "represents" "a" "an" "the"
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
 
 ABBREVIATE_BRIEF       =
 
 # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# Doxygen will generate a detailed section even if there is only a brief
+# doxygen will generate a detailed section even if there is only a brief
 # description.
+# The default value is: NO.
 
 ALWAYS_DETAILED_SEC    = NO
 
@@ -101,873 +149,1907 @@
 # inherited members of a class in the documentation of that class as if those
 # members were ordinary class members. Constructors, destructors and assignment
 # operators of the base classes will not be shown.
+# The default value is: NO.
 
 INLINE_INHERITED_MEMB  = NO
 
-# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
-# path before files name in the file list and in the header files. If set
-# to NO the shortest path that makes the file name unique will be used.
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
 
 FULL_PATH_NAMES        = YES
 
-# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
-# can be used to strip a user-defined part of the path. Stripping is
-# only done if one of the specified strings matches the left-hand part of
-# the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the
-# path to strip.
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
 
 STRIP_FROM_PATH        =
 
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
-# the path mentioned in the documentation of a class, which tells
-# the reader which header file to include in order to use a class.
-# If left blank only the name of the header file containing the class
-# definition is used. Otherwise one should specify the include paths that
-# are normally passed to the compiler using the -I flag.
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
 
 STRIP_FROM_INC_PATH    =
 
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
-# (but less readable) file names. This can be useful is your file systems
-# doesn't support long names like on DOS, Mac, or CD-ROM.
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
 
 SHORT_NAMES            = NO
 
-# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
-# will interpret the first line (until the first dot) of a java_doc-style
-# comment as the brief description. If set to NO, the java_doc
-# comments will behave just like regular Qt-style comments
-# (thus requiring an explicit @brief command for a brief description.)
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
 
 JAVADOC_AUTOBRIEF      = NO
 
-# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
-# interpret the first line (until the first dot) of a Qt-style
-# comment as the brief description. If set to NO, the comments
-# will behave just like regular Qt-style comments (thus requiring
-# an explicit \brief command for a brief description.)
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
 
 QT_AUTOBRIEF           = NO
 
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
-# treat a multi-line C++ special comment block (i.e. a block of //! or ///
-# comments) as a brief description. This used to be the default behaviour.
-# The new default is to treat a multi-line C++ comment block as a detailed
-# description. Set this tag to YES if you prefer the old behaviour instead.
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
 
 MULTILINE_CPP_IS_BRIEF = NO
 
-# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
-# member inherits the documentation from any documented member that it
-# re-implements.
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
 
 INHERIT_DOCS           = YES
 
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
-# a new page for each member. If set to NO, the documentation of a member will
-# be part of the file/class/namespace that contains it.
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
 
 SEPARATE_MEMBER_PAGES  = NO
 
-# The TAB_SIZE tag can be used to set the number of spaces in a tab.
-# Doxygen uses this value to replace tabs by spaces in code fragments.
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
 
 TAB_SIZE               = 4
 
-# This tag can be used to specify a number of aliases that acts
-# as commands in the documentation. An alias has the form "name=value".
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to
-# put the command \sideeffect (or @sideeffect) in the documentation, which
-# will result in a user-defined paragraph with heading "Side Effects:".
-# You can put \n's in the value part of an alias to insert newlines.
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
 
 ALIASES                =
 
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
-# sources only. Doxygen will then generate output that is more tailored for C.
-# For instance, some of the names that are used will be different. The list
-# of all members will be omitted, etc.
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
 
 OPTIMIZE_OUTPUT_FOR_C  = YES
 
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
-# sources only. Doxygen will then generate output that is more tailored for Java.
-# For instance, namespaces will be presented as packages, qualified scopes
-# will look different, etc.
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
 
 OPTIMIZE_OUTPUT_JAVA   = NO
 
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to
-# include (a tag file for) the STL sources as input, then you should
-# set this tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
-# func(std::string) {}). This also make the inheritance and collaboration
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is
+# Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
 # diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
 
 BUILTIN_STL_SUPPORT    = NO
 
 # If you use Microsoft's C++/CLI language, you should set this option to YES to
 # enable parsing support.
+# The default value is: NO.
 
 CPP_CLI_SUPPORT        = NO
 
-# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
-# Doxygen will parse them like normal C++ but will assume all classes use public
-# instead of private inheritance when no explicit protection keyword is present.
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
 
 SIP_SUPPORT            = NO
 
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
 # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
+# tag is set to YES then doxygen will reuse the documentation of the first
 # member in the group (if any) for the other members of the group. By default
 # all members of a group must be documented explicitly.
+# The default value is: NO.
 
 DISTRIBUTE_GROUP_DOC   = NO
 
-# Set the SUBGROUPING tag to YES (the defqault) to allow class member groups of
-# the same type (for instance a group of public functions) to be put as a
-# subgroup of that type (e.g. under the Public Functions section). Set it to
-# NO to prevent subgrouping. Alternatively, this can be done per class using
-# the \nosubgrouping command.
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
 
 SUBGROUPING            = YES
 
-# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct (or union) is
-# documented as struct with the name of the typedef. So
-# typedef struct type_s {} type_t, will appear in the documentation as a struct
-# with name type_t. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named type_s. This can typically
-# be useful for C code where the coding convention is that all structs are
-# typedef'ed and only the typedef is referenced never the struct's name.
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
 
 TYPEDEF_HIDES_STRUCT   = NO
 
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
 
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
-# documentation are documented, even if no documentation was available.
-# Private class members and static file members will be hidden unless
-# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
 
 EXTRACT_ALL            = NO
 
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
-# will be included in the documentation.
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
 
 EXTRACT_PRIVATE        = NO
 
-# If the EXTRACT_STATIC tag is set to YES all static members of a file
-# will be included in the documentation.
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
 
-EXTRACT_STATIC         = NO
+EXTRACT_PRIV_VIRTUAL   = NO
 
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
-# defined locally in source files will be included in the documentation.
-# If set to NO only classes defined in header files are included.
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
 
 EXTRACT_LOCAL_CLASSES  = YES
 
-# This flag is only useful for Objective-C code. When set to YES local
-# methods, which are defined in the implementation section but not in
-# the interface are included in the documentation.
-# If set to NO (the default) only methods in the interface are included.
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
 
 EXTRACT_LOCAL_METHODS  = NO
 
-# If this flag is set to YES, the members of anonymous namespaces will be extracted
-# and appear in the documentation as a namespace called 'anonymous_namespace{file}',
-# where file will be replaced with the base name of the file that contains the anonymous
-# namespace. By default anonymous namespace are hidden.
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
 
 EXTRACT_ANON_NSPACES   = NO
 
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
-# undocumented members of documented classes, files or namespaces.
-# If set to NO (the default) these members will be included in the
-# various overviews, but no documentation section is generated.
-# This option has no effect if EXTRACT_ALL is enabled.
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
 
 HIDE_UNDOC_MEMBERS     = NO
 
-# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy.
-# If set to NO (the default) these classes will be included in the various
-# overviews. This option has no effect if EXTRACT_ALL is enabled.
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
 
 HIDE_UNDOC_CLASSES     = NO
 
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
-# friend (class|struct|union) declarations.
-# If set to NO (the default) these declarations will be included in the
-# documentation.
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
 
-# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
-# documentation blocks found inside the body of a function.
-# If set to NO (the default) these blocks will be appended to the
-# function's detailed documentation block.
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
 
 HIDE_IN_BODY_DOCS      = NO
 
-# The INTERNAL_DOCS tag determines if documentation
-# that is typed after a \internal command is included. If the tag is set
-# to NO (the default) then the documentation will be excluded.
-# Set it to YES to include the internal documentation.
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
-# file names in lower-case letters. If set to YES upper-case letters are also
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
 # allowed. This is useful if you have classes or files whose names only differ
 # in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# (including Cygwin) ands Mac users are advised to set this option to NO.
+# The default value is: system dependent.
 
 CASE_SENSE_NAMES       = YES
 
-# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
-# will show members with their full class and namespace scopes in the
-# documentation. If set to YES the scope will be hidden.
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
 
 HIDE_SCOPE_NAMES       = NO
 
-# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
-# will put a list of the files that are included by a file in the documentation
-# of that file.
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
 
 SHOW_INCLUDE_FILES     = YES
 
-# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
-# is inserted in the documentation for inline members.
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
 
 INLINE_INFO            = YES
 
-# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
-# will sort the (detailed) documentation of file and class members
-# alphabetically by member name. If set to NO the members will appear in
-# declaration order.
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
 
 SORT_MEMBER_DOCS       = NO
 
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
-# brief documentation of file, namespace and class members alphabetically
-# by member name. If set to NO (the default) the members will appear in
-# declaration order.
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
 
 SORT_BRIEF_DOCS        = NO
 
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
-# sorted by fully-qualified names, including namespaces. If set to
-# NO (the default), the class list will be sorted only by class name,
-# not including the namespace part.
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
 # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the
-# alphabetical list.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
 
 SORT_BY_SCOPE_NAME     = NO
 
-# The GENERATE_TODOLIST tag can be used to enable (YES) or
-# disable (NO) the todo list. This list is created by putting \todo
-# commands in the documentation.
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
 
 GENERATE_TODOLIST      = YES
 
-# The GENERATE_TESTLIST tag can be used to enable (YES) or
-# disable (NO) the test list. This list is created by putting \test
-# commands in the documentation.
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
 
 GENERATE_TESTLIST      = YES
 
-# The GENERATE_BUGLIST tag can be used to enable (YES) or
-# disable (NO) the bug list. This list is created by putting \bug
-# commands in the documentation.
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
 
 GENERATE_BUGLIST       = YES
 
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
-# disable (NO) the deprecated list. This list is created by putting
-# \deprecated commands in the documentation.
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
 
 GENERATE_DEPRECATEDLIST= YES
 
-# The ENABLED_SECTIONS tag can be used to enable conditional
-# documentation sections, marked by \if sectionname ... \endif.
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
 
 ENABLED_SECTIONS       =
 
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
-# the initial value of a variable or define consists of for it to appear in
-# the documentation. If the initializer consists of more lines than specified
-# here it will be hidden. Use a value of 0 to hide initializers completely.
-# The appearance of the initializer of individual variables and defines in the
-# documentation can be controlled using \showinitializer or \hideinitializer
-# command in the documentation regardless of this setting.
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
 
 MAX_INITIALIZER_LINES  = 30
 
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
-# at the bottom of the documentation of classes and structs. If set to YES the
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
 # list will mention the files that were used to generate the documentation.
+# The default value is: YES.
 
 SHOW_USED_FILES        = YES
 
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
 # The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from the
-# version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output
-# is used as the file version. See the manual for examples.
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
 
 FILE_VERSION_FILTER    =
 
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
 #---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
+# Configuration options related to warning and progress messages
 #---------------------------------------------------------------------------
 
-# The QUIET tag can be used to turn on/off the messages that are generated
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
 
 QUIET                  = YES
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated by doxygen. Possible values are YES and NO. If left blank
-# NO is used.
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
 
 WARNINGS               = YES
 
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
-# automatically be disabled.
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
 
 WARN_IF_UNDOCUMENTED   = YES
 
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some
-# parameters in a documented function, or documenting parameters that
-# don't exist or using markup commands wrongly.
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
 
 WARN_IF_DOC_ERROR      = YES
 
-# This WARN_NO_PARAMDOC option can be abled to get warnings for
-# functions that are documented, but have no documentation for their parameters
-# or return value. If set to NO (the default) doxygen will only warn about
-# wrong or incomplete parameter documentation, but not about the absence of
-# documentation.
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
+# The default value is: NO.
 
 WARN_NO_PARAMDOC       = NO
 
-# The WARN_FORMAT tag determines the format of the warning messages that
-# doxygen can produce. The string should contain the $file, $line, and $text
-# tags, which will be replaced by the file and line number from which the
-# warning originated and the warning text. Optionally the format may contain
-# $version, which will be replaced by the version of the file (if it could
-# be obtained via FILE_VERSION_FILTER)
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
 
 WARN_FORMAT            = "$file:$line: $text"
 
-# The WARN_LOGFILE tag can be used to specify a file to which warning
-# and error messages should be written. If left blank the output is written
-# to stderr.
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
 
 WARN_LOGFILE           =
 
 #---------------------------------------------------------------------------
-# configuration options related to the input files
+# Configuration options related to the input files
 #---------------------------------------------------------------------------
 
-# The INPUT tag can be used to specify the files and/or directories that contain
-# documented source files. You may enter file names like "myfile.cpp" or
-# directories like "/usr/src/myproject". Separate the files or directories
-# with spaces.
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
 
-INPUT =
+INPUT                  =
 
-# This tag can be used to specify the character encoding of the source files that
-# doxygen parses. Internally doxygen uses the UTF-8 encoding, which is also the default
-# input encoding. Doxygen uses libiconv (or the iconv built into libc) for the transcoding.
-# See http://www.gnu.org/software/libiconv for the list of possible encodings.
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
+# possible encodings.
+# The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
 
 # If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank the following patterns are tested:
-# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
-# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          =
 
-# The RECURSIVE tag can be used to turn specify whether or not subdirectories
-# should be searched for input files as well. Possible values are YES and NO.
-# If left blank NO is used.
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
 
 RECURSIVE              = NO
 
-# The EXCLUDE tag can be used to specify files and/or directories that should
+# The EXCLUDE tag can be used to specify files and/or directories that should be
 # excluded from the INPUT source files. This way you can easily exclude a
 # subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
 
 EXCLUDE                =
 
-# The EXCLUDE_SYMLINKS tag can be used select whether or not files or
-# directories that are symbolic links (a Unix filesystem feature) are excluded
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
 # from the input.
+# The default value is: NO.
 
 EXCLUDE_SYMLINKS       = NO
 
 # If the value of the INPUT tag contains directories, you can use the
 # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories. Note that the wildcards are matched
-# against the file with absolute path, so to exclude all test directories
-# for example use the pattern */test/*
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
 
 EXCLUDE_PATTERNS       =
 
 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the output.
-# The symbol name can be a fully qualified name, a word, or if the wildcard * is used,
-# a substring. Examples: ANamespace, AClass, AClass::ANamespace, ANamespace::*Test
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
 
 EXCLUDE_SYMBOLS        =
 
-# The EXAMPLE_PATH tag can be used to specify one or more files or
-# directories that contain example code fragments that are included (see
-# the \include command).
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
 
 EXAMPLE_PATH           =
 
 # If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank all files are included.
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
 
 EXAMPLE_PATTERNS       =
 
 # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude
-# commands irrespective of the value of the RECURSIVE tag.
-# Possible values are YES and NO. If left blank NO is used.
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
 
 EXAMPLE_RECURSIVE      = NO
 
-# The IMAGE_PATH tag can be used to specify one or more files or
-# directories that contain image that are included in the documentation (see
-# the \image command).
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
 
 IMAGE_PATH             =
 
 # The INPUT_FILTER tag can be used to specify a program that doxygen should
 # invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command <filter> <input-file>, where <filter>
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
-# input file. Doxygen will then use the output that the filter program writes
-# to standard output.  If FILTER_PATTERNS is specified, this tag will be
-# ignored.
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER           =
 
 # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis.  Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match.  The filters are a list of the form:
-# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
-# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
-# is applied to all files.
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS        =
 
 # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will be used to filter the input files when producing source
-# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
 
 FILTER_SOURCE_FILES    = NO
 
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
 #---------------------------------------------------------------------------
-# configuration options related to source browsing
+# Configuration options related to source browsing
 #---------------------------------------------------------------------------
 
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will
-# be generated. Documented entities will be cross-referenced with these sources.
-# Note: To get rid of all source code in the generated output, make sure also
-# VERBATIM_HEADERS is set to NO. If you have enabled CALL_GRAPH or CALLER_GRAPH
-# then you must also enable this option. If you don't then doxygen will produce
-# a warning and turn it on anyway
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
 
 SOURCE_BROWSER         = NO
 
-# Setting the INLINE_SOURCES tag to YES will include the body
-# of functions and classes directly in the documentation.
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
 
 INLINE_SOURCES         = NO
 
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
-# doxygen to hide any special comment blocks from generated source code
-# fragments. Normal C and C++ comments will always remain visible.
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
 
 STRIP_CODE_COMMENTS    = YES
 
-# If the REFERENCED_BY_RELATION tag is set to YES (the default)
-# then for each documented function all documented
-# functions referencing it will be listed.
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
 
 REFERENCED_BY_RELATION = YES
 
-# If the REFERENCES_RELATION tag is set to YES (the default)
-# then for each documented function all documented entities
-# called/used by that function will be listed.
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
 
 REFERENCES_RELATION    = YES
 
-# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
-# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
-# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
-# link to the source code.  Otherwise they will link to the documentstion.
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
 
 REFERENCES_LINK_SOURCE = YES
 
-# If the USE_HTAGS tag is set to YES then the references to source code
-# will point to the HTML generated by the htags(1) tool instead of doxygen
-# built-in source browser. The htags tool is part of GNU's global source
-# tagging system (see http://www.gnu.org/software/global/global.html). You
-# will need version 4.8.6 or higher.
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
 
 USE_HTAGS              = NO
 
-# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
-# will generate a verbatim copy of the header file for each class for
-# which an include is specified. Set to NO to disable this.
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
 
 VERBATIM_HEADERS       = YES
 
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files
+# were built. This is equivalent to specifying the "-p" option to a clang tool,
+# such as clang-check. These options will then be passed to the parser.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
 #---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
+# Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
 
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
-# of all compounds will be generated. Enable this if the project
-# contains a lot of classes, structs, unions or interfaces.
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
 
 ALPHABETICAL_INDEX     = NO
-
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
-# in which this list will be split (can be a number in the range [1..20])
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
 
 COLS_IN_ALPHA_INDEX    = 5
 
-# In case all classes in a project start with a common prefix, all
-# classes will be put under the same header in the alphabetical index.
-# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
-# should be ignored while generating the index headers.
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
 
 IGNORE_PREFIX          =
 
 #---------------------------------------------------------------------------
-# configuration options related to the HTML output
+# Configuration options related to the HTML output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
-# generate HTML output.
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
 
 GENERATE_HTML          = YES
 
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `html' will be used as the default path.
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_OUTPUT            = html
 
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
-# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
-# doxygen will generate files with .html extension.
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_FILE_EXTENSION    = .html
 
-# The HTML_HEADER tag can be used to specify a personal HTML header for
-# each generated HTML page. If it is left blank doxygen will generate a
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
 # standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_HEADER            =
 
-# The HTML_FOOTER tag can be used to specify a personal HTML footer for
-# each generated HTML page. If it is left blank doxygen will generate a
-# standard footer.
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_FOOTER            =
 
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
-# style sheet that is used by each HTML page. It can be used to
-# fine-tune the look of the HTML output. If the tag is left blank doxygen
-# will generate a default style sheet. Note that doxygen will try to copy
-# the style sheet file to the HTML output directory, so don't put your own
-# stylesheet in the HTML output directory as well, or it will be erased!
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_STYLESHEET        =
 
-# If the GENERATE_HTMLHELP tag is set to YES, additional index files
-# will be generated that can be used as input for tools like the
-# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
-# of the generated HTML documentation.
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
-GENERATE_HTMLHELP      = NO
+HTML_EXTRA_STYLESHEET  =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via Javascript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have Javascript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
 
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
-# page has loaded. For this to work a browser that supports
-# java_script and DHTML is required (for instance Mozilla 1.0+, Firefox
-# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_DYNAMIC_SECTIONS  = NO
 
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
-# be used to specify the file name of the resulting .chm file. You
-# can add a path in front of the file if the result should not be
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: https://developer.apple.com/xcode/), introduced with OSX
+# 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
 # written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 CHM_FILE               =
 
-# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
-# be used to specify the location (absolute path including file name) of
-# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
-# the HTML help compiler on the generated index.hhp.
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 HHC_LOCATION           =
 
-# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
-# controls if a separate .chi index file is generated (YES) or that
-# it should be included in the master .chm file (NO).
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 GENERATE_CHI           = NO
 
-# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
-# controls whether a binary table of contents is generated (YES) or a
-# normal table of contents (NO) in the .chm file.
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 BINARY_TOC             = NO
 
-# The TOC_EXPAND flag can be set to YES to add extra items for group members
-# to the contents of the HTML help documentation and to the tree view.
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 TOC_EXPAND             = NO
 
-# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
-# top of each HTML page. The value NO (the default) enables the index and
-# the value YES disables it.
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 DISABLE_INDEX          = NO
 
-# This tag can be used to set the number of enum values (range [1..20])
-# that doxygen will group on one line in the generated HTML documentation.
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 ENUM_VALUES_PER_LINE   = 4
 
-# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
-# generated containing a tree-like index structure (just like the one that
-# is generated for HTML Help). For this to work a browser that supports
-# java_script, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
-# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are
-# probably better off using the HTML help feature.
-
-GENERATE_TREEVIEW      = NO
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
-# used to set the initial width (in pixels) of the frame in which the tree
-# is shown.
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 TREEVIEW_WIDTH         = 250
 
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
 #---------------------------------------------------------------------------
-# configuration options related to the la_te_x output
+# Configuration options related to the LaTeX output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
-# generate Latex output.
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
 
 GENERATE_LATEX         = YES
 
-# The LATEX_OUTPUT tag is used to specify where the la_te_x docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `latex' will be used as the default path.
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_OUTPUT           = latex
 
-# The LATEX_CMD_NAME tag can be used to specify the la_te_x command name to be
-# invoked. If left blank `latex' will be used as the default command name.
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME         = latex
 
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
-# generate index for la_te_x. If left blank `makeindex' will be used as the
-# default command name.
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
-# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
-# la_te_x documents. This may be useful for small projects and may help to
-# save some trees in general.
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 COMPACT_LATEX          = YES
 
-# The PAPER_TYPE tag can be used to set the paper type that is used
-# by the printer. Possible values are: a4, a4wide, letter, legal and
-# executive. If left blank a4wide will be used.
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 PAPER_TYPE             = letter
 
-# The EXTRA_PACKAGES tag can be to specify one or more names of la_te_x
-# packages that should be included in the la_te_x output.
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify :
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 EXTRA_PACKAGES         =
 
-# The LATEX_HEADER tag can be used to specify a personal la_te_x header for
-# the generated latex document. The header should contain everything until
-# the first chapter. If it is left blank doxygen will generate a
-# standard header. Notice: only use this tag if you know what you are doing!
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HEADER           =
 
-# If the PDF_HYPERLINKS tag is set to YES, the la_te_x that is generated
-# is prepared for conversion to pdf (using ps2pdf). The pdf file will
-# contain links (just like the HTML output) instead of page references
-# This makes the output suitable for online browsing using a pdf viewer.
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
-# plain latex in the generated Makefile. Set this option to YES to get a
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
 # higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 USE_PDFLATEX           = YES
 
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
-# command to the generated la_te_x files. This will instruct la_te_x to keep
-# running if errors occur, instead of asking the user for help.
-# This option is also used when generating formulas in HTML.
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BATCHMODE        = NO
 
-# If LATEX_HIDE_INDICES is set to YES then doxygen will not
-# include the index chapters (such as File Index, Compound Index, etc.)
-# in the output.
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HIDE_INDICES     = NO
 
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
-# configuration options related to the RTF output
+# Configuration options related to the RTF output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
-# The RTF output is optimized for Word 97 and may not look very pretty with
-# other RTF readers or editors.
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
 
 GENERATE_RTF           = NO
 
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `rtf' will be used as the default path.
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_OUTPUT             = rtf
 
-# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
-# RTF documents. This may be useful for small projects and may help to
-# save some trees in general.
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 COMPACT_RTF            = NO
 
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
-# will contain hyperlink fields. The RTF file will
-# contain links (just like the HTML output) instead of page references.
-# This makes the output suitable for online browsing using WORD or other
-# programs which support those fields.
-# Note: wordpad (write) and others do not support links.
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_HYPERLINKS         = NO
 
 # Load stylesheet definitions from file. Syntax is similar to doxygen's
-# config file, i.e. a series of assignments. You only have to provide
+# configuration file, i.e. a series of assignments. You only have to provide
 # replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_STYLESHEET_FILE    =
 
-# Set optional variables used in the generation of an rtf document.
-# Syntax is similar to doxygen's config file.
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
 
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
 #---------------------------------------------------------------------------
-# configuration options related to the man page output
+# Configuration options related to the man page output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
-# generate man pages
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
 
 GENERATE_MAN           = NO
 
-# The MAN_OUTPUT tag is used to specify where the man pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `man' will be used as the default path.
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
 
 MAN_OUTPUT             = man
 
-# The MAN_EXTENSION tag determines the extension that is added to
-# the generated man pages (default is the subroutine's section .3)
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
 
 MAN_EXTENSION          = .3
 
-# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
-# then it will generate one additional man file for each entity
-# documented in the real man page(s). These additional files
-# only source the real man page, but without them the man command
-# would be unable to find the correct page. The default is NO.
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
 
 MAN_LINKS              = YES
 
 #---------------------------------------------------------------------------
-# configuration options for the auto_gen Definitions output
+# Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
-# generate an auto_gen Definitions (see autogen.sf.net) file
-# that captures the structure of the code including all
-# documentation. Note that this feature is still experimental
-# and incomplete at the moment.
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
 
 #---------------------------------------------------------------------------
-# configuration options related to the Perl module output
+# Configuration options related to the Perl module output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_PERLMOD tag is set to YES Doxygen will
-# generate a Perl module file that captures the structure of
-# the code including all documentation. Note that this
-# feature is still experimental and incomplete at the
-# moment.
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
 
 GENERATE_PERLMOD       = NO
 
-# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
-# the necessary Makefile rules, Perl scripts and la_te_x code to be able
-# to generate PDF and DVI output from the Perl module output.
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
 
 PERLMOD_LATEX          = NO
 
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
-# nicely formatted so it can be parsed by a human reader.  This is useful
-# if you want to understand what is going on.  On the other hand, if this
-# tag is set to NO the size of the Perl module output will be much smaller
-# and Perl will parse it just the same.
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
 
 PERLMOD_PRETTY         = YES
 
-# The names of the make variables in the generated doxyrules.make file
-# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
-# This is useful so different doxyrules.make files included by the same
-# Makefile don't overwrite each other's variables.
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
 
 PERLMOD_MAKEVAR_PREFIX =
 
@@ -975,278 +2057,438 @@
 # Configuration options related to the preprocessor
 #---------------------------------------------------------------------------
 
-# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
-# evaluate all C-preprocessor directives found in the sources and include
-# files.
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
 
 ENABLE_PREPROCESSING   = YES
 
-# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
-# names in the source code. If set to NO (the default) only conditional
-# compilation will be performed. Macro expansion can be done in a controlled
-# way by setting EXPAND_ONLY_PREDEF to YES.
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 MACRO_EXPANSION        = YES
 
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
-# then the macro expansion is limited to the macros specified with the
-# PREDEFINED and EXPAND_AS_DEFINED tags.
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 EXPAND_ONLY_PREDEF     = NO
 
-# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
-# in the INCLUDE_PATH (see below) will be search if a #include is found.
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 SEARCH_INCLUDES        = YES
 
 # The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by
-# the preprocessor.
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
 
 INCLUDE_PATH           =
 
 # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
 # patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will
-# be used.
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 INCLUDE_FILE_PATTERNS  = *.h
 
-# The PREDEFINED tag can be used to specify one or more macro names that
-# are defined before the preprocessor is started (similar to the -D option of
-# gcc). The argument of the tag is a list of macros of the form: name
-# or name=definition (no spaces). If the definition and the = are
-# omitted =1 is assumed. To prevent a macro definition from being
-# undefined via #undef or recursively expanded use the := operator
-# instead of the = operator.
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+#
+# In builds where CONFIG_REALTIME_ONLY is set some functions are #ifdefed out
+# which causes reference failures. Hence for doxygen we set it to 0 here.
 
-PREDEFINED             =
+PREDEFINED             = CONFIG_REALTIME_ONLY=0
 
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
-# this tag can be used to specify a list of macro names that should be expanded.
-# The macro definition that is found in the sources will be used.
-# Use the PREDEFINED tag if you want to use a different macro definition.
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 EXPAND_AS_DEFINED      =
 
-# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
-# doxygen's preprocessor will remove all function-like macros that are alone
-# on a line, have an all uppercase name, and do not end with a semicolon. Such
-# function macros are typically used for boiler-plate code, and will confuse
-# the parser if not removed.
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 SKIP_FUNCTION_MACROS   = YES
 
 #---------------------------------------------------------------------------
-# Configuration::additions related to external references
+# Configuration options related to external references
 #---------------------------------------------------------------------------
 
-# The TAGFILES option can be used to specify one or more tagfiles.
-# Optionally an initial location of the external documentation
-# can be added for each tagfile. The format of a tag file without
-# this location is as follows:
-#   TAGFILES = file1 file2 ...
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
 # Adding location for the tag files is done as follows:
-#   TAGFILES = file1=loc1 "file2 = loc2" ...
-# where "loc1" and "loc2" can be relative or absolute paths or
-# URLs. If a location is present for each tag, the installdox tool
-# does not have to be run to correct the links.
-# Note that each tag file must have a unique name
-# (where the name does NOT include the path)
-# If a tag file is not located in the directory in which doxygen
-# is run, you must also specify the path to the tagfile here.
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
 
 TAGFILES               =
 
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create
-# a tag file that is based on the input files it reads.
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
 
 GENERATE_TAGFILE       =
 
-# If the ALLEXTERNALS tag is set to YES all external classes will be listed
-# in the class index. If set to NO only the inherited external classes
-# will be listed.
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
 
 ALLEXTERNALS           = NO
 
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will
-# be listed.
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
 
 EXTERNAL_GROUPS        = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of `which perl').
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
 
-PERL_PATH              = /usr/bin/perl
+EXTERNAL_PAGES         = YES
 
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
-# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base
-# or super classes. Setting the tag to NO turns the diagrams off. Note that
-# this option is superseded by the HAVE_DOT option below. This is only a
-# fallback. It is recommended to install and use dot, since it yields more
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
 # powerful graphs.
+# The default value is: YES.
 
 CLASS_DIAGRAMS         = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
-# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
-# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
-# be found in the default search path.
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
 
-MSCGEN_PATH            =
+DIA_PATH               =
 
-# If set to YES, the inheritance and collaboration graphs will hide
-# inheritance and usage relations if the target is undocumented
-# or is not a class.
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
 
 HIDE_UNDOC_RELATIONS   = YES
 
 # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz, a graph visualization
-# toolkit from AT&T and Lucent Bell Labs. The other options in this section
-# have no effect if this option is set to NO (the default)
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: YES.
 
 HAVE_DOT               = NO
 
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect inheritance relations. Setting this tag to YES will force the
-# the CLASS_DIAGRAMS tag to NO.
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 CLASS_GRAPH            = YES
 
-# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect implementation dependencies (inheritance, containment, and
-# class references variables) of the class with other documented classes.
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 COLLABORATION_GRAPH    = YES
 
-# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for groups, showing the direct groups dependencies
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 GROUP_GRAPHS           = YES
 
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
 # collaboration diagrams in a style similar to the OMG's Unified Modeling
 # Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 UML_LOOK               = NO
 
-# If set to YES, the inheritance and collaboration graphs will show the
-# relations between templates and their instances.
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 TEMPLATE_RELATIONS     = NO
 
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
-# tags are set to YES then doxygen will generate a graph for each documented
-# file showing the direct and indirect include dependencies of the file with
-# other documented files.
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 INCLUDE_GRAPH          = YES
 
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
-# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
-# documented header file showing the documented files that directly or
-# indirectly include this file.
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 INCLUDED_BY_GRAPH      = YES
 
-# If the CALL_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
-# generate a call dependency graph for every global function or class method.
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
 # Note that enabling this option will significantly increase the time of a run.
 # So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command.
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 CALL_GRAPH             = NO
 
-# If the CALLER_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
-# generate a caller dependency graph for every global function or class method.
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
 # Note that enabling this option will significantly increase the time of a run.
 # So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command.
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 CALLER_GRAPH           = NO
 
-# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
-# will graphical hierarchy of all classes instead of a textual one.
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 GRAPHICAL_HIERARCHY    = YES
 
-# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
-# then doxygen will show the dependencies a directory has on other directories
-# in a graphical way. The dependency relations are determined by the #include
-# relations between the files in the directories.
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DIRECTORY_GRAPH        = YES
 
 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. Possible values are png, jpg, or gif
-# If left blank png will be used.
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_IMAGE_FORMAT       = png
 
-# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
 # found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_PATH               =
 
 # The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the
-# \dotfile command).
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOTFILE_DIRS           =
 
-# The MAX_DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
-# nodes that will be shown in the graph. If the number of nodes in a graph
-# becomes larger than this value, doxygen will truncate the graph, which is
-# visualized by representing a node as a red box. Note that doxygen if the number
-# of direct children of the root node in a graph is already larger than
-# MAX_DOT_GRAPH_NOTES then the graph will not be shown at all. Also note
-# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_GRAPH_MAX_NODES    = 50
 
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
-# graphs generated by dot. A depth value of 3 means that only nodes reachable
-# from the root by following a path via at most 3 edges will be shown. Nodes
-# that lay further from the root node will be omitted. Note that setting this
-# option to 1 or 2 may greatly reduce the computation time needed for large
-# code bases. Also note that the size of a graph can be further restricted by
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
 # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 MAX_DOT_GRAPH_DEPTH    = 0
 
 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, which results in a white background.
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
 # Warning: Depending on the platform used, enabling this option may lead to
 # badly anti-aliased labels on the edges of a graph (i.e. they become hard to
 # read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
-DOT_TRANSPARENT        = YES
+DOT_TRANSPARENT        = NO
 
-# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10)
-# support this, this feature is disabled by default.
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_MULTI_TARGETS      = NO
 
-# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
-# generate a legend page explaining the meaning of the various boxes and
-# arrows in the dot generated graphs.
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
-# remove the intermediate dot files that are used to generate
-# the various graphs.
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
 
diff --git a/mainpage.dox b/mainpage.dox
index 03a299a..10924ac 100644
--- a/mainpage.dox
+++ b/mainpage.dox
@@ -1,28 +1,31 @@
-/*!\mainpage AMedia Codec SDK
+/*!\mainpage AOMedia AV1 Codec
 
-  \section main_contents Page Contents
-  - \ref main_intro
-  - \ref main_startpoints
-  - \ref main_support
+  \tableofcontents
 
-  \section main_intro Introduction
-  Welcome to the AMedia Codec SDK. This SDK allows you to integrate your
+  \section aom_sdk AOMedia Codec SDK
+
+  \subsection main_intro Introduction
+  Welcome to the AOMedia Codec SDK. This SDK allows you to integrate your
   applications with the AOM and AV1 video codecs.
 
   This distribution of the AOMedia Codec SDK includes the following support:
 
-  \if aom_encoder
+  \if av1_encoder
   - \ref aom_encoder
   \endif
-  \if aom_decoder
+  \if av1_decoder
   - \ref aom_decoder
   \endif
 
 
-  \section main_startpoints Starting Points
+  \subsection main_startpoints Starting Points
   - Consult the \ref changelog for a complete list of improvements in this
     release.
+  \if av1_md_support
+  - [README](\ref LREADME) contains instructions on compiling the sample applications.
+  \else
   - \ref readme contains instructions on compiling the sample applications.
+  \endif
   - Read the \ref usage "usage" for a narrative on codec usage.
   - Read the \ref samples "sample code" for examples of how to interact with the
     codec.
@@ -33,20 +36,33 @@
   \if decoder
   - \ref decoder reference
   \endif
+  <br>
+
+  \section av1_guide AV1 Developer's Guide
+
+  \if av1_encoder
+  - \ref encoder_guide
+  \endif
+
+  \if av1_decoder
+  - \ref decoder_guide
+  \endif
+  <br>
 
   \section main_support Support Options & FAQ
-  The AOMedia project is an open source project supported by its community. For
-  questions about this SDK, please mail the apps-devel@webmproject.org list.
-  To contribute, see http://www.webmproject.org/code/contribute and mail
-  codec-devel@webmproject.org.
+  The AOMedia project is an open source project supported by its community.
+  For questions about this SDK or for help, please visit http://aomedia.org/
+  and email the aomediacodec@jointdevelopment.kavi.com list.
 */
 
 /*!\page changelog CHANGELOG
    \verbinclude CHANGELOG
 */
 
+\ifnot av1_md_support
 /*!\page readme README.md
    \include README.md
 */
+\endif
 
 /*!\defgroup codecs Supported Codecs */
diff --git a/stats/aomstats.c b/stats/aomstats.c
index 4a15adf..8d59377 100644
--- a/stats/aomstats.c
+++ b/stats/aomstats.c
@@ -11,10 +11,12 @@
 
 #include "stats/aomstats.h"
 
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "aom_dsp/aom_dsp_common.h"
 #include "common/tools_common.h"
 
 int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
@@ -83,24 +85,28 @@
 void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
   if (stats->file) {
     (void)fwrite(pkt, 1, len, stats->file);
-  } else {
-    if (stats->buf.sz + len > stats->buf_alloc_sz) {
-      size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
-      char *new_ptr = realloc(stats->buf.buf, new_sz);
-
-      if (new_ptr) {
-        stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
-        stats->buf.buf = new_ptr;
-        stats->buf_alloc_sz = new_sz;
-      } else {
-        fatal("Failed to realloc firstpass stats buffer.");
-      }
-    }
-
-    memcpy(stats->buf_ptr, pkt, len);
-    stats->buf.sz += len;
-    stats->buf_ptr += len;
+    return;
   }
+  assert(stats->buf.sz <= stats->buf_alloc_sz);
+  assert(0 < stats->buf_alloc_sz);
+  if (stats->buf.sz + len > stats->buf_alloc_sz) {
+    // Grow by a factor of 1.5 each time, for amortized constant time.
+    // Also make sure there is enough room for the data.
+    size_t new_sz = AOMMAX((3 * stats->buf_alloc_sz) / 2, stats->buf.sz + len);
+    char *new_ptr = realloc(stats->buf.buf, new_sz);
+
+    if (new_ptr) {
+      stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+      stats->buf.buf = new_ptr;
+      stats->buf_alloc_sz = new_sz;
+    } else {
+      fatal("Failed to realloc firstpass stats buffer.");
+    }
+  }
+
+  memcpy(stats->buf_ptr, pkt, len);
+  stats->buf.sz += len;
+  stats->buf_ptr += len;
 }
 
 aom_fixed_buf_t stats_get(stats_io_t *stats) { return stats->buf; }
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 0f8a732..4b0ae7a 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -92,12 +92,12 @@
 
 TEST_P(ActiveMapTestLarge, Test) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(ActiveMapTestLarge,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(0, 5));
+AV1_INSTANTIATE_TEST_SUITE(ActiveMapTestLarge,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(0, 5));
 
-AV1_INSTANTIATE_TEST_CASE(ActiveMapTest,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 9));
+AV1_INSTANTIATE_TEST_SUITE(ActiveMapTest,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(5, 9));
 
 }  // namespace
diff --git a/test/altref_test.cc b/test/altref_test.cc
index 43df39f..a09f7dd 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -15,84 +15,205 @@
 #include "test/i420_video_source.h"
 #include "test/util.h"
 namespace {
+typedef struct {
+  const unsigned int min_kf_dist;
+  const unsigned int max_kf_dist;
+  const unsigned int min_gf_interval;
+  const unsigned int max_gf_interval;
+  const unsigned int lag_in_frames;
+  libaom_test::TestMode encoding_mode;
+} AltRefTestParams;
 
-class AltRefForcedKeyTestLarge
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+static const AltRefTestParams TestParams[] = {
+  { 0, 10, 4, 8, 10, ::libaom_test::kOnePassGood },
+  { 0, 30, 8, 12, 16, ::libaom_test::kOnePassGood },
+  { 30, 30, 12, 16, 25, ::libaom_test::kOnePassGood },
+  { 0, 60, 12, 20, 25, ::libaom_test::kOnePassGood },
+  { 60, 60, 16, 28, 30, ::libaom_test::kOnePassGood },
+  { 0, 100, 16, 32, 35, ::libaom_test::kOnePassGood },
+  { 0, 10, 4, 8, 10, ::libaom_test::kTwoPassGood },
+  { 0, 30, 8, 12, 16, ::libaom_test::kTwoPassGood },
+  { 30, 30, 12, 16, 25, ::libaom_test::kTwoPassGood },
+  { 0, 60, 16, 24, 25, ::libaom_test::kTwoPassGood },
+  { 60, 60, 20, 28, 30, ::libaom_test::kTwoPassGood },
+  { 0, 100, 24, 32, 35, ::libaom_test::kTwoPassGood },
+};
+
+std::ostream &operator<<(std::ostream &os, const AltRefTestParams &test_arg) {
+  return os << "AltRefTestParams { min_kf_dist:" << test_arg.min_kf_dist
+            << " max_kf_dist:" << test_arg.max_kf_dist
+            << " min_gf_interval:" << test_arg.min_gf_interval
+            << " max_gf_interval:" << test_arg.max_gf_interval
+            << " lag_in_frames:" << test_arg.lag_in_frames
+            << " encoding_mode:" << test_arg.encoding_mode << " }";
+}
+
+// This class is used to check the presence of altref frame.
+class AltRefFramePresenceTestLarge
+    : public ::libaom_test::CodecTestWith2Params<AltRefTestParams, aom_rc_mode>,
       public ::libaom_test::EncoderTest {
  protected:
-  AltRefForcedKeyTestLarge()
-      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
-        cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
-  virtual ~AltRefForcedKeyTestLarge() {}
+  AltRefFramePresenceTestLarge()
+      : EncoderTest(GET_PARAM(0)), altref_test_params_(GET_PARAM(1)),
+        rc_end_usage_(GET_PARAM(2)) {
+    is_arf_frame_present_ = 0;
+  }
+  virtual ~AltRefFramePresenceTestLarge() {}
 
   virtual void SetUp() {
     InitializeConfig();
-    SetMode(encoding_mode_);
-    cfg_.rc_end_usage = AOM_VBR;
-    cfg_.g_threads = 0;
+    SetMode(altref_test_params_.encoding_mode);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.kf_min_dist = altref_test_params_.min_kf_dist;
+    cfg_.kf_max_dist = altref_test_params_.max_kf_dist;
+    cfg_.g_lag_in_frames = altref_test_params_.lag_in_frames;
   }
 
+  virtual bool DoDecode() const { return 1; }
+
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
-      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-#if CONFIG_AV1_ENCODER
-      // override test default for tile columns if necessary.
-      if (GET_PARAM(0) == &libaom_test::kAV1) {
-        encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
-      }
-#endif
+      encoder->Control(AV1E_SET_MIN_GF_INTERVAL,
+                       altref_test_params_.min_gf_interval);
+      encoder->Control(AV1E_SET_MAX_GF_INTERVAL,
+                       altref_test_params_.max_gf_interval);
     }
-    frame_flags_ =
-        (video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (is_arf_frame_present_ != 1 && AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_ALTREF_PRESENT,
+                                    &is_arf_frame_present_);
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  const AltRefTestParams altref_test_params_;
+  int is_arf_frame_present_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(AltRefFramePresenceTestLarge, AltRefFrameEncodePresenceTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 100);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(is_arf_frame_present_, 1);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AltRefFramePresenceTestLarge,
+                           ::testing::ValuesIn(TestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+typedef struct {
+  const ::libaom_test::TestMode encoding_mode;
+  const unsigned int min_gf_interval;
+  const unsigned int max_gf_interval;
+} gfIntervalParam;
+
+const gfIntervalParam gfTestParams[] = {
+  // single pass
+  { ::libaom_test::kOnePassGood, 0, 6 },
+  { ::libaom_test::kOnePassGood, 0, 8 },
+  { ::libaom_test::kOnePassGood, 5, 10 },
+  { ::libaom_test::kOnePassGood, 8, 16 },
+  { ::libaom_test::kOnePassGood, 16, 16 },
+
+  // two pass
+  { ::libaom_test::kTwoPassGood, 0, 6 },
+  { ::libaom_test::kTwoPassGood, 0, 8 },
+  { ::libaom_test::kTwoPassGood, 5, 10 },
+  { ::libaom_test::kTwoPassGood, 8, 16 },
+  { ::libaom_test::kTwoPassGood, 16, 32 },
+  // disabled below test case because it causes failure
+  // TODO(anyone): enable below test case once issue is fixed.
+  // { ::libaom_test::kTwoPassGood, 20, 32 },
+};
+
+// This class is used to test if the gf interval bounds configured by the user
+// are respected by the encoder.
+class GoldenFrameIntervalTestLarge
+    : public ::libaom_test::CodecTestWith2Params<gfIntervalParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  GoldenFrameIntervalTestLarge()
+      : EncoderTest(GET_PARAM(0)), gf_interval_param_(GET_PARAM(1)),
+        rc_end_usage_(GET_PARAM(2)) {
+    baseline_gf_interval_ = -1;
+    limit_ = 60;
+    frame_num_ = 0;
+  }
+  virtual ~GoldenFrameIntervalTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(gf_interval_param_.encoding_mode);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    // kf_min_dist is equal to kf_max_dist to make sure that there are no scene
+    // cuts due to which the min_gf_interval may not be respected.
+    cfg_.kf_min_dist = limit_;
+    cfg_.kf_max_dist = limit_;
+    cfg_.g_limit = limit_;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_MIN_GF_INTERVAL,
+                       gf_interval_param_.min_gf_interval);
+      encoder->Control(AV1E_SET_MAX_GF_INTERVAL,
+                       gf_interval_param_.max_gf_interval);
+    }
+    if (frame_num_ > 0) {
+      encoder->Control(AV1E_GET_BASELINE_GF_INTERVAL, &baseline_gf_interval_);
+      ASSERT_LE(baseline_gf_interval_,
+                (int)gf_interval_param_.max_gf_interval + 1);
+      if ((frame_num_ + (int)gf_interval_param_.min_gf_interval) <= limit_) {
+        ASSERT_GE(baseline_gf_interval_,
+                  (int)gf_interval_param_.min_gf_interval);
+      }
+    }
   }
 
   virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
-    if (frame_num_ == forced_kf_frame_num_) {
-      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY,
-                static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY))
-          << "Frame #" << frame_num_ << " isn't a keyframe!";
-    }
+    (void)pkt;
     ++frame_num_;
   }
 
-  ::libaom_test::TestMode encoding_mode_;
-  int cpu_used_;
-  unsigned int forced_kf_frame_num_;
-  unsigned int frame_num_;
+  const gfIntervalParam gf_interval_param_;
+  int baseline_gf_interval_;
+  int limit_;
+  int frame_num_;
+  aom_rc_mode rc_end_usage_;
 };
 
-TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
-  const aom_rational timebase = { 1, 30 };
-  const int lag_values[] = { 3, 15, 25, -1 };
-
-  forced_kf_frame_num_ = 1;
-  for (int i = 0; lag_values[i] != -1; ++i) {
-    frame_num_ = 0;
-    cfg_.g_lag_in_frames = lag_values[i];
-    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       timebase.den, timebase.num, 0, 30);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  }
+TEST_P(GoldenFrameIntervalTestLarge, GoldenFrameIntervalTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, limit_);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
-  const aom_rational timebase = { 1, 30 };
-  const int lag_values[] = { 3, 15, 25, -1 };
-
-  for (int i = 0; lag_values[i] != -1; ++i) {
-    frame_num_ = 0;
-    forced_kf_frame_num_ = lag_values[i] - 1;
-    cfg_.g_lag_in_frames = lag_values[i];
-    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       timebase.den, timebase.num, 0, 30);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  }
-}
-
-AV1_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge,
-                          ::testing::Values(::libaom_test::kOnePassGood),
-                          ::testing::Values(2, 5));
+AV1_INSTANTIATE_TEST_SUITE(GoldenFrameIntervalTestLarge,
+                           ::testing::ValuesIn(gfTestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CQ, AOM_CBR));
 
 }  // namespace
diff --git a/test/aomcx_set_ref.sh b/test/aomcx_set_ref.sh
index f51b73c..237e2f3 100755
--- a/test/aomcx_set_ref.sh
+++ b/test/aomcx_set_ref.sh
@@ -41,7 +41,7 @@
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
-      "${ref_frame_num}" "${limit}" ${devnull}
+      "${ref_frame_num}" "${limit}" ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/test/aomdec.sh b/test/aomdec.sh
index 9271422..eda18bb 100755
--- a/test/aomdec.sh
+++ b/test/aomdec.sh
@@ -67,7 +67,7 @@
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     aomdec "${AV1_IVF_FILE}" --summary --noblit
   fi
@@ -77,29 +77,39 @@
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="av1.error-resilient.ivf"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1
+      encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1 || return 1
     fi
     aomdec "${file}" --summary --noblit
   fi
 }
 
-aomdec_av1_ivf_multithread() {
+ivf_multithread() {
+  local row_mt="$1"
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     for threads in 2 3 4 5 6 7 8; do
-      aomdec "${file}" --summary --noblit --threads=$threads
+      aomdec "${file}" --summary --noblit --threads=$threads --row-mt=$row_mt \
+        || return 1
     done
   fi
 }
 
+aomdec_av1_ivf_multithread() {
+  ivf_multithread 0  # --row-mt=0
+}
+
+aomdec_av1_ivf_multithread_row_mt() {
+  ivf_multithread 1  # --row-mt=1
+}
+
 aomdec_aom_ivf_pipe_input() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     aomdec_pipe "${AV1_IVF_FILE}" --summary --noblit
   fi
@@ -109,7 +119,7 @@
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_OBU_ANNEXB_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --obu --annexb=1
+      encode_yuv_raw_input_av1 "${file}" --obu --annexb=1 || return 1
     fi
     aomdec "${file}" --summary --noblit --annexb
   fi
@@ -119,7 +129,7 @@
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_OBU_SEC5_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --obu
+      encode_yuv_raw_input_av1 "${file}" --obu || return 1
     fi
     aomdec "${file}" --summary --noblit
   fi
@@ -130,7 +140,7 @@
      [ "$(webm_io_available)" = "yes" ]; then
     local file="${AV1_WEBM_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}"
+      encode_yuv_raw_input_av1 "${file}" || return 1
     fi
     aomdec "${AV1_WEBM_FILE}" --summary --noblit
   fi
@@ -139,6 +149,7 @@
 aomdec_tests="aomdec_av1_ivf
               aomdec_av1_ivf_error_resilient
               aomdec_av1_ivf_multithread
+              aomdec_av1_ivf_multithread_row_mt
               aomdec_aom_ivf_pipe_input
               aomdec_av1_obu_annexb
               aomdec_av1_obu_section5
diff --git a/test/aomenc.sh b/test/aomenc.sh
index b030397..0c0d1b1 100755
--- a/test/aomenc.sh
+++ b/test/aomenc.sh
@@ -89,7 +89,7 @@
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --ivf \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -108,7 +108,7 @@
       $(aomenc_encode_test_fast_params) \
       --obu \
       --annexb=1 \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -126,7 +126,7 @@
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --obu \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -144,7 +144,7 @@
     fi
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -160,7 +160,7 @@
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --passes=1 \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -176,7 +176,7 @@
       $(aomenc_encode_test_fast_params) \
       --ivf \
       --output="${output}" \
-      --lossless=1
+      --lossless=1 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -193,7 +193,7 @@
       --ivf \
       --output="${output}" \
       --min-q=0 \
-      --max-q=0
+      --max-q=0 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -212,7 +212,7 @@
       $(aomenc_encode_test_fast_params) \
       --limit=${lag_total_frames} \
       --lag-in-frames=${lag_frames} \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -228,7 +228,7 @@
     local output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm"
     aomenc $(y4m_input_non_square_par) \
       $(aomenc_encode_test_fast_params) \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -245,7 +245,7 @@
       aomenc $(yuv_raw_input) \
         $(aomenc_encode_test_fast_params) \
         --cdf-update-mode=${mode} \
-        --output="${output}"
+        --output="${output}" || return 1
 
       if [ ! -e "${output}" ]; then
         elog "Output file does not exist."
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 83bfdb6..e2715a1 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -86,12 +86,12 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_CASE(AqSegmentTest,
-                          ::testing::Values(::libaom_test::kRealTime,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(5, 9), ::testing::Range(0, 4));
-AV1_INSTANTIATE_TEST_CASE(AqSegmentTestLarge,
-                          ::testing::Values(::libaom_test::kRealTime,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(3, 5), ::testing::Range(0, 4));
+AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest,
+                           ::testing::Values(::libaom_test::kRealTime,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Range(5, 9), ::testing::Range(0, 4));
+AV1_INSTANTIATE_TEST_SUITE(AqSegmentTestLarge,
+                           ::testing::Values(::libaom_test::kRealTime,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Range(3, 5), ::testing::Range(0, 4));
 }  // namespace
diff --git a/test/av1_convolve_2d_test.cc b/test/av1_convolve_2d_test.cc
deleted file mode 100644
index 50a58f0..0000000
--- a/test/av1_convolve_2d_test.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tuple>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/av1_convolve_2d_test_util.h"
-
-using libaom_test::ACMRandom;
-using libaom_test::AV1Convolve2D::AV1Convolve2DSrTest;
-using libaom_test::AV1Convolve2D::AV1JntConvolve2DTest;
-#if CONFIG_AV1_HIGHBITDEPTH
-using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest;
-using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest;
-#endif
-using std::make_tuple;
-using std::tuple;
-
-namespace {
-
-TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
-
-TEST_P(AV1Convolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
-
-INSTANTIATE_TEST_SUITE_P(
-    C_COPY, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_copy_sr_c, 0, 0));
-INSTANTIATE_TEST_SUITE_P(
-    C_X, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_c, 1, 0));
-INSTANTIATE_TEST_SUITE_P(
-    C_Y, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_c, 0, 1));
-INSTANTIATE_TEST_SUITE_P(
-    C, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_c, 1, 1));
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1Convolve2DSrTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_convolve_2d_copy_sr_sse2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(
-    SSE2_X, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_sse2, 1, 0));
-INSTANTIATE_TEST_SUITE_P(
-    SSE2_Y, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_sse2, 0, 1));
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_sse2, 1, 1));
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1Convolve2DSrTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_convolve_2d_copy_sr_avx2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(
-    AVX2_X, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_avx2, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2_Y, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_avx2, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1));
-#endif  // HAVE_AVX2
-#endif  // HAVE_SSE2
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON_X, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_neon, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(
-    NEON_Y, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_neon, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_neon, 1, 1));
-
-INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1Convolve2DSrTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_convolve_2d_copy_sr_neon, 0, 0));
-#endif  // HAVE_NEON
-
-TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
-TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
-
-INSTANTIATE_TEST_SUITE_P(C_COPY, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_copy_c, 0, 0));
-
-INSTANTIATE_TEST_SUITE_P(
-    C_X, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_x_c, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(
-    C_Y, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_y_c, 0, 1));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_copy_sse2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_sse2, 1, 1));
-
-INSTANTIATE_TEST_SUITE_P(SSE2_X, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_x_sse2, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(SSE2_Y, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_y_sse2, 0, 1));
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(SSSE3, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_ssse3, 1, 1));
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_copy_avx2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_x_avx2, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_y_avx2, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_avx2, 1, 1));
-#endif  // HAVE_AVX2
-#endif  // HAVE_SSSE3
-#endif  // HAVE_SSE2
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_copy_neon, 0, 0));
-
-INSTANTIATE_TEST_SUITE_P(NEON, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_neon, 1, 1));
-INSTANTIATE_TEST_SUITE_P(NEON_X, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_x_neon, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(NEON_Y, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_y_neon, 0, 1));
-#endif  // HAVE_NEON
-
-#if CONFIG_AV1_HIGHBITDEPTH
-TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
-TEST_P(AV1HighbdConvolve2DSrTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(1));
-}
-
-INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_x_sr_c, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_y_sr_c, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_copy_sr_c, 0, 0));
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_copy_sr_sse2, 0, 0));
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_sr_ssse3, 1, 1));
-INSTANTIATE_TEST_SUITE_P(SSSE3_X, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_x_sr_ssse3, 1, 0));
-INSTANTIATE_TEST_SUITE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_y_sr_ssse3, 0, 1));
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_sr_avx2, 1, 1));
-INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_x_sr_avx2, 1, 0));
-INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_y_sr_avx2, 0, 1));
-INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_copy_sr_avx2, 0, 0));
-#endif  // HAVE_AVX2
-#endif  // HAVE_SSSE3
-#endif  // HAVE_SSE2
-TEST_P(AV1HighbdJntConvolve2DTest, CheckOutput) {
-  RunCheckOutput(GET_PARAM(1));
-}
-
-TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(1));
-}
-
-INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_x_c, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_y_c, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0));
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0,
-                             0));
-INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1));
-INSTANTIATE_TEST_SUITE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0));
-INSTANTIATE_TEST_SUITE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1));
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1));
-INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_x_avx2, 1, 0));
-INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_y_avx2, 0, 1));
-#endif  // HAVE_AVX2
-#endif  // HAVE_SSE4_1
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-}  // namespace
diff --git a/test/av1_convolve_2d_test_util.cc b/test/av1_convolve_2d_test_util.cc
deleted file mode 100644
index 6f103d3..0000000
--- a/test/av1_convolve_2d_test_util.cc
+++ /dev/null
@@ -1,708 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "test/av1_convolve_2d_test_util.h"
-
-#include "aom_ports/aom_timer.h"
-#include "av1/common/common_data.h"
-#include "av1/common/convolve.h"
-
-using std::make_tuple;
-using std::tuple;
-
-namespace libaom_test {
-
-const int kMaxSize = 128 + 32;  // padding
-namespace AV1Convolve2D {
-
-::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
-    convolve_2d_func filter, int has_subx, int has_suby) {
-  return ::testing::Combine(::testing::Values(filter),
-                            ::testing::Values(has_subx),
-                            ::testing::Values(has_suby),
-                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
-}
-
-AV1Convolve2DSrTest::~AV1Convolve2DSrTest() {}
-void AV1Convolve2DSrTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1Convolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int has_subx = GET_PARAM(1);
-  const int has_suby = GET_PARAM(2);
-  const int block_idx = GET_PARAM(3);
-  int hfilter, vfilter, subx, suby;
-  uint8_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint8_t, output2[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-  for (int i = 0; i < MAX_SB_SQUARE; ++i)
-    output[i] = output2[i] = static_cast<uint8_t>(rnd_.Rand31());
-
-  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
-  const int num_sizes =
-      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
-                                                                           : 1;
-  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
-    const int out_w = block_size_wide[block_idx] >> shift;
-    const int out_h = block_size_high[block_idx] >> shift;
-    for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
-      for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
-           ++vfilter) {
-        const InterpFilterParams *filter_params_x =
-            av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                         out_w);
-        const InterpFilterParams *filter_params_y =
-            av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                         out_h);
-        for (int do_average = 0; do_average < 1; ++do_average) {
-          ConvolveParams conv_params1 =
-              get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
-          ConvolveParams conv_params2 =
-              get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
-
-          const int subx_range = has_subx ? 16 : 1;
-          const int suby_range = has_suby ? 16 : 1;
-          for (subx = 0; subx < subx_range; ++subx) {
-            for (suby = 0; suby < suby_range; ++suby) {
-              // Choose random locations within the source block
-              const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-              const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-              av1_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output,
-                                   MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                                   filter_params_y, subx, suby, &conv_params1);
-              test_impl(input + offset_r * w + offset_c, w, output2,
-                        MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                        filter_params_y, subx, suby, &conv_params2);
-
-              if (memcmp(output, output2, sizeof(output))) {
-                for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                  for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                    int idx = i * MAX_SB_SIZE + j;
-                    ASSERT_EQ(output[idx], output2[idx])
-                        << out_w << "x" << out_h << " Pixel mismatch at index "
-                        << idx << " = (" << i << ", " << j
-                        << "), sub pixel offset = (" << suby << ", " << subx
-                        << ")";
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int has_subx = GET_PARAM(1);
-  const int has_suby = GET_PARAM(2);
-  const int block_idx = GET_PARAM(3);
-
-  uint8_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-
-  int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
-  int subx = 0, suby = 0;
-
-  const int do_average = 0;
-  ConvolveParams conv_params2 =
-      get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
-
-  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
-  const int num_sizes =
-      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
-                                                                           : 1;
-  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
-    const int out_w = block_size_wide[block_idx] >> shift;
-    const int out_h = block_size_high[block_idx] >> shift;
-    const int num_loops = 1000000000 / (out_w + out_h);
-
-    const InterpFilterParams *filter_params_x =
-        av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                     out_w);
-    const InterpFilterParams *filter_params_y =
-        av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                     out_h);
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-
-    for (int i = 0; i < num_loops; ++i)
-      test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                filter_params_y, subx, suby, &conv_params2);
-
-    aom_usec_timer_mark(&timer);
-    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
-           out_h, 1000.0 * elapsed_time / num_loops);
-  }
-}
-
-AV1JntConvolve2DTest::~AV1JntConvolve2DTest() {}
-void AV1JntConvolve2DTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1JntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int has_subx = GET_PARAM(1);
-  const int has_suby = GET_PARAM(2);
-  const int block_idx = GET_PARAM(3);
-  int hfilter, vfilter, subx, suby;
-  uint8_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, output8_1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, output8_2[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-    output1[i] = output2[i] = rnd_.Rand16();
-    output8_1[i] = output8_2[i] = rnd_.Rand8();
-  }
-
-  const int out_w = block_size_wide[block_idx];
-  const int out_h = block_size_high[block_idx];
-  for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
-    for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
-      const InterpFilterParams *filter_params_x =
-          av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                       out_w);
-      const InterpFilterParams *filter_params_y =
-          av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                       out_h);
-      for (int do_average = 0; do_average <= 1; ++do_average) {
-        ConvolveParams conv_params1 =
-            get_conv_params_no_round(do_average, 0, output1, MAX_SB_SIZE, 1, 8);
-        ConvolveParams conv_params2 =
-            get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8);
-
-        // Test special case where dist_wtd_comp_avg is not used
-        conv_params1.use_dist_wtd_comp_avg = 0;
-        conv_params2.use_dist_wtd_comp_avg = 0;
-
-        const int subx_range = has_subx ? 16 : 1;
-        const int suby_range = has_suby ? 16 : 1;
-        for (subx = 0; subx < subx_range; ++subx) {
-          for (suby = 0; suby < suby_range; ++suby) {
-            // Choose random locations within the source block
-            const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-            const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-            av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w,
-                                       output8_1, MAX_SB_SIZE, out_w, out_h,
-                                       filter_params_x, filter_params_y, subx,
-                                       suby, &conv_params1);
-            test_impl(input + offset_r * w + offset_c, w, output8_2,
-                      MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                      filter_params_y, subx, suby, &conv_params2);
-
-            for (int i = 0; i < out_h; ++i) {
-              for (int j = 0; j < out_w; ++j) {
-                int idx = i * MAX_SB_SIZE + j;
-                ASSERT_EQ(output1[idx], output2[idx])
-                    << "Mismatch at unit tests for av1_dist_wtd_convolve_2d\n"
-                    << out_w << "x" << out_h << " Pixel mismatch at index "
-                    << idx << " = (" << i << ", " << j
-                    << "), sub pixel offset = (" << suby << ", " << subx << ")";
-              }
-            }
-
-            if (memcmp(output8_1, output8_2, sizeof(output8_1))) {
-              for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                  int idx = i * MAX_SB_SIZE + j;
-                  ASSERT_EQ(output8_1[idx], output8_2[idx])
-                      << out_w << "x" << out_h << " Pixel mismatch at index "
-                      << idx << " = (" << i << ", " << j
-                      << "), sub pixel offset = (" << suby << ", " << subx
-                      << ")";
-                }
-              }
-            }
-          }
-        }
-
-        // Test different combination of fwd and bck offset weights
-        for (int k = 0; k < 2; ++k) {
-          for (int l = 0; l < 4; ++l) {
-            conv_params1.use_dist_wtd_comp_avg = 1;
-            conv_params2.use_dist_wtd_comp_avg = 1;
-            conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
-            conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
-            conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
-            conv_params2.bck_offset = quant_dist_lookup_table[k][l][1];
-
-            for (subx = 0; subx < subx_range; ++subx) {
-              for (suby = 0; suby < suby_range; ++suby) {
-                // Choose random locations within the source block
-                const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-                const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-                av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w,
-                                           output8_1, MAX_SB_SIZE, out_w, out_h,
-                                           filter_params_x, filter_params_y,
-                                           subx, suby, &conv_params1);
-                test_impl(input + offset_r * w + offset_c, w, output8_2,
-                          MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                          filter_params_y, subx, suby, &conv_params2);
-
-                for (int i = 0; i < out_h; ++i) {
-                  for (int j = 0; j < out_w; ++j) {
-                    int idx = i * MAX_SB_SIZE + j;
-                    ASSERT_EQ(output1[idx], output2[idx])
-                        << "Mismatch at unit tests for "
-                           "av1_dist_wtd_convolve_2d\n"
-                        << out_w << "x" << out_h << " Pixel mismatch at index "
-                        << idx << " = (" << i << ", " << j
-                        << "), sub pixel offset = (" << suby << ", " << subx
-                        << ")";
-                  }
-                }
-                if (memcmp(output8_1, output8_2, sizeof(output8_1))) {
-                  for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                    for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                      int idx = i * MAX_SB_SIZE + j;
-                      ASSERT_EQ(output8_1[idx], output8_2[idx])
-                          << out_w << "x" << out_h
-                          << " Pixel mismatch at index " << idx << " = (" << i
-                          << ", " << j << "), sub pixel offset = (" << suby
-                          << ", " << subx << ")";
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int has_subx = GET_PARAM(1);
-  const int has_suby = GET_PARAM(2);
-  const int block_idx = GET_PARAM(3);
-
-  int subx = 0, suby = 0;
-  uint8_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, output8[MAX_SB_SQUARE]);
-  int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-    output[i] = rnd_.Rand16();
-    output8[i] = rnd_.Rand8();
-  }
-
-  const int out_w = block_size_wide[block_idx];
-  const int out_h = block_size_high[block_idx];
-  const int num_loops = 1000000000 / (out_w + out_h);
-  const int do_average = 0;
-
-  const InterpFilterParams *filter_params_x =
-      av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                   out_w);
-  const InterpFilterParams *filter_params_y =
-      av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                   out_h);
-
-  ConvolveParams conv_params =
-      get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8);
-
-  conv_params.use_dist_wtd_comp_avg = 0;
-
-  // Choose random locations within the source block
-  const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-  const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-
-  for (int i = 0; i < num_loops; ++i)
-    test_impl(input + offset_r * w + offset_c, w, output8, MAX_SB_SIZE, out_w,
-              out_h, filter_params_x, filter_params_y, subx, suby,
-              &conv_params);
-
-  aom_usec_timer_mark(&timer);
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
-         out_h, 1000.0 * elapsed_time / num_loops);
-}
-}  // namespace AV1Convolve2D
-
-#if CONFIG_AV1_HIGHBITDEPTH
-namespace AV1HighbdConvolve2D {
-::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
-    highbd_convolve_2d_func filter, int has_subx, int has_suby) {
-  return ::testing::Combine(
-      ::testing::Range(8, 13, 2), ::testing::Values(filter),
-      ::testing::Values(has_subx), ::testing::Values(has_suby),
-      ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
-}
-
-AV1HighbdConvolve2DSrTest::~AV1HighbdConvolve2DSrTest() {}
-void AV1HighbdConvolve2DSrTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1HighbdConvolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1HighbdConvolve2DSrTest::RunSpeedTest(
-    highbd_convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int bd = GET_PARAM(0);
-  const int has_subx = GET_PARAM(2);
-  const int has_suby = GET_PARAM(3);
-  const int block_idx = GET_PARAM(4);
-  int hfilter, vfilter, subx, suby;
-  uint16_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j)
-      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-
-  hfilter = EIGHTTAP_REGULAR;
-  vfilter = EIGHTTAP_REGULAR;
-  int do_average = 0;
-
-  const int offset_r = 3;
-  const int offset_c = 3;
-  subx = 0;
-  suby = 0;
-
-  ConvolveParams conv_params =
-      get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
-
-  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
-  const int num_sizes =
-      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
-                                                                           : 1;
-
-  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
-    const int out_w = block_size_wide[block_idx] >> shift;
-    const int out_h = block_size_high[block_idx] >> shift;
-    const int num_loops = 1000000000 / (out_w + out_h);
-
-    const InterpFilterParams *filter_params_x =
-        av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                     out_w);
-    const InterpFilterParams *filter_params_y =
-        av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                     out_h);
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < num_loops; ++i)
-      test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w,
-                out_h, filter_params_x, filter_params_y, subx, suby,
-                &conv_params, bd);
-
-    aom_usec_timer_mark(&timer);
-    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
-           out_h, 1000.0 * elapsed_time / num_loops);
-  }
-}
-
-void AV1HighbdConvolve2DSrTest::RunCheckOutput(
-    highbd_convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int bd = GET_PARAM(0);
-  const int has_subx = GET_PARAM(2);
-  const int has_suby = GET_PARAM(3);
-  const int block_idx = GET_PARAM(4);
-  int hfilter, vfilter, subx, suby;
-  uint16_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, output2[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j)
-      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-  for (int i = 0; i < MAX_SB_SQUARE; ++i)
-    output[i] = output2[i] = static_cast<int16_t>(rnd_.Rand31());
-
-  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
-  const int num_sizes =
-      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
-                                                                           : 1;
-  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
-    const int out_w = block_size_wide[block_idx] >> shift;
-    const int out_h = block_size_high[block_idx] >> shift;
-    for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
-      for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
-           ++vfilter) {
-        const InterpFilterParams *filter_params_x =
-            av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                         out_w);
-        const InterpFilterParams *filter_params_y =
-            av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                         out_h);
-        for (int do_average = 0; do_average < 1; ++do_average) {
-          ConvolveParams conv_params1 =
-              get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
-          ConvolveParams conv_params2 =
-              get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
-
-          const int subx_range = has_subx ? 16 : 1;
-          const int suby_range = has_suby ? 16 : 1;
-          for (subx = 0; subx < subx_range; ++subx) {
-            for (suby = 0; suby < suby_range; ++suby) {
-              // Choose random locations within the source block
-              const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-              const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-              av1_highbd_convolve_2d_sr_c(input + offset_r * w + offset_c, w,
-                                          output, MAX_SB_SIZE, out_w, out_h,
-                                          filter_params_x, filter_params_y,
-                                          subx, suby, &conv_params1, bd);
-              test_impl(input + offset_r * w + offset_c, w, output2,
-                        MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                        filter_params_y, subx, suby, &conv_params2, bd);
-
-              if (memcmp(output, output2, sizeof(output))) {
-                for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                  for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                    int idx = i * MAX_SB_SIZE + j;
-                    ASSERT_EQ(output[idx], output2[idx])
-                        << out_w << "x" << out_h << " Pixel mismatch at index "
-                        << idx << " = (" << i << ", " << j
-                        << "), sub pixel offset = (" << suby << ", " << subx
-                        << ")";
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-AV1HighbdJntConvolve2DTest::~AV1HighbdJntConvolve2DTest() {}
-void AV1HighbdJntConvolve2DTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1HighbdJntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1HighbdJntConvolve2DTest::RunSpeedTest(
-    highbd_convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int bd = GET_PARAM(0);
-  const int block_idx = GET_PARAM(4);
-  int hfilter, vfilter, subx, suby;
-  uint16_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, output16[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j)
-      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand16();
-  hfilter = EIGHTTAP_REGULAR;
-  vfilter = EIGHTTAP_REGULAR;
-  int do_average = 0;
-  const int out_w = block_size_wide[block_idx];
-  const int out_h = block_size_high[block_idx];
-
-  const InterpFilterParams *filter_params_x =
-      av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                   out_w);
-  const InterpFilterParams *filter_params_y =
-      av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                   out_h);
-
-  ConvolveParams conv_params =
-      get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd);
-
-  // Test special case where dist_wtd_comp_avg is not used
-  conv_params.use_dist_wtd_comp_avg = 0;
-
-  subx = 0;
-  suby = 0;
-  // Choose random locations within the source block
-  const int offset_r = 3;
-  const int offset_c = 3;
-
-  const int num_loops = 1000000000 / (out_w + out_h);
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < num_loops; ++i)
-    test_impl(input + offset_r * w + offset_c, w, output16, MAX_SB_SIZE, out_w,
-              out_h, filter_params_x, filter_params_y, subx, suby, &conv_params,
-              bd);
-
-  aom_usec_timer_mark(&timer);
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("convolve %3dx%-3d: %7.2f us\n", out_w, out_h,
-         1000.0 * elapsed_time / num_loops);
-}
-
-void AV1HighbdJntConvolve2DTest::RunCheckOutput(
-    highbd_convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int bd = GET_PARAM(0);
-  const int has_subx = GET_PARAM(2);
-  const int has_suby = GET_PARAM(3);
-  const int block_idx = GET_PARAM(4);
-  int hfilter, vfilter, subx, suby;
-  uint16_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, output16_1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, output16_2[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j)
-      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-    output1[i] = output2[i] = rnd_.Rand16();
-    output16_1[i] = output16_2[i] = rnd_.Rand16();
-  }
-
-  const int out_w = block_size_wide[block_idx];
-  const int out_h = block_size_high[block_idx];
-  for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
-    for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
-      const InterpFilterParams *filter_params_x =
-          av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                       out_w);
-      const InterpFilterParams *filter_params_y =
-          av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                       out_h);
-      for (int do_average = 0; do_average <= 1; ++do_average) {
-        ConvolveParams conv_params1 = get_conv_params_no_round(
-            do_average, 0, output1, MAX_SB_SIZE, 1, bd);
-        ConvolveParams conv_params2 = get_conv_params_no_round(
-            do_average, 0, output2, MAX_SB_SIZE, 1, bd);
-
-        // Test special case where dist_wtd_comp_avg is not used
-        conv_params1.use_dist_wtd_comp_avg = 0;
-        conv_params2.use_dist_wtd_comp_avg = 0;
-
-        const int subx_range = has_subx ? 16 : 1;
-        const int suby_range = has_suby ? 16 : 1;
-        for (subx = 0; subx < subx_range; ++subx) {
-          for (suby = 0; suby < suby_range; ++suby) {
-            // Choose random locations within the source block
-            const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-            const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-            av1_highbd_dist_wtd_convolve_2d_c(
-                input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
-                out_w, out_h, filter_params_x, filter_params_y, subx, suby,
-                &conv_params1, bd);
-            test_impl(input + offset_r * w + offset_c, w, output16_2,
-                      MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                      filter_params_y, subx, suby, &conv_params2, bd);
-
-            for (int i = 0; i < out_h; ++i) {
-              for (int j = 0; j < out_w; ++j) {
-                int idx = i * MAX_SB_SIZE + j;
-                ASSERT_EQ(output1[idx], output2[idx])
-                    << out_w << "x" << out_h << " Pixel mismatch at index "
-                    << idx << " = (" << i << ", " << j
-                    << "), sub pixel offset = (" << suby << ", " << subx << ")";
-              }
-            }
-
-            if (memcmp(output16_1, output16_2, sizeof(output16_1))) {
-              for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                  int idx = i * MAX_SB_SIZE + j;
-                  ASSERT_EQ(output16_1[idx], output16_2[idx])
-                      << out_w << "x" << out_h << " Pixel mismatch at index "
-                      << idx << " = (" << i << ", " << j
-                      << "), sub pixel offset = (" << suby << ", " << subx
-                      << ")";
-                }
-              }
-            }
-          }
-        }
-
-        // Test different combination of fwd and bck offset weights
-        for (int k = 0; k < 2; ++k) {
-          for (int l = 0; l < 4; ++l) {
-            conv_params1.use_dist_wtd_comp_avg = 1;
-            conv_params2.use_dist_wtd_comp_avg = 1;
-            conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
-            conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
-            conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
-            conv_params2.bck_offset = quant_dist_lookup_table[k][l][1];
-
-            const int subx_range = has_subx ? 16 : 1;
-            const int suby_range = has_suby ? 16 : 1;
-            for (subx = 0; subx < subx_range; ++subx) {
-              for (suby = 0; suby < suby_range; ++suby) {
-                // Choose random locations within the source block
-                const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-                const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-                av1_highbd_dist_wtd_convolve_2d_c(
-                    input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
-                    out_w, out_h, filter_params_x, filter_params_y, subx, suby,
-                    &conv_params1, bd);
-                test_impl(input + offset_r * w + offset_c, w, output16_2,
-                          MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                          filter_params_y, subx, suby, &conv_params2, bd);
-
-                for (int i = 0; i < out_h; ++i) {
-                  for (int j = 0; j < out_w; ++j) {
-                    int idx = i * MAX_SB_SIZE + j;
-                    ASSERT_EQ(output1[idx], output2[idx])
-                        << out_w << "x" << out_h << " Pixel mismatch at index "
-                        << idx << " = (" << i << ", " << j
-                        << "), sub pixel offset = (" << suby << ", " << subx
-                        << ")";
-                  }
-                }
-
-                if (memcmp(output16_1, output16_2, sizeof(output16_1))) {
-                  for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                    for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                      int idx = i * MAX_SB_SIZE + j;
-                      ASSERT_EQ(output16_1[idx], output16_2[idx])
-                          << out_w << "x" << out_h
-                          << " Pixel mismatch at index " << idx << " = (" << i
-                          << ", " << j << "), sub pixel offset = (" << suby
-                          << ", " << subx << ")";
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-}  // namespace AV1HighbdConvolve2D
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-}  // namespace libaom_test
diff --git a/test/av1_convolve_2d_test_util.h b/test/av1_convolve_2d_test_util.h
deleted file mode 100644
index 3c19cfe..0000000
--- a/test/av1_convolve_2d_test_util.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
-#define AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
-
-#include <tuple>
-
-#include "config/av1_rtcd.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-
-namespace libaom_test {
-
-namespace AV1Convolve2D {
-
-typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride, int w, int h,
-                                 const InterpFilterParams *filter_params_x,
-                                 const InterpFilterParams *filter_params_y,
-                                 const int subpel_x_qn, const int subpel_y_qn,
-                                 ConvolveParams *conv_params);
-
-typedef std::tuple<convolve_2d_func, int, int, BLOCK_SIZE> Convolve2DParam;
-
-::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
-    convolve_2d_func filter, int subx_exist, int suby_exist);
-
-class AV1Convolve2DSrTest : public ::testing::TestWithParam<Convolve2DParam> {
- public:
-  virtual ~AV1Convolve2DSrTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
-
- protected:
-  void RunCheckOutput(convolve_2d_func test_impl);
-  void RunSpeedTest(convolve_2d_func test_impl);
-
-  libaom_test::ACMRandom rnd_;
-};
-
-class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
- public:
-  virtual ~AV1JntConvolve2DTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
-
- protected:
-  void RunCheckOutput(convolve_2d_func test_impl);
-  void RunSpeedTest(convolve_2d_func test_impl);
-
-  libaom_test::ACMRandom rnd_;
-};
-}  // namespace AV1Convolve2D
-
-#if CONFIG_AV1_HIGHBITDEPTH
-namespace AV1HighbdConvolve2D {
-typedef void (*highbd_convolve_2d_func)(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
-
-typedef std::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE>
-    HighbdConvolve2DParam;
-
-::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
-    highbd_convolve_2d_func filter, int subx_exist, int suby_exist);
-
-class AV1HighbdConvolve2DSrTest
-    : public ::testing::TestWithParam<HighbdConvolve2DParam> {
- public:
-  virtual ~AV1HighbdConvolve2DSrTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
-
- protected:
-  void RunCheckOutput(highbd_convolve_2d_func test_impl);
-  void RunSpeedTest(highbd_convolve_2d_func test_impl);
-
-  libaom_test::ACMRandom rnd_;
-};
-
-class AV1HighbdJntConvolve2DTest
-    : public ::testing::TestWithParam<HighbdConvolve2DParam> {
- public:
-  virtual ~AV1HighbdJntConvolve2DTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
-
- protected:
-  void RunCheckOutput(highbd_convolve_2d_func test_impl);
-  void RunSpeedTest(highbd_convolve_2d_func test_impl);
-
-  libaom_test::ACMRandom rnd_;
-};
-}  // namespace AV1HighbdConvolve2D
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-}  // namespace libaom_test
-
-#endif  // AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc
index ffd0bab..a1c5746 100644
--- a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -104,7 +104,6 @@
   params_.filter_ptr = &coeffs_[0];
   params_.taps = n;
   // These are ignored by the functions being tested. Set them to whatever.
-  params_.subpel_shifts = SUBPEL_SHIFTS;
   params_.interp_filter = EIGHTTAP_REGULAR;
 }
 
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
new file mode 100644
index 0000000..658b4fe
--- /dev/null
+++ b/test/av1_convolve_test.cc
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+#include <set>
+#include <vector>
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// TODO(any): Remove following INTERP_FILTERS_ALL define, so that 12-tap filter
+// is tested once 12-tap filter SIMD is done.
+#undef INTERP_FILTERS_ALL
+#define INTERP_FILTERS_ALL 4
+
+// All single reference convolve tests are parameterized on block size,
+// bit-depth, and function to test.
+//
+// Note that parameterizing on these variables (and not other parameters) is
+// a conscious decision - Jenkins needs some degree of parallelization to run
+// the tests within the time limit, but if the number of parameters increases
+// too much, the gtest framework does not handle it well (increased overhead per
+// test, huge amount of output to stdout, etc.).
+//
+// Also note that the test suites must be named with the architecture, e.g.,
+// C, C_X, AVX2_X, ... The test suite that runs on Jenkins sometimes runs tests
+// that cannot deal with intrinsics (e.g., the Valgrind tests on 32-bit x86
+// binaries) and will disable tests using a filter like
+// --gtest_filter=-:SSE4_1.*. If the test suites are not named this way, the
+// testing infrastructure will not selectively filter them properly.
+class BlockSize {
+ public:
+  BlockSize(int w, int h) : width_(w), height_(h) {}
+
+  int Width() const { return width_; }
+  int Height() const { return height_; }
+
+  bool operator<(const BlockSize &other) const {
+    if (Width() == other.Width()) {
+      return Height() < other.Height();
+    }
+    return Width() < other.Width();
+  }
+
+  bool operator==(const BlockSize &other) const {
+    return Width() == other.Width() && Height() == other.Height();
+  }
+
+ private:
+  int width_;
+  int height_;
+};
+
+// Block size / bit depth / test function used to parameterize the tests.
+template <typename T>
+class TestParam {
+ public:
+  TestParam(const BlockSize &block, int bd, T test_func)
+      : block_(block), bd_(bd), test_func_(test_func) {}
+
+  const BlockSize &Block() const { return block_; }
+  int BitDepth() const { return bd_; }
+  T TestFunction() const { return test_func_; }
+
+  bool operator==(const TestParam &other) const {
+    return Block() == other.Block() && BitDepth() == other.BitDepth() &&
+           TestFunction() == other.TestFunction();
+  }
+
+ private:
+  BlockSize block_;
+  int bd_;
+  T test_func_;
+};
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) {
+  return os << "TestParam { width:" << test_arg.Block().Width()
+            << " height:" << test_arg.Block().Height()
+            << " bd:" << test_arg.BitDepth() << " }";
+}
+
+// Generate the list of all block widths / heights that need to be tested,
+// includes chroma and luma sizes, for the given bit-depths. The test
+// function is the same for all generated parameters.
+template <typename T>
+std::vector<TestParam<T>> GetTestParams(std::initializer_list<int> bit_depths,
+                                        T test_func) {
+  std::set<BlockSize> sizes;
+  for (int b = BLOCK_4X4; b < BLOCK_SIZES_ALL; ++b) {
+    const int w = block_size_wide[b];
+    const int h = block_size_high[b];
+    sizes.insert(BlockSize(w, h));
+    // Add in smaller chroma sizes as well.
+    if (w == 4 || h == 4) {
+      sizes.insert(BlockSize(w / 2, h / 2));
+    }
+  }
+  std::vector<TestParam<T>> result;
+  for (const BlockSize &block : sizes) {
+    for (int bd : bit_depths) {
+      result.push_back(TestParam<T>(block, bd, test_func));
+    }
+  }
+  return result;
+}
+
+template <typename T>
+std::vector<TestParam<T>> GetLowbdTestParams(T test_func) {
+  return GetTestParams({ 8 }, test_func);
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildLowbdParams(
+    T test_func) {
+  return ::testing::ValuesIn(GetLowbdTestParams(test_func));
+}
+
+// Test the test-parameters generators work as expected.
+class AV1ConvolveParametersTest : public ::testing::Test {};
+
+TEST_F(AV1ConvolveParametersTest, GetLowbdTestParams) {
+  auto v = GetLowbdTestParams(av1_convolve_x_sr_c);
+  ASSERT_EQ(27U, v.size());
+  for (const auto &p : v) {
+    ASSERT_EQ(8, p.BitDepth());
+    // Needed (instead of ASSERT_EQ(...) since gtest does not
+    // have built in printing for arbitrary functions, which
+    // causes a compilation error.
+    bool same_fn = av1_convolve_x_sr_c == p.TestFunction();
+    ASSERT_TRUE(same_fn);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+template <typename T>
+std::vector<TestParam<T>> GetHighbdTestParams(T test_func) {
+  return GetTestParams({ 10, 12 }, test_func);
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdParams(
+    T test_func) {
+  return ::testing::ValuesIn(GetHighbdTestParams(test_func));
+}
+
+TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) {
+  auto v = GetHighbdTestParams(av1_highbd_convolve_x_sr_c);
+  ASSERT_EQ(54U, v.size());
+  int num_10 = 0;
+  int num_12 = 0;
+  for (const auto &p : v) {
+    ASSERT_TRUE(p.BitDepth() == 10 || p.BitDepth() == 12);
+    bool same_fn = av1_highbd_convolve_x_sr_c == p.TestFunction();
+    ASSERT_TRUE(same_fn);
+    if (p.BitDepth() == 10) {
+      ++num_10;
+    } else {
+      ++num_12;
+    }
+  }
+  ASSERT_EQ(num_10, num_12);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// AV1ConvolveTest is the base class that all convolve tests should derive from.
+// It provides storage/methods for generating randomized buffers for both
+// low bit-depth and high bit-depth, and setup/teardown methods for clearing
+// system state. Implementors can get the bit-depth / block-size /
+// test function by calling GetParam().
+template <typename T>
+class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
+ public:
+  virtual ~AV1ConvolveTest() { TearDown(); }
+
+  virtual void SetUp() override {
+    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  }
+
+  virtual void TearDown() override { libaom_test::ClearSystemState(); }
+
+  // Randomizes the 8-bit input buffer and returns a pointer to it. Note that
+  // the pointer is safe to use with an 8-tap filter. The stride can range
+  // from width to (width + kPadding). Also note that the pointer is to the
+  // same memory location.
+  static constexpr int kInputPadding = 8;
+
+  // Get a pointer to a buffer with stride == width. Note that we must have
+  // the test param passed in explicitly -- the gtest framework does not
+  // support calling GetParam() within a templatized class.
+  // Note that FirstRandomInput8 always returns the same pointer -- if two
+  // inputs are needed, also use SecondRandomInput8.
+  const uint8_t *FirstRandomInput8(const TestParam<T> &param) {
+    // Note we can't call GetParam() directly -- gtest does not support
+    // this for parameterized types.
+    return RandomInput8(input8_1_, param);
+  }
+
+  const uint8_t *SecondRandomInput8(const TestParam<T> &param) {
+    return RandomInput8(input8_2_, param);
+  }
+
+  // Some of the intrinsics perform writes in 32 byte chunks. Moreover, some
+  // of the instrinsics assume that the stride is also a multiple of 32.
+  // To satisfy these constraints and also remain simple, output buffer strides
+  // are assumed MAX_SB_SIZE.
+  static constexpr int kOutputStride = MAX_SB_SIZE;
+
+  // Check that two 8-bit output buffers are identical.
+  void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width,
+                            int height) {
+    ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations";
+    for (int j = 0; j < height; ++j) {
+      if (memcmp(p1, p2, sizeof(*p1) * width) == 0) {
+        p1 += kOutputStride;
+        p2 += kOutputStride;
+        continue;
+      }
+      for (int i = 0; i < width; ++i) {
+        ASSERT_EQ(p1[i], p2[i])
+            << width << "x" << height << " Pixel mismatch at (" << i << ", "
+            << j << ")";
+      }
+    }
+  }
+
+  // Check that two 16-bit output buffers are identical.
+  void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width,
+                            int height) {
+    ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations";
+    for (int j = 0; j < height; ++j) {
+      if (memcmp(p1, p2, sizeof(*p1) * width) == 0) {
+        p1 += kOutputStride;
+        p2 += kOutputStride;
+        continue;
+      }
+      for (int i = 0; i < width; ++i) {
+        ASSERT_EQ(p1[i], p2[i])
+            << width << "x" << height << " Pixel mismatch at (" << i << ", "
+            << j << ")";
+      }
+    }
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  // Note that the randomized values are capped by bit-depth.
+  const uint16_t *FirstRandomInput16(const TestParam<T> &param) {
+    return RandomInput16(input16_1_, param);
+  }
+
+  const uint16_t *SecondRandomInput16(const TestParam<T> &param) {
+    return RandomInput16(input16_2_, param);
+  }
+#endif
+
+ private:
+  const uint8_t *RandomInput8(uint8_t *p, const TestParam<T> &param) {
+    EXPECT_EQ(8, param.BitDepth());
+    EXPECT_GE(MAX_SB_SIZE, param.Block().Width());
+    EXPECT_GE(MAX_SB_SIZE, param.Block().Height());
+    const int padded_width = param.Block().Width() + kInputPadding;
+    const int padded_height = param.Block().Height() + kInputPadding;
+    Randomize(p, padded_width * padded_height);
+    return p + (kInputPadding / 2) * padded_width + kInputPadding / 2;
+  }
+
+  void Randomize(uint8_t *p, int size) {
+    for (int i = 0; i < size; ++i) {
+      p[i] = rnd_.Rand8();
+    }
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint16_t *RandomInput16(uint16_t *p, const TestParam<T> &param) {
+    // Check that this is only called with high bit-depths.
+    EXPECT_TRUE(param.BitDepth() == 10 || param.BitDepth() == 12);
+    EXPECT_GE(MAX_SB_SIZE, param.Block().Width());
+    EXPECT_GE(MAX_SB_SIZE, param.Block().Height());
+    const int padded_width = param.Block().Width() + kInputPadding;
+    const int padded_height = param.Block().Height() + kInputPadding;
+    Randomize(p, padded_width * padded_height, param.BitDepth());
+    return p + (kInputPadding / 2) * padded_width + kInputPadding / 2;
+  }
+
+  void Randomize(uint16_t *p, int size, int bit_depth) {
+    for (int i = 0; i < size; ++i) {
+      p[i] = rnd_.Rand16() & ((1 << bit_depth) - 1);
+    }
+  }
+#endif
+
+  static constexpr int kInputStride = MAX_SB_SIZE + kInputPadding;
+
+  libaom_test::ACMRandom rnd_;
+  // Statically allocate all the memory that is needed for the tests. Note
+  // that we cannot allocate output memory here. It must use DECLARE_ALIGNED,
+  // which is a C99 feature and interacts badly with C++ member variables.
+  uint8_t input8_1_[kInputStride * kInputStride];
+  uint8_t input8_2_[kInputStride * kInputStride];
+#if CONFIG_AV1_HIGHBITDEPTH
+  uint16_t input16_1_[kInputStride * kInputStride];
+  uint16_t input16_2_[kInputStride * kInputStride];
+#endif
+};
+
+////////////////////////////////////////////////////////
+// Single reference convolve-x functions (low bit-depth)
+////////////////////////////////////////////////////////
+typedef void (*convolve_x_func)(const uint8_t *src, int src_stride,
+                                uint8_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_x,
+                                const int subpel_x_qn,
+                                ConvolveParams *conv_params);
+
+class AV1ConvolveXTest : public AV1ConvolveTest<convolve_x_func> {
+ public:
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_x, f);
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const int sub_x, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(filter, width);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    av1_convolve_x_sr(input, width, reference, kOutputStride, width, height,
+                      filter_params_x, sub_x, &conv_params1);
+
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    convolve_x_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    test_func(input, width, test, kOutputStride, width, height, filter_params_x,
+              sub_x, &conv_params2);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+TEST_P(AV1ConvolveXTest, RunTest) { RunTest(); }
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////////////
+// Single reference convolve-x functions (high bit-depth)
+/////////////////////////////////////////////////////////
+typedef void (*highbd_convolve_x_func)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd);
+
+class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> {
+ public:
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_x, f);
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const int sub_x, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(filter, width);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    av1_highbd_convolve_x_sr(input, width, reference, kOutputStride, width,
+                             height, filter_params_x, sub_x, &conv_params1,
+                             bit_depth);
+
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_x, sub_x, &conv_params2, bit_depth);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+TEST_P(AV1ConvolveXHighbdTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_x_sr_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveXHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_x_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_x_sr_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+////////////////////////////////////////////////////////
+// Single reference convolve-y functions (low bit-depth)
+////////////////////////////////////////////////////////
+typedef void (*convolve_y_func)(const uint8_t *src, int src_stride,
+                                uint8_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_y,
+                                const int subpel_y_qn);
+
+class AV1ConvolveYTest : public AV1ConvolveTest<convolve_y_func> {
+ public:
+  void RunTest() {
+    for (int sub_y = 0; sub_y < 16; ++sub_y) {
+      for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_y, f);
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const int sub_y, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, height);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    av1_convolve_y_sr(input, width, reference, kOutputStride, width, height,
+                      filter_params_y, sub_y);
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_y, sub_y);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+TEST_P(AV1ConvolveYTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYTest,
+                         BuildLowbdParams(av1_convolve_y_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveYTest,
+                         BuildLowbdParams(av1_convolve_y_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYTest,
+                         BuildLowbdParams(av1_convolve_y_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYTest,
+                         BuildLowbdParams(av1_convolve_y_sr_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////////////
+// Single reference convolve-y functions (high bit-depth)
+/////////////////////////////////////////////////////////
+typedef void (*highbd_convolve_y_func)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    int bd);
+
+class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> {
+ public:
+  void RunTest() {
+    for (int sub_y = 0; sub_y < 16; ++sub_y) {
+      for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_y, f);
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const int sub_y, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    av1_highbd_convolve_y_sr(input, width, reference, kOutputStride, width,
+                             height, filter_params_y, sub_y, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_y, sub_y, bit_depth);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+TEST_P(AV1ConvolveYHighbdTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_y_sr_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveYHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_y_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_y_sr_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////////////
+// Single reference convolve-copy functions (low bit-depth)
+//////////////////////////////////////////////////////////////
+typedef void (*convolve_copy_func)(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h);
+
+class AV1ConvolveCopyTest : public AV1ConvolveTest<convolve_copy_func> {
+ public:
+  void RunTest() {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    aom_convolve_copy(input, width, reference, kOutputStride, width, height);
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+// Note that even though these are AOM convolve functions, we are using the
+// newer AV1 test framework.
+TEST_P(AV1ConvolveCopyTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_neon));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_SUITE_P(MSA, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_msa));
+#endif
+
+#if HAVE_DSPR2
+INSTANTIATE_TEST_SUITE_P(DSPR2, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_dspr2));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+///////////////////////////////////////////////////////////////
+// Single reference convolve-copy functions (high bit-depth)
+///////////////////////////////////////////////////////////////
+typedef void (*highbd_convolve_copy_func)(const uint16_t *src,
+                                          ptrdiff_t src_stride, uint16_t *dst,
+                                          ptrdiff_t dst_stride, int w, int h);
+
+class AV1ConvolveCopyHighbdTest
+    : public AV1ConvolveTest<highbd_convolve_copy_func> {
+ public:
+  void RunTest() {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    aom_highbd_convolve_copy(input, width, reference, kOutputStride, width,
+                             height);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+TEST_P(AV1ConvolveCopyHighbdTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyHighbdTest,
+                         BuildHighbdParams(aom_highbd_convolve_copy_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyHighbdTest,
+                         BuildHighbdParams(aom_highbd_convolve_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyHighbdTest,
+                         BuildHighbdParams(aom_highbd_convolve_copy_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+/////////////////////////////////////////////////////////
+// Single reference convolve-2D functions (low bit-depth)
+/////////////////////////////////////////////////////////
+typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
+                                 const int subpel_x_qn, const int subpel_y_qn,
+                                 ConvolveParams *conv_params);
+
+class AV1Convolve2DTest : public AV1ConvolveTest<convolve_2d_func> {
+ public:
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int sub_y = 0; sub_y < 16; ++sub_y) {
+        for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+          for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+            TestConvolve(static_cast<InterpFilter>(h_f),
+                         static_cast<InterpFilter>(v_f), sub_x, sub_y);
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    av1_convolve_2d_sr(input, width, reference, kOutputStride, width, height,
+                       filter_params_x, filter_params_y, sub_x, sub_y,
+                       &conv_params1);
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_x, filter_params_y, sub_x, sub_y,
+                              &conv_params2);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+TEST_P(AV1Convolve2DTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////
+// Single reference convolve-2d functions (high bit-depth)
+//////////////////////////////////////////////////////////
+
+typedef void (*highbd_convolve_2d_func)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+
+class AV1Convolve2DHighbdTest
+    : public AV1ConvolveTest<highbd_convolve_2d_func> {
+ public:
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int sub_y = 0; sub_y < 16; ++sub_y) {
+        for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+          for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+            TestConvolve(static_cast<InterpFilter>(h_f),
+                         static_cast<InterpFilter>(v_f), sub_x, sub_y);
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth);
+    av1_highbd_convolve_2d_sr(input, width, reference, kOutputStride, width,
+                              height, filter_params_x, filter_params_y, sub_x,
+                              sub_y, &conv_params1, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_x, filter_params_y, sub_x, sub_y,
+                              &conv_params2, bit_depth);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+TEST_P(AV1Convolve2DHighbdTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_2d_sr_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_2d_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_2d_sr_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////
+// Compound Convolve Tests
+//////////////////////////
+
+// The compound functions do not work for chroma block sizes. Provide
+// a function to generate test parameters for just luma block sizes.
+template <typename T>
+std::vector<TestParam<T>> GetLumaTestParams(
+    std::initializer_list<int> bit_depths, T test_func) {
+  std::set<BlockSize> sizes;
+  for (int b = BLOCK_4X4; b < BLOCK_SIZES_ALL; ++b) {
+    const int w = block_size_wide[b];
+    const int h = block_size_high[b];
+    sizes.insert(BlockSize(w, h));
+  }
+  std::vector<TestParam<T>> result;
+  for (int bit_depth : bit_depths) {
+    for (const auto &block : sizes) {
+      result.push_back(TestParam<T>(block, bit_depth, test_func));
+    }
+  }
+  return result;
+}
+
+template <typename T>
+std::vector<TestParam<T>> GetLowbdLumaTestParams(T test_func) {
+  return GetLumaTestParams({ 8 }, test_func);
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildLowbdLumaParams(
+    T test_func) {
+  return ::testing::ValuesIn(GetLowbdLumaTestParams(test_func));
+}
+
+TEST_F(AV1ConvolveParametersTest, GetLowbdLumaTestParams) {
+  auto v = GetLowbdLumaTestParams(av1_dist_wtd_convolve_x_c);
+  ASSERT_EQ(22U, v.size());
+  for (const auto &e : v) {
+    ASSERT_EQ(8, e.BitDepth());
+    bool same_fn = av1_dist_wtd_convolve_x_c == e.TestFunction();
+    ASSERT_TRUE(same_fn);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+template <typename T>
+std::vector<TestParam<T>> GetHighbdLumaTestParams(T test_func) {
+  return GetLumaTestParams({ 10, 12 }, test_func);
+}
+
+TEST_F(AV1ConvolveParametersTest, GetHighbdLumaTestParams) {
+  auto v = GetHighbdLumaTestParams(av1_highbd_dist_wtd_convolve_x_c);
+  ASSERT_EQ(44U, v.size());
+  int num_10 = 0;
+  int num_12 = 0;
+  for (const auto &e : v) {
+    ASSERT_TRUE(10 == e.BitDepth() || 12 == e.BitDepth());
+    bool same_fn = av1_highbd_dist_wtd_convolve_x_c == e.TestFunction();
+    ASSERT_TRUE(same_fn);
+    if (e.BitDepth() == 10) {
+      ++num_10;
+    } else {
+      ++num_12;
+    }
+  }
+  ASSERT_EQ(num_10, num_12);
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdLumaParams(
+    T test_func) {
+  return ::testing::ValuesIn(GetHighbdLumaTestParams(test_func));
+}
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Compound cases also need to test different frame offsets and weightings.
+class CompoundParam {
+ public:
+  CompoundParam(bool use_dist_wtd_comp_avg, int fwd_offset, int bck_offset)
+      : use_dist_wtd_comp_avg_(use_dist_wtd_comp_avg), fwd_offset_(fwd_offset),
+        bck_offset_(bck_offset) {}
+
+  bool UseDistWtdCompAvg() const { return use_dist_wtd_comp_avg_; }
+  int FwdOffset() const { return fwd_offset_; }
+  int BckOffset() const { return bck_offset_; }
+
+ private:
+  bool use_dist_wtd_comp_avg_;
+  int fwd_offset_;
+  int bck_offset_;
+};
+
+std::vector<CompoundParam> GetCompoundParams() {
+  std::vector<CompoundParam> result;
+  result.push_back(CompoundParam(false, 0, 0));
+  for (int k = 0; k < 2; ++k) {
+    for (int l = 0; l < 4; ++l) {
+      result.push_back(CompoundParam(true, quant_dist_lookup_table[k][l][0],
+                                     quant_dist_lookup_table[k][l][1]));
+    }
+  }
+  return result;
+}
+
+TEST_F(AV1ConvolveParametersTest, GetCompoundParams) {
+  auto v = GetCompoundParams();
+  ASSERT_EQ(9U, v.size());
+  ASSERT_FALSE(v[0].UseDistWtdCompAvg());
+  for (size_t i = 1; i < v.size(); ++i) {
+    ASSERT_TRUE(v[i].UseDistWtdCompAvg());
+  }
+}
+
+////////////////////////////////////////////////
+// Compound convolve-x functions (low bit-depth)
+////////////////////////////////////////////////
+
+ConvolveParams GetConvolveParams(int do_average, CONV_BUF_TYPE *conv_buf,
+                                 int width, int bit_depth,
+                                 const CompoundParam &compound) {
+  ConvolveParams conv_params =
+      get_conv_params_no_round(do_average, 0, conv_buf, width, 1, bit_depth);
+  conv_params.use_dist_wtd_comp_avg = compound.UseDistWtdCompAvg();
+  conv_params.fwd_offset = compound.FwdOffset();
+  conv_params.bck_offset = compound.BckOffset();
+  return conv_params;
+}
+
+class AV1ConvolveXCompoundTest : public AV1ConvolveTest<convolve_x_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
+      for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
+        for (const auto &c : compound_params) {
+          TestConvolve(sub_pix, static_cast<InterpFilter>(f), c);
+        }
+      }
+    }
+  }
+
+ protected:
+  virtual const InterpFilterParams *FilterParams(InterpFilter f,
+                                                 const BlockSize &block) const {
+    return av1_get_interp_filter_params_with_block_size(f, block.Width());
+  }
+
+  virtual convolve_x_func ReferenceFunc() const {
+    return av1_dist_wtd_convolve_x;
+  }
+
+ private:
+  void TestConvolve(const int sub_pix, const InterpFilter filter,
+                    const CompoundParam &compound) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const uint8_t *input1 = FirstRandomInput8(GetParam());
+    const uint8_t *input2 = SecondRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(ReferenceFunc(), input1, input2, reference, reference_conv_buf,
+             compound, sub_pix, filter);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound, sub_pix, filter);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void Convolve(convolve_x_func test_func, const uint8_t *src1,
+                const uint8_t *src2, uint8_t *dst, CONV_BUF_TYPE *conv_buf,
+                const CompoundParam &compound, const int sub_pix,
+                const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params =
+        FilterParams(filter, GetParam().Block());
+
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, filter_params,
+              sub_pix, &conv_params);
+
+    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, filter_params,
+              sub_pix, &conv_params);
+  }
+};
+
+TEST_P(AV1ConvolveXCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveXCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////
+// Compound convolve-x functions (high bit-depth)
+/////////////////////////////////////////////////
+class AV1ConvolveXHighbdCompoundTest
+    : public AV1ConvolveTest<highbd_convolve_x_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
+      for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
+        for (const auto &c : compound_params) {
+          TestConvolve(sub_pix, static_cast<InterpFilter>(f), c);
+        }
+      }
+    }
+  }
+
+ protected:
+  virtual const InterpFilterParams *FilterParams(InterpFilter f,
+                                                 const BlockSize &block) const {
+    return av1_get_interp_filter_params_with_block_size(f, block.Width());
+  }
+
+  virtual highbd_convolve_x_func ReferenceFunc() const {
+    return av1_highbd_dist_wtd_convolve_x;
+  }
+
+ private:
+  void TestConvolve(const int sub_pix, const InterpFilter filter,
+                    const CompoundParam &compound) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const uint16_t *input1 = FirstRandomInput16(GetParam());
+    const uint16_t *input2 = SecondRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(ReferenceFunc(), input1, input2, reference, reference_conv_buf,
+             compound, sub_pix, filter);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound, sub_pix, filter);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void Convolve(highbd_convolve_x_func test_func, const uint16_t *src1,
+                const uint16_t *src2, uint16_t *dst, CONV_BUF_TYPE *conv_buf,
+                const CompoundParam &compound, const int sub_pix,
+                const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params =
+        FilterParams(filter, GetParam().Block());
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, filter_params,
+              sub_pix, &conv_params, bit_depth);
+    conv_params =
+        GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, filter_params,
+              sub_pix, &conv_params, bit_depth);
+  }
+};
+
+TEST_P(AV1ConvolveXHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1ConvolveXHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1ConvolveXHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1ConvolveXHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+////////////////////////////////////////////////
+// Compound convolve-y functions (low bit-depth)
+////////////////////////////////////////////////
+
+// Note that the X and Y convolve functions have the same type signature and
+// logic; they only differentiate the filter parameters and reference function.
+class AV1ConvolveYCompoundTest : public AV1ConvolveXCompoundTest {
+ protected:
+  virtual const InterpFilterParams *FilterParams(
+      InterpFilter f, const BlockSize &block) const override {
+    return av1_get_interp_filter_params_with_block_size(f, block.Height());
+  }
+
+  virtual convolve_x_func ReferenceFunc() const override {
+    return av1_dist_wtd_convolve_y;
+  }
+};
+
+TEST_P(AV1ConvolveYCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveYCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////
+// Compound convolve-y functions (high bit-depth)
+/////////////////////////////////////////////////
+
+// Again, the X and Y convolve functions have the same type signature and logic.
+class AV1ConvolveYHighbdCompoundTest : public AV1ConvolveXHighbdCompoundTest {
+  virtual highbd_convolve_x_func ReferenceFunc() const override {
+    return av1_highbd_dist_wtd_convolve_y;
+  }
+  virtual const InterpFilterParams *FilterParams(
+      InterpFilter f, const BlockSize &block) const override {
+    return av1_get_interp_filter_params_with_block_size(f, block.Height());
+  }
+};
+
+TEST_P(AV1ConvolveYHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1ConvolveYHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1ConvolveYHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1ConvolveYHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////
+// Compound convolve-2d-copy functions (low bit-depth)
+//////////////////////////////////////////////////////
+typedef void (*compound_conv_2d_copy_func)(const uint8_t *src, int src_stride,
+                                           uint8_t *dst, int dst_stride, int w,
+                                           int h, ConvolveParams *conv_params);
+
+class AV1Convolve2DCopyCompoundTest
+    : public AV1ConvolveTest<compound_conv_2d_copy_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (const auto &compound : compound_params) {
+      TestConvolve(compound);
+    }
+  }
+
+ private:
+  void TestConvolve(const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const uint8_t *input1 = FirstRandomInput8(GetParam());
+    const uint8_t *input2 = SecondRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(av1_dist_wtd_convolve_2d_copy, input1, input2, reference,
+             reference_conv_buf, compound);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void Convolve(compound_conv_2d_copy_func test_func, const uint8_t *src1,
+                const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
+                const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, &conv_params);
+
+    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, &conv_params);
+  }
+};
+
+TEST_P(AV1Convolve2DCopyCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCopyCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1Convolve2DCopyCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1Convolve2DCopyCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1Convolve2DCopyCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+///////////////////////////////////////////////////////
+// Compound convolve-2d-copy functions (high bit-depth)
+///////////////////////////////////////////////////////
+typedef void (*highbd_compound_conv_2d_copy_func)(const uint16_t *src,
+                                                  int src_stride, uint16_t *dst,
+                                                  int dst_stride, int w, int h,
+                                                  ConvolveParams *conv_params,
+                                                  int bd);
+
+class AV1Convolve2DCopyHighbdCompoundTest
+    : public AV1ConvolveTest<highbd_compound_conv_2d_copy_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (const auto &compound : compound_params) {
+      TestConvolve(compound);
+    }
+  }
+
+ private:
+  void TestConvolve(const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const uint16_t *input1 = FirstRandomInput16(GetParam());
+    const uint16_t *input2 = SecondRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(av1_highbd_dist_wtd_convolve_2d_copy, input1, input2, reference,
+             reference_conv_buf, compound);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void Convolve(highbd_compound_conv_2d_copy_func test_func,
+                const uint16_t *src1, const uint16_t *src2, uint16_t *dst,
+                uint16_t *conv_buf, const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+    const int bit_depth = GetParam().BitDepth();
+
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, &conv_params,
+              bit_depth);
+
+    conv_params =
+        GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, &conv_params,
+              bit_depth);
+  }
+};
+
+TEST_P(AV1Convolve2DCopyHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1Convolve2DCopyHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1Convolve2DCopyHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1Convolve2DCopyHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+/////////////////////////////////////////////////
+// Compound convolve-2d functions (low bit-depth)
+/////////////////////////////////////////////////
+
+class AV1Convolve2DCompoundTest : public AV1ConvolveTest<convolve_2d_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+      for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+        for (int sub_x = 0; sub_x < 16; ++sub_x) {
+          for (int sub_y = 0; sub_y < 16; ++sub_y) {
+            for (const auto &compound : compound_params) {
+              TestConvolve(static_cast<InterpFilter>(h_f),
+                           static_cast<InterpFilter>(v_f), sub_x, sub_y,
+                           compound);
+            }
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y,
+                    const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const uint8_t *input1 = FirstRandomInput8(GetParam());
+    const uint8_t *input2 = SecondRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(av1_dist_wtd_convolve_2d, input1, input2, reference,
+             reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound, h_f, v_f, sub_x, sub_y);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void Convolve(convolve_2d_func test_func, const uint8_t *src1,
+                const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
+                const CompoundParam &compound, const InterpFilter h_f,
+                const InterpFilter v_f, const int sub_x, const int sub_y) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+
+    test_func(src1, width, dst, kOutputStride, width, height, filter_params_x,
+              filter_params_y, sub_x, sub_y, &conv_params);
+
+    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, filter_params_x,
+              filter_params_y, sub_x, sub_y, &conv_params);
+  }
+};
+
+TEST_P(AV1Convolve2DCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_sse2));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////
+// Compound convolve-2d functions (high bit-depth)
+//////////////////////////////////////////////////
+
+class AV1Convolve2DHighbdCompoundTest
+    : public AV1ConvolveTest<highbd_convolve_2d_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+      for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+        for (int sub_x = 0; sub_x < 16; ++sub_x) {
+          for (int sub_y = 0; sub_y < 16; ++sub_y) {
+            for (const auto &compound : compound_params) {
+              TestConvolve(static_cast<InterpFilter>(h_f),
+                           static_cast<InterpFilter>(v_f), sub_x, sub_y,
+                           compound);
+            }
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y,
+                    const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+    const uint16_t *input1 = FirstRandomInput16(GetParam());
+    const uint16_t *input2 = SecondRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(av1_highbd_dist_wtd_convolve_2d, input1, input2, reference,
+             reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound, h_f, v_f, sub_x, sub_y);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void Convolve(highbd_convolve_2d_func test_func, const uint16_t *src1,
+                const uint16_t *src2, uint16_t *dst, uint16_t *conv_buf,
+                const CompoundParam &compound, const InterpFilter h_f,
+                const InterpFilter v_f, const int sub_x, const int sub_y) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const int bit_depth = GetParam().BitDepth();
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, filter_params_x,
+              filter_params_y, sub_x, sub_y, &conv_params, bit_depth);
+
+    conv_params =
+        GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, filter_params_x,
+              filter_params_y, sub_x, sub_y, &conv_params, bit_depth);
+  }
+};
+
+TEST_P(AV1Convolve2DHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1Convolve2DHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1Convolve2DHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1Convolve2DHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+}  // namespace
diff --git a/test/av1_encoder_parms_get_to_decoder.cc b/test/av1_encoder_parms_get_to_decoder.cc
index 76b82f5..717584d 100644
--- a/test/av1_encoder_parms_get_to_decoder.cc
+++ b/test/av1_encoder_parms_get_to_decoder.cc
@@ -155,6 +155,6 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
 
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderParmsGetToDecoder,
-                          ::testing::ValuesIn(kAV1EncodeParameterSet));
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderParmsGetToDecoder,
+                           ::testing::ValuesIn(kAV1EncodeParameterSet));
 }  // namespace
diff --git a/test/av1_ext_tile_test.cc b/test/av1_ext_tile_test.cc
index 424d2f06..ad45667 100644
--- a/test/av1_ext_tile_test.cc
+++ b/test/av1_ext_tile_test.cc
@@ -199,7 +199,7 @@
 
 TEST_P(AV1ExtTileTest, DecoderResultTest) { TestRoundTrip(); }
 
-AV1_INSTANTIATE_TEST_CASE(
+AV1_INSTANTIATE_TEST_SUITE(
     // Now only test 2-pass mode.
     AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood),
     ::testing::Range(1, 4));
@@ -208,7 +208,7 @@
 
 TEST_P(AV1ExtTileTestLarge, DecoderResultTest) { TestRoundTrip(); }
 
-AV1_INSTANTIATE_TEST_CASE(
+AV1_INSTANTIATE_TEST_SUITE(
     // Now only test 2-pass mode.
     AV1ExtTileTestLarge, ::testing::Values(::libaom_test::kTwoPassGood),
     ::testing::Range(0, 1));
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
index dd60665..0e7eb09 100644
--- a/test/av1_fwd_txfm2d_test.cc
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -354,6 +354,7 @@
 typedef std::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
 
 class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FwdTxfm2dTest);
 
 TEST_P(AV1FwdTxfm2dTest, match) {
   AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
@@ -418,6 +419,20 @@
                                  Values(av1_lowbd_fwd_txfm_avx2)));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+
+static TX_SIZE fwd_txfm_for_neon[] = { TX_4X4,   TX_8X8,   TX_16X16, TX_32X32,
+                                       TX_64X64, TX_4X8,   TX_8X4,   TX_8X16,
+                                       TX_16X8,  TX_16X32, TX_32X16, TX_32X64,
+                                       TX_64X32, TX_4X16,  TX_16X4,  TX_8X32,
+                                       TX_32X8,  TX_16X64, TX_64X16 };
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1FwdTxfm2dTest,
+                         Combine(ValuesIn(fwd_txfm_for_neon),
+                                 Values(av1_lowbd_fwd_txfm_neon)));
+
+#endif  // HAVE_NEON
+
 typedef void (*Highbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
                                      int diff_stride, TxfmParam *txfm_param);
 
@@ -548,6 +563,7 @@
 
 class AV1HighbdFwdTxfm2dTest
     : public ::testing::TestWithParam<HighbdFwdTxfm2dParam> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdFwdTxfm2dTest);
 
 TEST_P(AV1HighbdFwdTxfm2dTest, match) {
   AV1HighbdFwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
@@ -580,4 +596,17 @@
                          Combine(ValuesIn(Highbd_fwd_txfm_for_avx2),
                                  Values(av1_highbd_fwd_txfm)));
 #endif  // HAVE_AVX2
+
+#if HAVE_NEON
+static TX_SIZE Highbd_fwd_txfm_for_neon[] = {
+  TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_64X64, TX_4X8,   TX_8X4,
+  TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
+  TX_16X4, TX_8X32, TX_32X8,  TX_16X64, TX_64X16
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdFwdTxfm2dTest,
+                         Combine(ValuesIn(Highbd_fwd_txfm_for_neon),
+                                 Values(av1_highbd_fwd_txfm)));
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index 8fea500..a576c0f 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -129,6 +129,7 @@
   uint16_t *output_;
   uint16_t *output_ref_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdInvHTNxN);
 
 void AV1HighbdInvHTNxN::RunBitexactCheck() {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -204,6 +205,7 @@
  private:
   HighbdInvTxfm2dFunc target_func_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdInvTxfm2d);
 
 void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_,
                                              int run_times, int bit_depth_,
@@ -359,4 +361,10 @@
 INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdInvTxfm2d,
                          ::testing::Values(av1_highbd_inv_txfm_add_avx2));
 #endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdInvTxfm2d,
+                         ::testing::Values(av1_highbd_inv_txfm_add_neon));
+#endif
+
 }  // namespace
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index eacdf85..d14acfe 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -272,6 +272,7 @@
  private:
   LbdInvTxfm2dFunc target_func_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1LbdInvTxfm2d);
 
 void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size,
                                           int run_times, int gt_int16) {
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
new file mode 100644
index 0000000..cda0c79
--- /dev/null
+++ b/test/av1_k_means_test.cc
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+#include <tuple>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "av1/encoder/palette.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1Kmeans {
+typedef void (*av1_calc_indices_dim1_func)(const int *data,
+                                           const int *centroids,
+                                           uint8_t *indices, int n, int k);
+
+typedef std::tuple<av1_calc_indices_dim1_func, BLOCK_SIZE>
+    av1_calc_indices_dim1Param;
+
+class AV1KmeansTest
+    : public ::testing::TestWithParam<av1_calc_indices_dim1Param> {
+ public:
+  ~AV1KmeansTest();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(av1_calc_indices_dim1_func test_impl, BLOCK_SIZE bsize,
+                      int centroids);
+  void RunSpeedTest(av1_calc_indices_dim1_func test_impl, BLOCK_SIZE bsize,
+                    int centroids);
+  bool CheckResult(int n) {
+    for (int idx = 0; idx < n; ++idx) {
+      if (indices1_[idx] != indices2_[idx]) {
+        printf("%d ", idx);
+        printf("%d != %d ", indices1_[idx], indices2_[idx]);
+        return false;
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  int data_[5096];
+  int centroids_[8];
+  uint8_t indices1_[5096];
+  uint8_t indices2_[5096];
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest);
+
+AV1KmeansTest::~AV1KmeansTest() { ; }
+
+void AV1KmeansTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  /*uint8_t indices1_[5096];
+  uint8_t indices2_[5096];
+  int data_[5096];*/
+  for (int i = 0; i < 5096; ++i) {
+    data_[i] = (int)rnd_.Rand8() << 4;
+  }
+  for (int i = 0; i < 8; i++) {
+    centroids_[i] = (int)rnd_.Rand8() << 4;
+  }
+}
+
+void AV1KmeansTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1KmeansTest::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
+                                   BLOCK_SIZE bsize, int k) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int n = w * h;
+  av1_calc_indices_dim1_c(data_, centroids_, indices1_, n, k);
+  test_impl(data_, centroids_, indices2_, n, k);
+
+  ASSERT_EQ(CheckResult(n), true) << " block " << bsize << " Centroids " << n;
+}
+
+void AV1KmeansTest::RunSpeedTest(av1_calc_indices_dim1_func test_impl,
+                                 BLOCK_SIZE bsize, int k) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int n = w * h;
+  const int num_loops = 1000000000 / n;
+
+  av1_calc_indices_dim1_func funcs[2] = { av1_calc_indices_dim1_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    av1_calc_indices_dim1_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(data_, centroids_, indices1_, n, k);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("av1_calc_indices_dim1 indices= %d centroids=%d: %7.2f/%7.2fns", n, k,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1KmeansTest, CheckOutput) {
+  // centroids = 2..8
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 2);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 3);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 4);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 5);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 6);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 7);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+TEST_P(AV1KmeansTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 2);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 3);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 4);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 5);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 6);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 7);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+#if HAVE_AVX2
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X32,
+                                       BLOCK_16X8,  BLOCK_16X16, BLOCK_16X32,
+                                       BLOCK_32X8,  BLOCK_32X16, BLOCK_32X32,
+                                       BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+                                       BLOCK_16X64, BLOCK_64X16 };
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1KmeansTest,
+    ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_avx2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+}  // namespace AV1Kmeans
diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc
index c03cba8..04b44a2 100644
--- a/test/av1_nn_predict_test.cc
+++ b/test/av1_nn_predict_test.cc
@@ -70,6 +70,7 @@
   float *bias[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
   float *weights_buf = nullptr, *bias_buf = nullptr;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(NnPredictTest);
 
 void NnPredictTest::RunNnPredictTest(const NN_CONFIG *const shape) {
   libaom_test::ClearSystemState();
@@ -214,4 +215,9 @@
                          ::testing::Values(av1_nn_predict_sse3));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, NnPredictTest,
+                         ::testing::Values(av1_nn_predict_neon));
+#endif
+
 }  // namespace
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index 39a3c33..020ae54 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -202,6 +202,7 @@
 
   QuantizeFuncParams params_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1QuantizeTest);
 
 TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
 TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
diff --git a/test/av1_round_shift_array_test.cc b/test/av1_round_shift_array_test.cc
index 993fa9f..07f6b56 100644
--- a/test/av1_round_shift_array_test.cc
+++ b/test/av1_round_shift_array_test.cc
@@ -53,6 +53,7 @@
 
   libaom_test::ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompRoundShiftTest);
 
 AV1CompRoundShiftTest::~AV1CompRoundShiftTest() { ; }
 
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc
index f9dc838..69280b4 100644
--- a/test/av1_wedge_utils_test.cc
+++ b/test/av1_wedge_utils_test.cc
@@ -164,6 +164,7 @@
  protected:
   static const int kIterations = 10000;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsSSEOptTest);
 
 TEST_P(WedgeUtilsSSEOptTest, RandomValues) {
   DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
@@ -230,6 +231,7 @@
   static const int kIterations = 10000;
   static const int kMaxSize = 8196;  // Size limited by SIMD implementation.
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsSignOptTest);
 
 TEST_P(WedgeUtilsSignOptTest, RandomValues) {
   DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
@@ -329,6 +331,7 @@
  protected:
   static const int kIterations = 10000;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsDeltaSquaresOptTest);
 
 TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
   DECLARE_ALIGNED(32, int16_t, a[MAX_SB_SQUARE]);
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 1742aec..f5c9212 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -176,7 +176,37 @@
     ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
     ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
-        << "Output mismatch";
+        << "Output mismatch\n";
+  }
+
+  void RunSpeedTest() {
+    const int numIter = 5000000;
+    printf("Height = %d number of iteration is %d \n", height_, numIter);
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      c_func_(hbuf_c_, source_data_, 0, height_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    aom_usec_timer asm_timer_;
+    aom_usec_timer_start(&asm_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      asm_func_(hbuf_asm_, source_data_, 0, height_);
+    }
+    aom_usec_timer_mark(&asm_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int asm_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+           asm_sum_time,
+           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch\n";
   }
 
  private:
@@ -185,6 +215,7 @@
   int16_t *hbuf_asm_;
   int16_t *hbuf_c_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
 
 typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
 
@@ -205,6 +236,34 @@
     ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
     EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
   }
+  void RunSpeedTest() {
+    const int numIter = 5000000;
+    printf("Width = %d number of iteration is %d \n", width_, numIter);
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      sum_c_ = c_func_(source_data_, width_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    aom_usec_timer asm_timer_;
+    aom_usec_timer_start(&asm_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      sum_asm_ = asm_func_(source_data_, width_);
+    }
+    aom_usec_timer_mark(&asm_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int asm_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+           asm_sum_time,
+           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+  }
 
  private:
   IntProColFunc asm_func_;
@@ -212,6 +271,7 @@
   int16_t sum_asm_;
   int16_t sum_c_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
 
 TEST_P(IntProRowTest, MinValue) {
   FillConstant(0);
@@ -228,6 +288,11 @@
   RunComparison();
 }
 
+TEST_P(IntProRowTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+
 TEST_P(IntProColTest, MinValue) {
   FillConstant(0);
   RunComparison();
@@ -243,6 +308,161 @@
   RunComparison();
 }
 
+TEST_P(IntProColTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+class VectorVarTestBase : public ::testing::Test {
+ public:
+  explicit VectorVarTestBase(int bwl) { m_bwl = bwl; }
+  VectorVarTestBase() {}
+  ~VectorVarTestBase() {}
+
+ protected:
+  static const int kDataAlignment = 16;
+
+  virtual void SetUp() {
+    width = 4 << m_bwl;
+
+    ref_vector = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, width * sizeof(ref_vector[0])));
+    ASSERT_TRUE(ref_vector != NULL);
+    src_vector = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, width * sizeof(src_vector[0])));
+    ASSERT_TRUE(src_vector != NULL);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+  virtual void TearDown() {
+    aom_free(ref_vector);
+    ref_vector = NULL;
+    aom_free(src_vector);
+    src_vector = NULL;
+    libaom_test::ClearSystemState();
+  }
+
+  void FillConstant(int16_t fill_constant_ref, int16_t fill_constant_src) {
+    for (int i = 0; i < width; ++i) {
+      ref_vector[i] = fill_constant_ref;
+      src_vector[i] = fill_constant_src;
+    }
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < width; ++i) {
+      ref_vector[i] =
+          rnd_.Rand16() % max_range;  // acc. aom_vector_var_c brief.
+      src_vector[i] = rnd_.Rand16() % max_range;
+    }
+  }
+
+  int width;
+  int m_bwl;
+  int16_t *ref_vector;
+  int16_t *src_vector;
+  ACMRandom rnd_;
+
+  static const int max_range = 510;
+  static const int num_random_cmp = 50;
+};
+
+typedef int (*VectorVarFunc)(const int16_t *ref, const int16_t *src,
+                             const int bwl);
+
+typedef std::tuple<int, VectorVarFunc, VectorVarFunc> VecVarFunc;
+
+class VectorVarTest : public VectorVarTestBase,
+                      public ::testing::WithParamInterface<VecVarFunc> {
+ public:
+  VectorVarTest()
+      : VectorVarTestBase(GET_PARAM(0)), c_func(GET_PARAM(1)),
+        simd_func(GET_PARAM(2)) {}
+
+ protected:
+  int calcVarC() { return c_func(ref_vector, src_vector, m_bwl); }
+  int calcVarSIMD() { return simd_func(ref_vector, src_vector, m_bwl); }
+
+  VectorVarFunc c_func;
+  VectorVarFunc simd_func;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VectorVarTest);
+
+TEST_P(VectorVarTest, MaxVar) {
+  FillConstant(0, max_range);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, MaxVarRev) {
+  FillConstant(max_range, 0);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, ZeroDiff) {
+  FillConstant(0, 0);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, ZeroDiff2) {
+  FillConstant(max_range, max_range);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, Constant) {
+  FillConstant(30, 90);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, Random) {
+  for (size_t i = 0; i < num_random_cmp; i++) {
+    FillRandom();
+    int c_var = calcVarC();
+    int simd_var = calcVarSIMD();
+    ASSERT_EQ(c_var, simd_var);
+  }
+}
+TEST_P(VectorVarTest, DISABLED_Speed) {
+  FillRandom();
+  const int numIter = 50000;
+  printf("Width = %d number of iteration is %d \n", width, numIter);
+
+  int sum_c_var = 0;
+  int c_var = 0;
+
+  aom_usec_timer c_timer_;
+  aom_usec_timer_start(&c_timer_);
+  for (size_t i = 0; i < numIter; i++) {
+    c_var = calcVarC();
+    sum_c_var += c_var;
+  }
+  aom_usec_timer_mark(&c_timer_);
+
+  int simd_var = 0;
+  int sum_simd_var = 0;
+  aom_usec_timer simd_timer_;
+  aom_usec_timer_start(&simd_timer_);
+  for (size_t i = 0; i < numIter; i++) {
+    simd_var = calcVarSIMD();
+    sum_simd_var += simd_var;
+  }
+  aom_usec_timer_mark(&simd_timer_);
+
+  const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+  const int simd_sum_time =
+      static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+  printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+         simd_sum_time,
+         (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+  EXPECT_EQ(c_var, simd_var) << "Output mismatch \n";
+  EXPECT_EQ(sum_c_var, sum_simd_var) << "Output mismatch \n";
+}
+
 using std::make_tuple;
 
 INSTANTIATE_TEST_SUITE_P(
@@ -286,6 +506,158 @@
                       make_tuple(16, 16, 0, 4, &aom_avg_4x4_neon),
                       make_tuple(16, 16, 5, 4, &aom_avg_4x4_neon),
                       make_tuple(32, 32, 15, 4, &aom_avg_4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, IntProRowTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(128, &aom_int_pro_row_neon,
+                                 &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, IntProColTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(128, &aom_int_pro_col_neon,
+                                 &aom_int_pro_col_c)));
+#endif
+
+typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
+typedef ::testing::tuple<int, SatdFunc, SatdFunc> SatdTestParam;
+class SatdTest : public ::testing::Test,
+                 public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+  virtual void SetUp() {
+    satd_size_ = GET_PARAM(0);
+    satd_func_ref_ = GET_PARAM(1);
+    satd_func_simd_ = GET_PARAM(2);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<tran_low_t *>(
+        aom_memalign(32, sizeof(*src_) * satd_size_));
+    ASSERT_TRUE(src_ != NULL);
+  }
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src_);
+  }
+  void FillConstant(const tran_low_t val) {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+  }
+  void FillRandom() {
+    for (int i = 0; i < satd_size_; ++i) {
+      src_[i] = static_cast<int16_t>(rnd_.Rand16());
+    }
+  }
+  void Check(int expected) {
+    int total_ref;
+    ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+    EXPECT_EQ(expected, total_ref);
+
+    int total_simd;
+    ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+    EXPECT_EQ(expected, total_simd);
+  }
+  void RunComparison() {
+    int total_ref;
+    ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+
+    int total_simd;
+    ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+
+    EXPECT_EQ(total_ref, total_simd);
+  }
+  void RunSpeedTest() {
+    const int numIter = 500000;
+    printf("size = %d number of iteration is %d \n", satd_size_, numIter);
+
+    int total_ref;
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      total_ref = satd_func_ref_(src_, satd_size_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    int total_simd;
+    aom_usec_timer simd_timer_;
+    aom_usec_timer_start(&simd_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      total_simd = satd_func_simd_(src_, satd_size_);
+    }
+    aom_usec_timer_mark(&simd_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int simd_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+    EXPECT_EQ(total_ref, total_simd) << "Output mismatch \n";
+  }
+  int satd_size_;
+
+ private:
+  tran_low_t *src_;
+  SatdFunc satd_func_ref_;
+  SatdFunc satd_func_simd_;
+  ACMRandom rnd_;
+};
+
+TEST_P(SatdTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+TEST_P(SatdTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+TEST_P(SatdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+TEST_P(SatdTest, Match) {
+  FillRandom();
+  RunComparison();
+}
+TEST_P(SatdTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdTest);
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdTest,
+    ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_neon),
+                      make_tuple(64, &aom_satd_c, &aom_satd_neon),
+                      make_tuple(256, &aom_satd_c, &aom_satd_neon),
+                      make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VectorVarTest,
+    ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(3, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(4, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
 #endif
 
 }  // namespace
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index 5c2c291..fc45664 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -190,6 +190,7 @@
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B);
 
 TEST_P(BlendA64MaskTest8B, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
@@ -304,6 +305,7 @@
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B_d16);
 
 TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
@@ -404,6 +406,7 @@
 
   int bit_depth_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTestHBD);
 
 TEST_P(BlendA64MaskTestHBD, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
diff --git a/test/blockd_test.cc b/test/block_test.cc
similarity index 65%
rename from test/blockd_test.cc
rename to test/block_test.cc
index 17e6968..510a756 100644
--- a/test/blockd_test.cc
+++ b/test/block_test.cc
@@ -9,8 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "aom/aom_codec.h"
 #include "av1/common/blockd.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
 
 // Verify the optimized implementation of get_partition_subsize() produces the
 // same results as the Partition_Subsize lookup table in the spec.
@@ -120,3 +125,78 @@
     }
   }
 }
+
+#if CONFIG_AV1_DECODER && CONFIG_AV1_ENCODER
+namespace {
+// This class is used to validate if sb_size configured is respected
+// in the bitstream
+class SuperBlockSizeTestLarge
+    : public ::libaom_test::CodecTestWith3Params<
+          libaom_test::TestMode, aom_superblock_size_t, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  SuperBlockSizeTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        superblock_size_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
+    sb_size_violated_ = false;
+  }
+  virtual ~SuperBlockSizeTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, superblock_size_);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec &&
+        superblock_size_ != AOM_SUPERBLOCK_SIZE_DYNAMIC) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      aom_superblock_size_t sb_size;
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SB_SIZE, &sb_size);
+      if (superblock_size_ != sb_size) {
+        sb_size_violated_ = true;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  aom_superblock_size_t superblock_size_;
+  bool sb_size_violated_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(SuperBlockSizeTestLarge, SuperBlockSizeTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(sb_size_violated_, false)
+      << "Failed for SB size " << superblock_size_;
+}
+
+AV1_INSTANTIATE_TEST_SUITE(SuperBlockSizeTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(AOM_SUPERBLOCK_SIZE_64X64,
+                                             AOM_SUPERBLOCK_SIZE_128X128),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+}  // namespace
+#endif
diff --git a/test/borders_test.cc b/test/borders_test.cc
index 31eacab..841f0d9 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -80,6 +80,6 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_CASE(BordersTestLarge,
-                          ::testing::Values(::libaom_test::kTwoPassGood));
+AV1_INSTANTIATE_TEST_SUITE(BordersTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood));
 }  // namespace
diff --git a/test/cdef_test.cc b/test/cdef_test.cc
index a2ec1e3..14fa12b 100644
--- a/test/cdef_test.cc
+++ b/test/cdef_test.cc
@@ -53,8 +53,10 @@
   cdef_filter_block_func cdef;
   cdef_filter_block_func ref_cdef;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFBlockTest);
 
 typedef CDEFBlockTest CDEFSpeedTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFSpeedTest);
 
 void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
                cdef_filter_block_func ref_cdef, int boundary, int depth) {
@@ -202,8 +204,10 @@
   find_dir_t finddir;
   find_dir_t ref_finddir;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirTest);
 
 typedef CDEFFindDirTest CDEFFindDirSpeedTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirSpeedTest);
 
 void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
                                  int coeff_shift),
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index d297315..6959dbe 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -183,6 +183,7 @@
   cfl_subtract_average_fn sub_avg;
   cfl_subtract_average_fn sub_avg_ref;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubAvgTest);
 
 TEST_P(CFLSubAvgTest, SubAvgTest) {
   for (int it = 0; it < NUM_ITERATIONS; it++) {
@@ -286,6 +287,7 @@
     fun_444_ref = cfl_get_luma_subsampling_444_lbd_c(tx_size);
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubsampleLBDTest);
 
 TEST_P(CFLSubsampleLBDTest, SubsampleLBD420Test) {
   subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
@@ -329,6 +331,7 @@
     fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(tx_size);
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubsampleHBDTest);
 
 TEST_P(CFLSubsampleHBDTest, SubsampleHBD420Test) {
   subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
@@ -372,6 +375,7 @@
   cfl_predict_lbd_fn predict;
   cfl_predict_lbd_fn predict_ref;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLPredictTest);
 
 TEST_P(CFLPredictTest, PredictTest) {
   for (int it = 0; it < NUM_ITERATIONS; it++) {
@@ -419,6 +423,7 @@
   cfl_predict_hbd_fn predict;
   cfl_predict_hbd_fn predict_ref;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLPredictHBDTest);
 
 TEST_P(CFLPredictHBDTest, PredictHBDTest) {
   int bd = 12;
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 801b894..5ceb70b 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -161,7 +161,7 @@
 
 const libaom_test::AV1CodecFactory kAV1;
 
-#define AV1_INSTANTIATE_TEST_CASE(test, ...)                                \
+#define AV1_INSTANTIATE_TEST_SUITE(test, ...)                               \
   INSTANTIATE_TEST_SUITE_P(                                                 \
       AV1, test,                                                            \
       ::testing::Combine(                                                   \
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index ac625a7..4218ac3 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -13,10 +13,15 @@
 
 using libaom_test::ACMRandom;
 using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest);
 using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest);
 #if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest);
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(
+    AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest);
 #endif
 using std::make_tuple;
 using std::tuple;
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
index b666306..46e1e55 100644
--- a/test/comp_mask_variance_test.cc
+++ b/test/comp_mask_variance_test.cc
@@ -35,12 +35,15 @@
                                     int ref_stride, const uint8_t *mask,
                                     int mask_stride, int invert_mask);
 
-#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AV2
+#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AVX2
 const BLOCK_SIZE kValidBlockSize[] = {
-  BLOCK_8X8,   BLOCK_8X16, BLOCK_8X32,  BLOCK_16X8,  BLOCK_16X16,
-  BLOCK_16X32, BLOCK_32X8, BLOCK_32X16, BLOCK_32X32,
+  BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X32,   BLOCK_16X8,   BLOCK_16X16,
+  BLOCK_16X32, BLOCK_32X8,  BLOCK_32X16,  BLOCK_32X32,  BLOCK_32X64,
+  BLOCK_64X32, BLOCK_64X64, BLOCK_64X128, BLOCK_128X64, BLOCK_128X128,
+  BLOCK_16X64, BLOCK_64X16
 };
 #endif
+
 typedef std::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
 
 class AV1CompMaskVarianceTest
@@ -53,6 +56,8 @@
 
  protected:
   void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
+  void RunCheckDiffMask(comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+                        int inv);
   void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
   bool CheckResult(int width, int height) {
     for (int y = 0; y < height; ++y) {
@@ -75,6 +80,7 @@
   uint8_t *ref_buffer_;
   uint8_t *ref_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompMaskVarianceTest);
 
 AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() { ; }
 
@@ -119,6 +125,23 @@
   }
 }
 
+void AV1CompMaskVarianceTest::RunCheckDiffMask(comp_mask_pred_func test_impl,
+                                               BLOCK_SIZE bsize, int inv) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  static uint8_t *mask;
+  mask = (uint8_t *)malloc(64 * w * h);
+  av1_diffwtd_mask(mask, inv, 38, pred_, w, ref_, MAX_SB_SIZE, h, w);
+
+  aom_comp_mask_pred_c(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w,
+                       inv);
+  test_impl(comp_pred2_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, inv);
+
+  ASSERT_EQ(CheckResult(w, h), true) << " Diffwtd "
+                                     << " inv " << inv;
+  free(mask);
+}
+
 void AV1CompMaskVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
                                            BLOCK_SIZE bsize) {
   const int w = block_size_wide[bsize];
@@ -150,6 +173,8 @@
   // inv = 0, 1
   RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
   RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+  RunCheckDiffMask(GET_PARAM(0), GET_PARAM(1), 0);
+  RunCheckDiffMask(GET_PARAM(0), GET_PARAM(1), 1);
 }
 
 TEST_P(AV1CompMaskVarianceTest, DISABLED_Speed) {
@@ -296,6 +321,8 @@
  protected:
   void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
                       int inv);
+  void RunCheckDiffMask(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+                        int inv);
   void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
   bool CheckResult(int width, int height) {
     for (int y = 0; y < height; ++y) {
@@ -318,6 +345,7 @@
   uint16_t *ref_buffer_;
   uint16_t *ref_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdCompMaskVarianceTest);
 
 AV1HighbdCompMaskVarianceTest::~AV1HighbdCompMaskVarianceTest() { ; }
 
@@ -329,9 +357,9 @@
       (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_));
   comp_pred2_ =
       (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_));
-  pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_));
+  pred_ = (uint16_t *)aom_memalign(16, 4 * MAX_SB_SQUARE * sizeof(*pred_));
   ref_buffer_ = (uint16_t *)aom_memalign(
-      16, (MAX_SB_SQUARE + (8 * MAX_SB_SIZE)) * sizeof(*ref_buffer_));
+      16, (4 * MAX_SB_SQUARE + (8 * MAX_SB_SIZE)) * sizeof(*ref_buffer_));
   ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
 }
 
@@ -372,6 +400,35 @@
   }
 }
 
+void AV1HighbdCompMaskVarianceTest::RunCheckDiffMask(
+    highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
+  int bd_ = GET_PARAM(2);
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+  }
+  static uint8_t *mask;
+  mask = (uint8_t *)malloc(64 * w * h);
+  av1_diffwtd_mask_highbd(mask, inv, 38, pred_, w, ref_, MAX_SB_SIZE, h, w,
+                          bd_);
+
+  aom_highbd_comp_mask_pred_c(
+      CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+      CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
+
+  test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h,
+            CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
+
+  ASSERT_EQ(CheckResult(w, h), true) << " Diffwtd "
+                                     << " inv " << inv;
+  free(mask);
+}
+
 void AV1HighbdCompMaskVarianceTest::RunSpeedTest(
     highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize) {
   int bd_ = GET_PARAM(2);
@@ -415,6 +472,8 @@
   // inv = 0, 1
   RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
   RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+  RunCheckDiffMask(GET_PARAM(0), GET_PARAM(1), 0);
+  RunCheckDiffMask(GET_PARAM(0), GET_PARAM(1), 1);
 }
 
 TEST_P(AV1HighbdCompMaskVarianceTest, DISABLED_Speed) {
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 0b1eea1..f4f8f39 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -39,10 +39,9 @@
                              int w, int h);
 
 struct ConvolveFunctions {
-  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc h8, ConvolveFunc v8, int bd)
-      : copy_(copy), h8_(h8), v8_(v8), use_highbd_(bd) {}
+  ConvolveFunctions(ConvolveFunc h8, ConvolveFunc v8, int bd)
+      : h8_(h8), v8_(v8), use_highbd_(bd) {}
 
-  ConvolveFunc copy_;
   ConvolveFunc h8_;
   ConvolveFunc v8_;
   int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
@@ -269,7 +268,7 @@
 
 class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
  public:
-  static void SetUpTestCase() {
+  static void SetUpTestSuite() {
     // Force input_ to be unaligned, output to be 16 byte aligned.
     input_ = reinterpret_cast<uint8_t *>(
                  aom_memalign(kDataAlignment, kInputBufferSize + 1)) +
@@ -293,7 +292,7 @@
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
 
-  static void TearDownTestCase() {
+  static void TearDownTestSuite() {
     aom_free(input_ - 1);
     input_ = NULL;
     aom_free(ref8_);
@@ -479,22 +478,6 @@
 
 TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
 
-TEST_P(ConvolveTest, Copy) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-
-  ASM_REGISTER_STATE_CHECK(UUT_->copy_(in, kInputStride, out, kOutputStride,
-                                       NULL, 0, NULL, 0, Width(), Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                lookup(in, y * kInputStride + x))
-          << "(" << x << "," << y << ")";
-}
-
 const int kNumFilterBanks = SWITCHABLE_FILTERS;
 const int kNumFilters = 16;
 
@@ -561,9 +544,7 @@
                 in, kInputStride, out, kOutputStride, filters[filter_x], 16,
                 kInvalidFilter, 16, Width(), Height()));
           else
-            ASM_REGISTER_STATE_CHECK(UUT_->copy_(
-                in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                kInvalidFilter, 0, Width(), Height()));
+            continue;
 
           CheckGuardBlocks();
 
@@ -645,9 +626,7 @@
                     in, kInputStride, out, kOutputStride, filters[filter_x], 16,
                     kInvalidFilter, 16, Width(), Height()));
               else
-                ASM_REGISTER_STATE_CHECK(UUT_->copy_(
-                    in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                    kInvalidFilter, 0, Width(), Height()));
+                continue;
 
               for (int y = 0; y < Height(); ++y)
                 for (int x = 0; x < Width(); ++x)
@@ -664,26 +643,6 @@
   }
 }
 
-TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
-  const uint8_t *const in = input();
-  uint8_t *const out = output();
-  const int kNumTests = 5000000;
-  const int width = Width();
-  const int height = Height();
-  aom_usec_timer timer;
-
-  aom_usec_timer_start(&timer);
-  for (int n = 0; n < kNumTests; ++n) {
-    UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, width,
-                height);
-  }
-  aom_usec_timer_mark(&timer);
-
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
-         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
-}
-
 TEST_P(ConvolveTest, DISABLED_Speed) {
   uint8_t *const in = input();
   uint8_t *const out = output();
@@ -762,9 +721,6 @@
                       filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
   }
 #if HAVE_SSE2 && ARCH_X86_64
-WRAP(convolve_copy_sse2, 8)
-WRAP(convolve_copy_sse2, 10)
-WRAP(convolve_copy_sse2, 12)
 WRAP(convolve8_horiz_sse2, 8)
 WRAP(convolve8_vert_sse2, 8)
 WRAP(convolve8_horiz_sse2, 10)
@@ -773,26 +729,20 @@
 WRAP(convolve8_vert_sse2, 12)
 #endif  // HAVE_SSE2 && ARCH_X86_64
 
-WRAP(convolve_copy_c, 8)
 WRAP(convolve8_horiz_c, 8)
 WRAP(convolve8_vert_c, 8)
-WRAP(convolve_copy_c, 10)
 WRAP(convolve8_horiz_c, 10)
 WRAP(convolve8_vert_c, 10)
-WRAP(convolve_copy_c, 12)
 WRAP(convolve8_horiz_c, 12)
 WRAP(convolve8_vert_c, 12)
 
 #if HAVE_AVX2
-WRAP(convolve_copy_avx2, 8)
 WRAP(convolve8_horiz_avx2, 8)
 WRAP(convolve8_vert_avx2, 8)
 
-WRAP(convolve_copy_avx2, 10)
 WRAP(convolve8_horiz_avx2, 10)
 WRAP(convolve8_vert_avx2, 10)
 
-WRAP(convolve_copy_avx2, 12)
 WRAP(convolve8_horiz_avx2, 12)
 WRAP(convolve8_vert_avx2, 12)
 #endif  // HAVE_AVX2
@@ -801,21 +751,18 @@
 #undef WRAP
 
 #if CONFIG_AV1_HIGHBITDEPTH
-const ConvolveFunctions wrap_convolve8_c(wrap_convolve_copy_c_8,
-                                         wrap_convolve8_horiz_c_8,
+const ConvolveFunctions wrap_convolve8_c(wrap_convolve8_horiz_c_8,
                                          wrap_convolve8_vert_c_8, 8);
-const ConvolveFunctions wrap_convolve10_c(wrap_convolve_copy_c_10,
-                                          wrap_convolve8_horiz_c_10,
+const ConvolveFunctions wrap_convolve10_c(wrap_convolve8_horiz_c_10,
                                           wrap_convolve8_vert_c_10, 10);
-const ConvolveFunctions wrap_convolve12_c(wrap_convolve_copy_c_12,
-                                          wrap_convolve8_horiz_c_12,
+const ConvolveFunctions wrap_convolve12_c(wrap_convolve8_horiz_c_12,
                                           wrap_convolve8_vert_c_12, 12);
 const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(wrap_convolve8_c),
                                            ALL_SIZES(wrap_convolve10_c),
                                            ALL_SIZES(wrap_convolve12_c) };
 #else
-const ConvolveFunctions convolve8_c(aom_convolve_copy_c, aom_convolve8_horiz_c,
-                                    aom_convolve8_vert_c, 0);
+const ConvolveFunctions convolve8_c(aom_convolve8_horiz_c, aom_convolve8_vert_c,
+                                    0);
 const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
 #endif
 
@@ -824,21 +771,17 @@
 
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_AV1_HIGHBITDEPTH
-const ConvolveFunctions wrap_convolve8_sse2(wrap_convolve_copy_sse2_8,
-                                            wrap_convolve8_horiz_sse2_8,
+const ConvolveFunctions wrap_convolve8_sse2(wrap_convolve8_horiz_sse2_8,
                                             wrap_convolve8_vert_sse2_8, 8);
-const ConvolveFunctions wrap_convolve10_sse2(wrap_convolve_copy_sse2_10,
-                                             wrap_convolve8_horiz_sse2_10,
+const ConvolveFunctions wrap_convolve10_sse2(wrap_convolve8_horiz_sse2_10,
                                              wrap_convolve8_vert_sse2_10, 10);
-const ConvolveFunctions wrap_convolve12_sse2(wrap_convolve_copy_sse2_12,
-                                             wrap_convolve8_horiz_sse2_12,
+const ConvolveFunctions wrap_convolve12_sse2(wrap_convolve8_horiz_sse2_12,
                                              wrap_convolve8_vert_sse2_12, 12);
 const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(wrap_convolve8_sse2),
                                               ALL_SIZES(wrap_convolve10_sse2),
                                               ALL_SIZES(wrap_convolve12_sse2) };
 #else
-const ConvolveFunctions convolve8_sse2(aom_convolve_copy_c,
-                                       aom_convolve8_horiz_sse2,
+const ConvolveFunctions convolve8_sse2(aom_convolve8_horiz_sse2,
                                        aom_convolve8_vert_sse2, 0);
 const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
 #endif
@@ -847,8 +790,7 @@
 #endif
 
 #if HAVE_SSSE3
-const ConvolveFunctions convolve8_ssse3(aom_convolve_copy_c,
-                                        aom_convolve8_horiz_ssse3,
+const ConvolveFunctions convolve8_ssse3(aom_convolve8_horiz_ssse3,
                                         aom_convolve8_vert_ssse3, 0);
 
 const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
@@ -858,22 +800,18 @@
 
 #if HAVE_AVX2
 #if CONFIG_AV1_HIGHBITDEPTH
-const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve_copy_avx2_8,
-                                            wrap_convolve8_horiz_avx2_8,
+const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve8_horiz_avx2_8,
                                             wrap_convolve8_vert_avx2_8, 8);
-const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve_copy_avx2_10,
-                                             wrap_convolve8_horiz_avx2_10,
+const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve8_horiz_avx2_10,
                                              wrap_convolve8_vert_avx2_10, 10);
-const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve_copy_avx2_12,
-                                             wrap_convolve8_horiz_avx2_12,
+const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve8_horiz_avx2_12,
                                              wrap_convolve8_vert_avx2_12, 12);
 const ConvolveParam kArray_Convolve8_avx2[] = {
   ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2),
   ALL_SIZES_64(wrap_convolve12_avx2)
 };
 #else
-const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c,
-                                       aom_convolve8_horiz_avx2,
+const ConvolveFunctions convolve8_avx2(aom_convolve8_horiz_avx2,
                                        aom_convolve8_vert_avx2, 0);
 const ConvolveParam kArray_Convolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
 #endif
diff --git a/test/corner_match_test.cc b/test/corner_match_test.cc
index c685dca..9c3a2b9 100644
--- a/test/corner_match_test.cc
+++ b/test/corner_match_test.cc
@@ -47,6 +47,7 @@
 
   libaom_test::ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CornerMatchTest);
 
 AV1CornerMatchTest::~AV1CornerMatchTest() {}
 void AV1CornerMatchTest::SetUp() {
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 2a16497..9ef3a5c 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -169,12 +169,12 @@
 TEST_P(CpuSpeedTestLarge, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
 TEST_P(CpuSpeedTestLarge, TestLowBitrate) { TestLowBitrate(); }
 
-AV1_INSTANTIATE_TEST_CASE(CpuSpeedTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(1, 3));
-AV1_INSTANTIATE_TEST_CASE(CpuSpeedTestLarge,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(CpuSpeedTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Range(1, 3));
+AV1_INSTANTIATE_TEST_SUITE(CpuSpeedTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Range(0, 1));
 }  // namespace
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 053c055..6501b47 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -58,7 +58,7 @@
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.3)
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.4)
         << " The datarate for the file is greater than target by too much!";
   }
 
@@ -84,6 +84,29 @@
         << " The datarate for the file is greater than target by too much!";
   }
 
+  virtual void ErrorResilienceOnSceneCuts() {
+    if (GET_PARAM(4) > 0) return;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    cfg_.rc_target_bitrate = 500;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
   virtual void BasicRateTargetingCBRPeriodicKeyFrameTest() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 500;
@@ -108,6 +131,31 @@
         << " The datarate for the file is greater than target by too much!";
   }
 
+  virtual void CBRPeriodicKeyFrameOnSceneCuts() {
+    if (GET_PARAM(4) > 0) return;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // Periodic keyframe
+    cfg_.kf_max_dist = 30;
+    cfg_.kf_min_dist = 30;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    cfg_.rc_target_bitrate = 500;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.3)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
   virtual void BasicRateTargetingAQModeOnOffCBRTest() {
     if (GET_PARAM(4) > 0) return;
     cfg_.rc_buf_initial_sz = 500;
@@ -125,8 +173,7 @@
 
     ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320, 240,
                                          30, 1, 0, 310);
-    const int bitrate_array[1] = { 60 };
-    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    cfg_.rc_target_bitrate = 60;
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
@@ -245,6 +292,16 @@
   BasicRateTargetingCBRPeriodicKeyFrameTest();
 }
 
+// Check basic rate targeting for periodic key frame, aligned with scene change.
+TEST_P(DatarateTestLarge, PeriodicKeyFrameCBROnSceneCuts) {
+  CBRPeriodicKeyFrameOnSceneCuts();
+}
+
+// Check basic rate targeting with error resilience on for scene cuts.
+TEST_P(DatarateTestLarge, ErrorResilienceOnSceneCuts) {
+  ErrorResilienceOnSceneCuts();
+}
+
 // Check basic rate targeting for CBR.
 TEST_P(DatarateTestLarge, BasicRateTargeting444CBR) {
   BasicRateTargeting444CBRTest();
@@ -330,6 +387,16 @@
   BasicRateTargetingCBRPeriodicKeyFrameTest();
 }
 
+// Check basic rate targeting for periodic key frame, aligned with scene change.
+TEST_P(DatarateTestRealtime, PeriodicKeyFrameCBROnSceneCuts) {
+  CBRPeriodicKeyFrameOnSceneCuts();
+}
+
+// Check basic rate targeting with error resilience on for scene cuts.
+TEST_P(DatarateTestRealtime, ErrorResilienceOnSceneCuts) {
+  ErrorResilienceOnSceneCuts();
+}
+
 // Check basic rate targeting for CBR.
 TEST_P(DatarateTestRealtime, BasicRateTargeting444CBR) {
   BasicRateTargeting444CBRTest();
@@ -347,27 +414,27 @@
   ChangingSpeedTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestLarge,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 7), ::testing::Values(0, 3),
-                          ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestLarge,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(5, 7), ::testing::Values(0, 3),
+                           ::testing::Values(0, 1));
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropLarge,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 7), ::testing::Values(0, 3));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestFrameDropLarge,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(5, 7), ::testing::Values(0, 3));
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestRealtime,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(7, 9), ::testing::Values(0, 3),
-                          ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestRealtime,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(7, 10), ::testing::Values(0, 3),
+                           ::testing::Values(0, 1));
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropRealtime,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(7, 9), ::testing::Values(0, 3));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestFrameDropRealtime,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(7, 10), ::testing::Values(0, 3));
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestSpeedChangeRealtime,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Values(0, 3));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestSpeedChangeRealtime,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Values(0, 3));
 
 }  // namespace
 }  // namespace datarate_test
diff --git a/test/datarate_test.h b/test/datarate_test.h
index 3c15731..c596b22 100644
--- a/test/datarate_test.h
+++ b/test/datarate_test.h
@@ -56,6 +56,10 @@
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
       encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
       if (cfg_.g_usage == AOM_USAGE_REALTIME) {
+        encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_WARPED_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+        encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
         encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
         encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
         encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc
index 910640d..6acd50f 100644
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc
@@ -13,14 +13,13 @@
 
 #include "config/aom_config.h"
 
-#include "test/util.h"
 #include "aom/aomdx.h"
 #include "aom/aom_decoder.h"
 
 namespace {
 
 TEST(DecodeAPI, InvalidParams) {
-  static const aom_codec_iface_t *kCodecs[] = {
+  static aom_codec_iface_t *kCodecs[] = {
 #if CONFIG_AV1_DECODER
     aom_codec_av1_dx(),
 #endif
@@ -38,8 +37,9 @@
             aom_codec_decode(NULL, NULL, sizeof(buf), NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL));
   EXPECT_TRUE(aom_codec_error(NULL) != NULL);
+  EXPECT_TRUE(aom_codec_error_detail(NULL) == NULL);
 
-  for (const aom_codec_iface_t *iface : kCodecs) {
+  for (aom_codec_iface_t *iface : kCodecs) {
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
               aom_codec_dec_init(NULL, iface, NULL, 0));
 
diff --git a/test/decode_multithreaded_test.cc b/test/decode_multithreaded_test.cc
index 92253ed..5224dcc 100644
--- a/test/decode_multithreaded_test.cc
+++ b/test/decode_multithreaded_test.cc
@@ -111,7 +111,7 @@
     cfg_.rc_end_usage = AOM_VBR;
 
     libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
-                                       timebase.den, timebase.num, 0, 5);
+                                       timebase.den, timebase.num, 0, 2);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
     const char *md5_single_thread_str = md5_single_thread_.Get();
@@ -157,14 +157,14 @@
 }
 
 // TODO(ranjit): More tests have to be added using pre-generated MD5.
-AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2),
-                          ::testing::Values(1, 2), ::testing::Values(1),
-                          ::testing::Values(3), ::testing::Values(0, 1));
-AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTestLarge,
-                          ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(1, 4), ::testing::Values(0),
-                          ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2),
+                           ::testing::Values(1, 2), ::testing::Values(1),
+                           ::testing::Values(3), ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedTestLarge,
+                           ::testing::Values(0, 1, 2, 6),
+                           ::testing::Values(0, 1, 2, 6),
+                           ::testing::Values(1, 4), ::testing::Values(0),
+                           ::testing::Values(0, 1));
 
 class AV1DecodeMultiThreadedLSTestLarge
     : public AV1DecodeMultiThreadedTestLarge {};
@@ -177,9 +177,9 @@
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedLSTestLarge,
-                          ::testing::Values(6), ::testing::Values(6),
-                          ::testing::Values(1), ::testing::Values(0, 3),
-                          ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedLSTestLarge,
+                           ::testing::Values(6), ::testing::Values(6),
+                           ::testing::Values(1), ::testing::Values(0, 3),
+                           ::testing::Values(0, 1));
 
 }  // namespace
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 691337c..9f0de03 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -242,6 +242,6 @@
   printf("}\n");
 }
 
-AV1_INSTANTIATE_TEST_CASE(AV1NewEncodeDecodePerfTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood));
+AV1_INSTANTIATE_TEST_SUITE(AV1NewEncodeDecodePerfTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood));
 }  // namespace
diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh
index 2edd1cb..214755f 100755
--- a/test/decode_to_md5.sh
+++ b/test/decode_to_md5.sh
@@ -39,7 +39,7 @@
   fi
 
   eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 
@@ -65,7 +65,7 @@
   if [ "$(av1_decode_available)" = "yes" ]; then
     if [ ! -e "${AV1_IVF_FILE}" ]; then
       file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     decode_to_md5 "${file}" "av1" "${expected_md5}"
   fi
diff --git a/test/decode_with_drops.sh b/test/decode_with_drops.sh
index 155ee92..1fc13ce 100755
--- a/test/decode_with_drops.sh
+++ b/test/decode_with_drops.sh
@@ -39,7 +39,7 @@
   fi
 
   eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      "${drop_mode}" ${devnull}
+      "${drop_mode}" ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
@@ -52,13 +52,13 @@
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${AV1_IVF_FILE}" ]; then
       file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     # Drop frames 3 and 4.
-    decode_with_drops "${file}" "av1" "3-4"
+    decode_with_drops "${file}" "av1" "3-4" || return 1
 
     # Test pattern mode: Drop 3 of every 4 frames.
-    decode_with_drops "${file}" "av1" "3/4"
+    decode_with_drops "${file}" "av1" "3/4" || return 1
   fi
 }
 
diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc
index e8865c0..3e09912 100644
--- a/test/dr_prediction_test.cc
+++ b/test/dr_prediction_test.cc
@@ -471,4 +471,44 @@
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, LowbdDrPredTest,
+    ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+                                         &z1_wrapper<av1_dr_prediction_z1_neon>,
+                                         AOM_BITS_8, kZ1Start),
+                      DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+                                         &z2_wrapper<av1_dr_prediction_z2_neon>,
+                                         AOM_BITS_8, kZ2Start),
+                      DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+                                         &z3_wrapper<av1_dr_prediction_z3_neon>,
+                                         AOM_BITS_8, kZ3Start)));
+
+TEST_P(LowbdDrPredTest, DISABLED_Speed) {
+  const int angles[] = { 3, 45, 87 };
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+    for (int i = 0; i < 3; ++i) {
+      const int angle = angles[i] + start_angle_;
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+             enable_upsample_, angle);
+      if (dx_ && dy_) RunTest(true, false, angle);
+    }
+  }
+}
+
+TEST_P(LowbdDrPredTest, OperationCheck) {
+  if (params_.tst_fn == NULL) return;
+  // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 };
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+    for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      if (dx_ && dy_) RunTest(false, false, angle);
+    }
+  }
+}
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/test/dump_obu.sh b/test/dump_obu.sh
index da44dd7..7dcab94 100755
--- a/test/dump_obu.sh
+++ b/test/dump_obu.sh
@@ -51,7 +51,7 @@
       $(yuv_raw_input) \
       --ivf \
       --output=${dump_obu_test_file} \
-      ${devnull}
+      ${devnull} || return 1
 
     if [ ! -e "${dump_obu_test_file}" ]; then
       elog "dump_obu test input encode failed."
@@ -61,7 +61,7 @@
 }
 
 dump_obu() {
-  encode_test_file
+  encode_test_file || return 1
   eval $(aom_tool_path dump_obu) "${dump_obu_test_file}" ${devnull}
 }
 
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 25bdb5c..4b79f25 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -9,22 +9,18 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <cstdlib>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/aom_config.h"
 
-#include "test/util.h"
 #include "aom/aomcx.h"
 #include "aom/aom_encoder.h"
 
 namespace {
 
 TEST(EncodeAPI, InvalidParams) {
-  static const aom_codec_iface_t *kCodecs[] = {
-#if CONFIG_AV1_ENCODER
-    aom_codec_av1_cx(),
-#endif
-  };
   uint8_t buf[1] = { 0 };
   aom_image_t img;
   aom_codec_ctx_t enc;
@@ -43,31 +39,35 @@
             aom_codec_enc_config_default(NULL, &cfg, 0));
   EXPECT_TRUE(aom_codec_error(NULL) != NULL);
 
-  for (const aom_codec_iface_t *iface : kCodecs) {
-    SCOPED_TRACE(aom_codec_iface_name(iface));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_init(NULL, iface, NULL, 0));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_init(&enc, iface, NULL, 0));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_config_default(iface, &cfg, 2));
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  SCOPED_TRACE(aom_codec_iface_name(iface));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(NULL, iface, NULL, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, NULL, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_config_default(iface, &cfg, 2));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL));
 
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
-
-    EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL));
-
-    aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc);
-    EXPECT_TRUE(glob_headers->buf != NULL);
-    if (glob_headers) {
-      free(glob_headers->buf);
-      free(glob_headers);
-    }
-
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
-
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+  aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc);
+  EXPECT_TRUE(glob_headers->buf != NULL);
+  if (glob_headers) {
+    free(glob_headers->buf);
+    free(glob_headers);
   }
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeAPI, InvalidControlId) {
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_ERROR, aom_codec_control(&enc, -1, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_control(&enc, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
 
 }  // namespace
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index 390a6e0..b626acd 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -179,6 +179,6 @@
   }
 }
 
-AV1_INSTANTIATE_TEST_CASE(AV1EncodePerfTest,
-                          ::testing::Values(::libaom_test::kRealTime));
+AV1_INSTANTIATE_TEST_SUITE(AV1EncodePerfTest,
+                           ::testing::Values(::libaom_test::kRealTime));
 }  // namespace
diff --git a/test/encode_small_width_height_test.cc b/test/encode_small_width_height_test.cc
new file mode 100644
index 0000000..6f52fd5
--- /dev/null
+++ b/test/encode_small_width_height_test.cc
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Tests for https://crbug.com/aomedia/2777.
+//
+// Encode images with a small width (<= two AV1 superblocks) or a small height
+// (<= one AV1 superblock) with multiple threads. aom_codec_encode() should
+// not crash.
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+
+namespace {
+
+// Dummy buffer of zero samples.
+constexpr unsigned char kBuffer[256 * 512 + 2 * 128 * 256] = { 0 };
+
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+  // The image has only one tile and the tile is two AV1 superblocks wide.
+  // For speed >= 1, superblock size is 64x64 (see av1_select_sb_size()).
+  constexpr int kWidth = 128;
+  constexpr int kHeight = 512;
+
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+  // The image has only one tile and the tile is two AV1 superblocks wide.
+  // For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
+  constexpr int kWidth = 256;
+  constexpr int kHeight = 512;
+
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+  // The image has only one tile and the tile is one AV1 superblock tall.
+  // For speed >= 1, superblock size is 64x64 (see av1_select_sb_size()).
+  constexpr int kWidth = 512;
+  constexpr int kHeight = 64;
+
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+  // The image has only one tile and the tile is one AV1 superblock tall.
+  // For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
+  constexpr int kWidth = 512;
+  constexpr int kHeight = 128;
+
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+}  // namespace
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 01f8d50..c95a86d 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -180,9 +180,6 @@
 }
 
 void EncoderTest::RunLoop(VideoSource *video) {
-  aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
-  dec_cfg.allow_lowbitdepth = 1;
-
   stats_.Reset();
 
   ASSERT_TRUE(passes_ == 1 || passes_ == 2);
@@ -209,10 +206,11 @@
     }
 
     ASSERT_FALSE(::testing::Test::HasFatalFailure());
-
+#if CONFIG_AV1_DECODER
+    aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
+    dec_cfg.allow_lowbitdepth = 1;
     std::unique_ptr<Decoder> decoder(
         codec_->CreateDecoder(dec_cfg, 0 /* flags */));
-#if CONFIG_AV1_DECODER
     if (decoder->IsAV1()) {
       // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole
       // frame is decoded.
@@ -236,13 +234,16 @@
 
         CxDataIterator iter = encoder->GetCxData();
 
+#if CONFIG_AV1_DECODER
         bool has_cxdata = false;
         bool has_dxdata = false;
+#endif
         while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
           pkt = MutateEncoderOutputHook(pkt);
           again = true;
           switch (pkt->kind) {
             case AOM_CODEC_CX_FRAME_PKT:
+#if CONFIG_AV1_DECODER
               has_cxdata = true;
               if (decoder.get() != NULL && DoDecode()) {
                 aom_codec_err_t res_dec;
@@ -260,6 +261,7 @@
 
                 has_dxdata = true;
               }
+#endif
               ASSERT_GE(pkt->data.frame.pts, last_pts_);
               if (sl == number_spatial_layers_) last_pts_ = pkt->data.frame.pts;
               FramePktHook(pkt);
@@ -267,10 +269,12 @@
 
             case AOM_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;
 
+            case AOM_CODEC_STATS_PKT: StatsPktHook(pkt); break;
+
             default: break;
           }
         }
-
+#if CONFIG_AV1_DECODER
         if (has_dxdata && has_cxdata) {
           const aom_image_t *img_enc = encoder->GetPreviewFrame();
           DxDataIterator dec_iter = decoder->GetDxData();
@@ -284,6 +288,7 @@
           }
           if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
         }
+#endif
         if (!Continue()) break;
       }  // Loop over spatial layers
     }
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 6319a52..9744b11 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -215,6 +215,9 @@
   // Hook to be called on every PSNR packet.
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
 
+  // Hook to be called on every first pass stats packet.
+  virtual void StatsPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
+
   // Hook to determine whether the encode loop should continue.
   virtual bool Continue() const {
     return !(::testing::Test::HasFatalFailure() || abort_);
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
index 385d3f1..61838ca 100644
--- a/test/encodetxb_test.cc
+++ b/test/encodetxb_test.cc
@@ -100,6 +100,7 @@
   void SpeedTestGetNzMapContextsRun() {
     const int kNumTests = 2000000000;
     aom_usec_timer timer;
+    aom_usec_timer timer_ref;
 
     printf("Note: Only test the largest possible eob case!\n");
     for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
@@ -117,6 +118,16 @@
       levels_ = set_levels(levels_buf_, width);
       InitDataWithEob(scan, bwl, eob);
 
+      aom_usec_timer_start(&timer_ref);
+      for (int i = 0; i < numTests; ++i) {
+        av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size,
+                                  tx_class, coeff_contexts_ref_);
+      }
+      aom_usec_timer_mark(&timer_ref);
+
+      levels_ = set_levels(levels_buf_, width);
+      InitDataWithEob(scan, bwl, eob);
+
       aom_usec_timer_start(&timer);
       for (int i = 0; i < numTests; ++i) {
         get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
@@ -124,9 +135,14 @@
       }
       aom_usec_timer_mark(&timer);
 
+      const int elapsed_time_ref =
+          static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
       const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-      printf("get_nz_map_contexts_%2dx%2d: %7.1f ms\n", real_width, real_height,
-             elapsed_time / 1000.0);
+
+      printf("get_nz_map_contexts_%2dx%2d: %7.1f ms ref %7.1f ms gain %4.2f\n",
+             real_width, real_height, elapsed_time / 1000.0,
+             elapsed_time_ref / 1000.0,
+             (elapsed_time_ref * 1.0) / (elapsed_time * 1.0));
     }
   }
 
@@ -170,6 +186,7 @@
   int8_t *coeff_contexts_ref_;
   int8_t *coeff_contexts_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbTest);
 
 TEST_P(EncodeTxbTest, GetNzMapContexts) { GetNzMapContextsRun(); }
 
@@ -182,6 +199,11 @@
                          ::testing::Values(av1_get_nz_map_contexts_sse2));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, EncodeTxbTest,
+                         ::testing::Values(av1_get_nz_map_contexts_neon));
+#endif
+
 typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff,
                                          const int width, const int height,
                                          uint8_t *const levels);
@@ -195,6 +217,7 @@
   virtual void TearDown() { libaom_test::ClearSystemState(); }
   void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbInitLevelTest);
 
 void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func,
                                      int tx_size, int is_speed) {
@@ -260,4 +283,10 @@
     ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
                        ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
 #endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, EncodeTxbInitLevelTest,
+    ::testing::Combine(::testing::Values(&av1_txb_init_levels_neon),
+                       ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+#endif
 }  // namespace
diff --git a/test/end_to_end_test.cc b/test/end_to_end_test.cc
index 162a7c7..82a4a8c 100644
--- a/test/end_to_end_test.cc
+++ b/test/end_to_end_test.cc
@@ -199,13 +199,13 @@
 
 TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
-                          ::testing::ValuesIn(kEncodingModeVectors),
-                          ::testing::ValuesIn(kTestVectors),
-                          ::testing::ValuesIn(kCpuUsedVectors));
+AV1_INSTANTIATE_TEST_SUITE(EndToEndTestLarge,
+                           ::testing::ValuesIn(kEncodingModeVectors),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::ValuesIn(kCpuUsedVectors));
 
-AV1_INSTANTIATE_TEST_CASE(EndToEndTest,
-                          ::testing::Values(kEncodingModeVectors[0]),
-                          ::testing::Values(kTestVectors[2]),  // 444
-                          ::testing::Values(kCpuUsedVectors[2]));
+AV1_INSTANTIATE_TEST_SUITE(EndToEndTest,
+                           ::testing::Values(kEncodingModeVectors[0]),
+                           ::testing::Values(kTestVectors[2]),  // 444
+                           ::testing::Values(kCpuUsedVectors[2]));
 }  // namespace
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index 462661e..ea0acf4 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -67,6 +67,7 @@
   ErrorBlockFunc error_block_op_;
   ErrorBlockFunc ref_error_block_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ErrorBlockTest);
 
 TEST_P(ErrorBlockTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 1d52bb2..a30de4f 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -27,12 +27,13 @@
 const int kCpuUsed = 1;
 
 class ErrorResilienceTestLarge
-    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
       public ::libaom_test::EncoderTest {
  protected:
   ErrorResilienceTestLarge()
       : EncoderTest(GET_PARAM(0)), psnr_(0.0), nframes_(0), mismatch_psnr_(0.0),
-        mismatch_nframes_(0), encoding_mode_(GET_PARAM(1)), allow_mismatch_(0) {
+        mismatch_nframes_(0), encoding_mode_(GET_PARAM(1)), allow_mismatch_(0),
+        enable_altref_(GET_PARAM(2)) {
     Reset();
   }
 
@@ -77,7 +78,10 @@
 
   virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
                                   libaom_test::Encoder *encoder) {
-    if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, enable_altref_);
+    }
     frame_flags_ &=
         ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
           AOM_EFLAG_NO_REF_FRAME_MVS | AOM_EFLAG_ERROR_RESILIENT |
@@ -320,6 +324,7 @@
   unsigned int s_frames_[kMaxSFrames];
   libaom_test::TestMode encoding_mode_;
   int allow_mismatch_;
+  int enable_altref_;
 };
 
 TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
@@ -455,5 +460,92 @@
   EXPECT_LE(GetMismatchFrames(), GetEncodedFrames() - s_frame_list[0]);
 }
 
-AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES);
+AV1_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES,
+                           ::testing::Values(0, 1));
+
+// This class is used to check the presence of SFrame.
+class SFramePresenceTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 aom_rc_mode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  SFramePresenceTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        rc_end_usage_(GET_PARAM(2)), enable_altref_(GET_PARAM(3)) {
+    is_sframe_present_ = 0;
+    is_sframe_position_violated_ = 0;
+  }
+  virtual ~SFramePresenceTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.kf_min_dist = 0;
+    cfg_.kf_max_dist = 60;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.sframe_dist = 5;
+    if (enable_altref_) cfg_.sframe_mode = 2;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, enable_altref_);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_S_FRAME_INFO,
+                                    &sframe_info);
+      if (sframe_info.is_s_frame) {
+        is_sframe_present_ = 1;
+        if (enable_altref_ && is_sframe_position_violated_ == 0 &&
+            sframe_info.is_s_frame_at_altref == 0)
+          is_sframe_position_violated_ = 1;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  aom_rc_mode rc_end_usage_;
+  int is_sframe_present_;
+  int is_sframe_position_violated_;
+  int enable_altref_;
+  aom_s_frame_info sframe_info;
+};
+
+// TODO(http://crbug.com/aomedia/2831): Disable the S frame unit test for frame
+// scheduling re-design.
+TEST_P(SFramePresenceTestLarge, DISABLED_SFramePresenceTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 100);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(is_sframe_present_, 1);
+  if (enable_altref_) {
+    ASSERT_EQ(is_sframe_position_violated_, 0);
+  }
+}
+
+/* TODO(anyone): Currently SFramePresenceTest fails when enable_altref_ = 1.
+ * Hence this configuration is not added. Add this configuration after the
+ * bug is fixed.
+ */
+AV1_INSTANTIATE_TEST_SUITE(SFramePresenceTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ),
+                           ::testing::Values(0));
 }  // namespace
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index 306cc2f..6c2107b 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -16,9 +16,188 @@
 #include "test/encode_test_driver.h"
 #include "test/md5_helper.h"
 #include "test/util.h"
+#include "test/y4m_video_source.h"
 #include "test/yuv_video_source.h"
+#include "av1/encoder/firstpass.h"
 
 namespace {
+const size_t kFirstPassStatsSz = sizeof(FIRSTPASS_STATS);
+class AVxFirstPassEncoderThreadTest
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AVxFirstPassEncoderThreadTest()
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)),
+        tile_rows_(GET_PARAM(3)), tile_cols_(GET_PARAM(4)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+
+    row_mt_ = 1;
+    firstpass_stats_.buf = NULL;
+    firstpass_stats_.sz = 0;
+  }
+  virtual ~AVxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    encoder_initialized_ = false;
+    abort_ = false;
+  }
+
+  virtual void EndPassHook() {
+    // For first pass stats test, only run first pass encoder.
+    if (cfg_.g_pass == AOM_RC_FIRST_PASS) abort_ = true;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+                                  ::libaom_test::Encoder *encoder) {
+    if (!encoder_initialized_) {
+      // Encode in 2-pass mode.
+      SetTileSize(encoder);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+
+      encoder_initialized_ = true;
+    }
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+  }
+
+  virtual void StatsPktHook(const aom_codec_cx_pkt_t *pkt) {
+    const uint8_t *const pkt_buf =
+        reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf);
+    const size_t pkt_size = pkt->data.twopass_stats.sz;
+
+    // First pass stats size equals sizeof(FIRSTPASS_STATS)
+    EXPECT_EQ(pkt_size, kFirstPassStatsSz)
+        << "Error: First pass stats size doesn't equal kFirstPassStatsSz";
+
+    firstpass_stats_.buf =
+        realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size);
+    memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf,
+           pkt_size);
+    firstpass_stats_.sz += pkt_size;
+  }
+
+  bool encoder_initialized_;
+  ::libaom_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  int tile_rows_;
+  int tile_cols_;
+  int row_mt_;
+  aom_fixed_buf_t firstpass_stats_;
+};
+
+static void compare_fp_stats_md5(aom_fixed_buf_t *fp_stats) {
+  // fp_stats consists of 2 set of first pass encoding stats. These 2 set of
+  // stats are compared to check if the stats match.
+  uint8_t *stats1 = reinterpret_cast<uint8_t *>(fp_stats->buf);
+  uint8_t *stats2 = stats1 + fp_stats->sz / 2;
+  ::libaom_test::MD5 md5_row_mt_0, md5_row_mt_1;
+
+  md5_row_mt_0.Add(stats1, fp_stats->sz / 2);
+  const char *md5_row_mt_0_str = md5_row_mt_0.Get();
+
+  md5_row_mt_1.Add(stats2, fp_stats->sz / 2);
+  const char *md5_row_mt_1_str = md5_row_mt_1.Get();
+
+  // Check md5 match.
+  ASSERT_STREQ(md5_row_mt_0_str, md5_row_mt_1_str)
+      << "MD5 checksums don't match";
+}
+
+TEST_P(AVxFirstPassEncoderThreadTest, FirstPassStatsTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  aom_fixed_buf_t firstpass_stats;
+  size_t single_run_sz;
+
+  cfg_.rc_target_bitrate = 1000;
+
+  // 5 encodes will be run:
+  // 1. row_mt_=0 and threads=1
+  // 2. row_mt_=1 and threads=1
+  // 3. row_mt_=1 and threads=2
+  // 4. row_mt_=1 and threads=4
+  // 5. row_mt_=1 and threads=8
+
+  // 4 comparisons will be made:
+  // 1. Between run 1 and run 2.
+  // 2. Between run 2 and run 3.
+  // 3. Between run 3 and run 4.
+  // 4. Between run 4 and run 5.
+
+  // Test row_mt_: 0 vs 1 at single thread case(threads = 1)
+  cfg_.g_threads = 1;
+
+  row_mt_ = 0;
+  init_flags_ = AOM_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  row_mt_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  firstpass_stats.buf = firstpass_stats_.buf;
+  firstpass_stats.sz = firstpass_stats_.sz;
+  single_run_sz = firstpass_stats_.sz / 2;
+
+  // Compare to check if using or not using row-mt are bit exact.
+  // Comparison 1 (between row_mt_=0 and row_mt_=1).
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+  // Test single thread vs multiple threads
+  row_mt_ = 1;
+
+  cfg_.g_threads = 2;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // offset to the 2nd and 3rd run.
+  firstpass_stats.buf = reinterpret_cast<void *>(
+      reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz);
+
+  // Compare to check if single-thread and multi-thread stats are bit exact.
+  // Comparison 2 (between threads=1 and threads=2).
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+  cfg_.g_threads = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // offset to the 3rd and 4th run
+  firstpass_stats.buf = reinterpret_cast<void *>(
+      reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz * 2);
+
+  // Comparison 3 (between threads=2 and threads=4).
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+  cfg_.g_threads = 8;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // offset to the 4th and 5th run.
+  firstpass_stats.buf = reinterpret_cast<void *>(
+      reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz * 3);
+
+  // Comparison 4 (between threads=4 and threads=8).
+  compare_fp_stats_md5(&firstpass_stats);
+}
+
 class AVxEncoderThreadTest
     : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
                                                  int, int, int>,
@@ -51,7 +230,7 @@
     SetMode(encoding_mode_);
 
     if (encoding_mode_ != ::libaom_test::kRealTime) {
-      cfg_.g_lag_in_frames = 5;
+      cfg_.g_lag_in_frames = 6;
       cfg_.rc_end_usage = AOM_VBR;
       cfg_.rc_2pass_vbr_minsection_pct = 5;
       cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -76,9 +255,10 @@
       encoder->Control(AV1E_SET_ROW_MT, row_mt_);
       if (encoding_mode_ != ::libaom_test::kRealTime) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 5);
         encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
         encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+        encoder->Control(AV1E_SET_MAX_GF_INTERVAL, 4);
       } else {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
         encoder->Control(AV1E_SET_AQ_MODE, 3);
@@ -152,6 +332,9 @@
       ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
       ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
       ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
+
+      DoTestMaxThreads(&video, single_thr_size_enc, single_thr_md5_enc,
+                       single_thr_md5_dec);
     } else if (row_mt_ == 1) {
       // Encode using multiple threads row-mt enabled.
       cfg_.g_threads = 2;
@@ -200,9 +383,36 @@
       ASSERT_EQ(multi_thr4_row_mt_size_enc, multi_thr2_row_mt_size_enc);
       ASSERT_EQ(multi_thr4_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
       ASSERT_EQ(multi_thr4_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+
+      DoTestMaxThreads(&video, multi_thr2_row_mt_size_enc,
+                       multi_thr2_row_mt_md5_enc, multi_thr2_row_mt_md5_dec);
     }
   }
 
+  virtual void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
+                                const std::vector<size_t> ref_size_enc,
+                                const std::vector<std::string> ref_md5_enc,
+                                const std::vector<std::string> ref_md5_dec) {
+    // This value should be kept the same as MAX_NUM_THREADS
+    // in aom_thread.h
+    cfg_.g_threads = 64;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+    std::vector<size_t> multi_thr_max_row_mt_size_enc;
+    std::vector<std::string> multi_thr_max_row_mt_md5_enc;
+    std::vector<std::string> multi_thr_max_row_mt_md5_dec;
+    multi_thr_max_row_mt_size_enc = size_enc_;
+    multi_thr_max_row_mt_md5_enc = md5_enc_;
+    multi_thr_max_row_mt_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Check that the vectors are equal.
+    ASSERT_EQ(ref_size_enc, multi_thr_max_row_mt_size_enc);
+    ASSERT_EQ(ref_md5_enc, multi_thr_max_row_mt_md5_enc);
+    ASSERT_EQ(ref_md5_dec, multi_thr_max_row_mt_md5_dec);
+  }
+
   bool encoder_initialized_;
   ::libaom_test::TestMode encoding_mode_;
   int set_cpu_used_;
@@ -229,27 +439,44 @@
   DoTest();
 }
 
-// For AV1, only test speed 0 to 3.
-// Here test cpu_used 2 and 3
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood),
-                          ::testing::Range(2, 4), ::testing::Values(0, 2),
-                          ::testing::Values(0, 1), ::testing::Values(0, 1));
+// first pass stats test
+AV1_INSTANTIATE_TEST_SUITE(AVxFirstPassEncoderThreadTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Range(0, 6, 2), ::testing::Range(0, 2),
+                           ::testing::Range(1, 3));
 
-// Test cpu_used 0 and 1.
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 2), ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(0, 1));
+// For AV1, test speed 0, 1, 2, 3, 5.
+// Only test cpu_used 2 here.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(2), ::testing::Values(0, 2),
+                           ::testing::Values(0, 2), ::testing::Values(0, 1));
+
+// Test cpu_used 0, 1, 3 and 5.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Values(0, 1, 3, 5),
+                           ::testing::Values(1, 6), ::testing::Values(1, 6),
+                           ::testing::Values(0, 1));
 
 class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
   virtual void SetTileSize(libaom_test::Encoder *encoder) {
     encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
     encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
   }
+
+  virtual void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
+                                const std::vector<size_t> ref_size_enc,
+                                const std::vector<std::string> ref_md5_enc,
+                                const std::vector<std::string> ref_md5_dec) {
+    (void)video;
+    (void)ref_size_enc;
+    (void)ref_md5_enc;
+    (void)ref_md5_dec;
+  }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AVxEncoderThreadLSTest);
 
 TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
   cfg_.large_scale_tile = 1;
@@ -267,9 +494,9 @@
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 4), ::testing::Values(0, 6),
-                          ::testing::Values(0, 6), ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadLSTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Values(1, 3), ::testing::Values(0, 6),
+                           ::testing::Values(0, 6), ::testing::Values(1));
 }  // namespace
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index 1d726a4..5006b5b 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -532,7 +532,7 @@
 }
 #endif  // CONFIG_WEBM_IO
 
-AV1_INSTANTIATE_TEST_CASE(
+AV1_INSTANTIATE_TEST_SUITE(
     ExternalFrameBufferMD5Test,
     ::testing::ValuesIn(libaom_test::kAV1TestVectors,
                         libaom_test::kAV1TestVectors +
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 6600f2c..69e4bda 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -89,10 +89,12 @@
 };
 
 using Trans4x4FDCTTranLow = Trans4x4FDCT<tran_low_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Trans4x4FDCTTranLow);
 TEST_P(Trans4x4FDCTTranLow, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(Trans4x4FDCTTranLow, MemCheck) { RunMemCheck(); }
 
 using Trans4x4FDCTInt16 = Trans4x4FDCT<int16_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Trans4x4FDCTInt16);
 TEST_P(Trans4x4FDCTInt16, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(Trans4x4FDCTInt16, MemCheck) { RunMemCheck(); }
 
diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc
index 284353c..14cdd39 100644
--- a/test/filterintra_test.cc
+++ b/test/filterintra_test.cc
@@ -77,6 +77,38 @@
       tstIndex += 1;
     }
   }
+  void RunSpeedTest() const {
+    int stride = tx_size_wide[txSize_];
+    uint8_t *left = alloc_;
+    uint8_t *above = alloc_ + MaxTxSize;
+    const int numIter = 5000;
+
+    PrepareBuffer();
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < numIter; i++) {
+      predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
+    }
+    aom_usec_timer_mark(&ref_timer);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < numIter; i++) {
+      predFunc_(pred_, stride, txSize_, &above[1], left, mode_);
+    }
+    aom_usec_timer_mark(&timer);
+
+    const int ref_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+    const int sum_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \t mode =  %d \n",
+           ref_sum_time, sum_time,
+           (static_cast<float>(ref_sum_time) / static_cast<float>(sum_time)),
+           static_cast<int>(mode_));
+
+    DiffPred(0);
+  }
 
  private:
   void PrepareBuffer() const {
@@ -110,8 +142,10 @@
 
 TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); }
 
-using std::make_tuple;
+TEST_P(AV1FilterIntraPredTest, DISABLED_Speed) { RunSpeedTest(); }
 
+using ::testing::make_tuple;
+#if HAVE_SSE4_1
 const PredFuncMode kPredFuncMdArray[] = {
   make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
              FILTER_DC_PRED),
@@ -133,4 +167,30 @@
     SSE4_1, AV1FilterIntraPredTest,
     ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
                        ::testing::ValuesIn(kTxSize)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+const PredFuncMode kPredFuncMdArrayNEON[] = {
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_DC_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_V_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_H_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_D157_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_PAETH_PRED),
+};
+
+const TX_SIZE kTxSizeNEON[] = { TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_4X8,
+                                TX_8X4,  TX_8X16, TX_16X8,  TX_16X32, TX_32X16,
+                                TX_4X16, TX_16X4, TX_8X32,  TX_32X8 };
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1FilterIntraPredTest,
+    ::testing::Combine(::testing::ValuesIn(kPredFuncMdArrayNEON),
+                       ::testing::ValuesIn(kTxSizeNEON)));
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
index 6d74a68..6478f09 100644
--- a/test/frame_error_test.cc
+++ b/test/frame_error_test.cc
@@ -52,6 +52,7 @@
   void RunSpeedTest(frame_error_func test_impl, int width, int height);
   libaom_test::ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FrameErrorTest);
 
 void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
                                      int height) {
diff --git a/test/fwd_kf_test.cc b/test/fwd_kf_test.cc
index 50c2f36..8b0fb69 100644
--- a/test/fwd_kf_test.cc
+++ b/test/fwd_kf_test.cc
@@ -25,7 +25,7 @@
 } FwdKfTestParam;
 
 const FwdKfTestParam kTestParams[] = {
-  { 4, 33.4 },  { 6, 32.9 },  { 8, 32.6 },
+  { 4, 31.89 }, { 6, 32.8 },  { 8, 32.6 },
   { 12, 32.4 }, { 16, 32.3 }, { 18, 32.1 }
 };
 
@@ -99,7 +99,8 @@
   double psnr_;
 };
 
-TEST_P(ForwardKeyTest, ForwardKeyEncodeTest) {
+// TODO(crbug.com/aomedia/2807): Fix and re-enable the test.
+TEST_P(ForwardKeyTest, DISABLED_ForwardKeyEncodeTest) {
   libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                      cfg_.g_timebase.den, cfg_.g_timebase.num,
                                      0, 20);
@@ -110,7 +111,89 @@
       << "kf max dist = " << kf_max_dist_;
 }
 
-AV1_INSTANTIATE_TEST_CASE(ForwardKeyTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood),
-                          ::testing::ValuesIn(kTestParams));
+AV1_INSTANTIATE_TEST_SUITE(ForwardKeyTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::ValuesIn(kTestParams));
+
+typedef struct {
+  const unsigned int min_kf_dist;
+  const unsigned int max_kf_dist;
+} kfIntervalParam;
+
+const kfIntervalParam kfTestParams[] = {
+  { 0, 10 }, { 10, 10 }, { 0, 30 }, { 30, 30 }
+};
+
+std::ostream &operator<<(std::ostream &os, const kfIntervalParam &test_arg) {
+  return os << "kfIntervalParam { min_kf_dist:" << test_arg.min_kf_dist
+            << " max_kf_dist:" << test_arg.max_kf_dist << " }";
+}
+
+// This class is used to test the presence of forward key frame.
+class ForwardKeyPresenceTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 kfIntervalParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ForwardKeyPresenceTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        kf_dist_param_(GET_PARAM(2)), end_usage_check_(GET_PARAM(3)) {}
+  virtual ~ForwardKeyPresenceTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = end_usage_check_;
+    cfg_.g_threads = 1;
+    cfg_.kf_min_dist = kf_dist_param_.min_kf_dist;
+    cfg_.kf_max_dist = kf_dist_param_.max_kf_dist;
+    cfg_.fwd_kf_enabled = 1;
+    cfg_.g_lag_in_frames = 19;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (is_fwd_kf_present_ != 1 && AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_FWD_KF_PRESENT,
+                                    &is_fwd_kf_present_);
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const kfIntervalParam kf_dist_param_;
+  int is_fwd_kf_present_;
+  aom_rc_mode end_usage_check_;
+};
+
+// TODO(crbug.com/aomedia/2807): Fix and re-enable the test.
+TEST_P(ForwardKeyPresenceTestLarge, DISABLED_ForwardKeyEncodePresenceTest) {
+  is_fwd_kf_present_ = 0;
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 150);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(is_fwd_kf_present_, 1);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ForwardKeyPresenceTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(kfTestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
 }  // namespace
diff --git a/test/fwht4x4_test.cc b/test/fwht4x4_test.cc
index d2f77b8..b600d26 100644
--- a/test/fwht4x4_test.cc
+++ b/test/fwht4x4_test.cc
@@ -37,7 +37,7 @@
 
 using libaom_test::FhtFunc;
 
-typedef std::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
+typedef std::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int, FdctFunc>
     Dct4x4Param;
 
 void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
@@ -67,6 +67,7 @@
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
     num_coeffs_ = GET_PARAM(4);
+    fwd_txfm_c_ = GET_PARAM(5);
   }
   virtual void TearDown() { libaom_test::ClearSystemState(); }
 
@@ -77,9 +78,89 @@
   void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
+  void RunSpeedTest() {
+    if (!fwd_txfm_c_) {
+      GTEST_SKIP();
+    } else {
+      ACMRandom rnd(ACMRandom::DeterministicSeed());
+      const int count_test_block = 10;
+      const int numIter = 5000;
+
+      int c_sum_time = 0;
+      int simd_sum_time = 0;
+
+      int stride = 96;
+
+      int16_t *input_block = reinterpret_cast<int16_t *>(
+          aom_memalign(16, sizeof(int16_t) * stride * height_));
+      tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
+          aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+      tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
+          aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
+
+      for (int i = 0; i < count_test_block; ++i) {
+        int j, k;
+        for (j = 0; j < height_; ++j) {
+          for (k = 0; k < pitch_; ++k) {
+            int in_idx = j * stride + k;
+            int out_idx = j * pitch_ + k;
+            input_block[in_idx] =
+                (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+            if (bit_depth_ == AOM_BITS_8) {
+              output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8();
+            } else {
+              output_block[out_idx] = output_ref_block[out_idx] =
+                  rnd.Rand16() & mask_;
+            }
+          }
+        }
+
+        aom_usec_timer c_timer_;
+        aom_usec_timer_start(&c_timer_);
+        for (int i = 0; i < numIter; i++) {
+          ASM_REGISTER_STATE_CHECK(
+              fwd_txfm_c_(input_block, output_ref_block, stride));
+        }
+        aom_usec_timer_mark(&c_timer_);
+
+        aom_usec_timer simd_timer_;
+        aom_usec_timer_start(&simd_timer_);
+
+        for (int i = 0; i < numIter; i++) {
+          ASM_REGISTER_STATE_CHECK(
+              fwd_txfm_(input_block, output_block, stride));
+        }
+        aom_usec_timer_mark(&simd_timer_);
+
+        c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+        simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+        // The minimum quant value is 4.
+        for (j = 0; j < height_; ++j) {
+          for (k = 0; k < pitch_; ++k) {
+            int out_idx = j * pitch_ + k;
+            ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+                << "Error: not bit-exact result at index: " << out_idx
+                << " at test block: " << i;
+          }
+        }
+      }
+
+      printf(
+          "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+          simd_sum_time,
+          (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+      aom_free(input_block);
+      aom_free(output_ref_block);
+      aom_free(output_block);
+    }
+  }
 
   FdctFunc fwd_txfm_;
   IdctFunc inv_txfm_;
+
+  FdctFunc fwd_txfm_c_;  // C version of forward transform for speed test.
 };
 
 TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
@@ -89,12 +170,27 @@
 TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
 
 TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+
+TEST_P(Trans4x4WHT, DISABLED_Speed) { RunSpeedTest(); }
+
 using std::make_tuple;
 
 INSTANTIATE_TEST_SUITE_P(
     C, Trans4x4WHT,
     ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
-                                 AOM_BITS_10, 16),
+                                 AOM_BITS_10, 16, static_cast<FdctFunc>(NULL)),
                       make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT,
-                                 AOM_BITS_12, 16)));
+                                 AOM_BITS_12, 16,
+                                 static_cast<FdctFunc>(NULL))));
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, Trans4x4WHT,
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_10, DCT_DCT,
+                                 AOM_BITS_10, 16, &av1_highbd_fwht4x4_c),
+                      make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_12, DCT_DCT,
+                                 AOM_BITS_12, 16, &av1_highbd_fwht4x4_c)));
+
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/test/gf_pyr_height_test.cc b/test/gf_pyr_height_test.cc
index b1ade67..aac9cb6 100644
--- a/test/gf_pyr_height_test.cc
+++ b/test/gf_pyr_height_test.cc
@@ -25,32 +25,32 @@
   double psnr_thresh;
 } kTestParams[] = {
   // gf_min_pyr_height = 0
-  { 0, 0, 33.40 },
-  { 0, 1, 34.00 },
+  { 0, 0, 32.30 },
+  { 0, 1, 33.90 },
   { 0, 2, 34.00 },
   { 0, 3, 34.20 },
   { 0, 4, 34.30 },
-  { 0, 5, 34.40 },
+  { 0, 5, 34.35 },
   // gf_min_pyr_height = 1
-  { 1, 1, 34.00 },
+  { 1, 1, 33.90 },
   { 1, 2, 34.00 },
   { 1, 3, 34.20 },
   { 1, 4, 34.30 },
-  { 1, 5, 34.40 },
+  { 1, 5, 34.35 },
   // gf_min_pyr_height = 2
   { 2, 2, 34.00 },
   { 2, 3, 34.20 },
   { 2, 4, 34.30 },
-  { 2, 5, 34.40 },
+  { 2, 5, 34.35 },
   // gf_min_pyr_height = 3
   { 3, 3, 34.20 },
   { 3, 4, 34.30 },
-  { 3, 5, 34.40 },
+  { 3, 5, 34.35 },
   // gf_min_pyr_height = 4
   { 4, 4, 34.30 },
-  { 4, 5, 34.40 },
+  { 4, 5, 34.35 },
   // gf_min_pyr_height = 5
-  { 5, 5, 34.40 },
+  { 5, 5, 34.35 },
 };
 
 // Compiler may decide to add some padding to the struct above for alignment,
@@ -150,7 +150,7 @@
       << "GF Max Pyramid Height = " << gf_max_pyr_height_;
 }
 
-AV1_INSTANTIATE_TEST_CASE(GFPyrHeightTest, NONREALTIME_TEST_MODES,
-                          ::testing::Values(AOM_Q, AOM_VBR),
-                          ::testing::ValuesIn(kTestParams));
+AV1_INSTANTIATE_TEST_SUITE(GFPyrHeightTest, NONREALTIME_TEST_MODES,
+                           ::testing::Values(AOM_Q, AOM_VBR),
+                           ::testing::ValuesIn(kTestParams));
 }  // namespace
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
index 5b03bee..8044b51 100644
--- a/test/hbd_metrics_test.cc
+++ b/test/hbd_metrics_test.cc
@@ -80,15 +80,15 @@
 double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source,
                            const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
                            uint32_t bd) {
-  double ssim, weight;
-  ssim = aom_highbd_calc_ssim(source, dest, &weight, bd, in_bd);
-  return 100 * pow(ssim / weight, 8.0);
+  double ssim[2], weight[2];
+  aom_highbd_calc_ssim(source, dest, weight, bd, in_bd, ssim);
+  return 100 * pow(ssim[0] / weight[0], 8.0);
 }
 
 double compute_aomssim(const YV12_BUFFER_CONFIG *source,
                        const YV12_BUFFER_CONFIG *dest) {
   double ssim, weight;
-  ssim = aom_calc_ssim(source, dest, &weight);
+  aom_calc_ssim(source, dest, &weight, &ssim);
   return 100 * pow(ssim / weight, 8.0);
 }
 
diff --git a/test/hiprec_convolve_test.cc b/test/hiprec_convolve_test.cc
index 59d28e8..3e93a06 100644
--- a/test/hiprec_convolve_test.cc
+++ b/test/hiprec_convolve_test.cc
@@ -17,8 +17,10 @@
 using libaom_test::ACMRandom;
 #if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdHiprecConvolveTest);
 #endif
 using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HiprecConvolveTest);
 using std::make_tuple;
 using std::tuple;
 
diff --git a/test/horver_correlation_test.cc b/test/horver_correlation_test.cc
index ccb8edd..d1fd578 100644
--- a/test/horver_correlation_test.cc
+++ b/test/horver_correlation_test.cc
@@ -48,6 +48,7 @@
   ACMRandom rng_;
   int16_t *data_buf_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HorverTest);
 
 void HorverTest::RunHorverTest(void) {
   for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
@@ -140,6 +141,11 @@
     ::testing::Values(av1_get_horver_correlation_full_sse4_1));
 #endif  // HAVE_SSE4_1
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HorverTest, ::testing::Values(av1_get_horver_correlation_full_neon));
+#endif  // HAVE_NEON
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, HorverTest, ::testing::Values(av1_get_horver_correlation_full_avx2));
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index 938b0b1..b8a268e 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -39,7 +39,8 @@
   unsigned int profile;
   unsigned int limit;
   unsigned int screen_content;
-  double psnr_threshold;
+  double psnr_threshold;   // used by modes other than AOM_SUPERRES_AUTO
+  double psnr_threshold2;  // used by AOM_SUPERRES_AUTO
 } TestVideoParam;
 
 std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
@@ -51,18 +52,21 @@
 }
 
 const TestVideoParam kTestVideoVectors[] = {
-  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5 },
+  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.7,
+    45.0 },
 #if CONFIG_AV1_HIGHBITDEPTH
-  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 28.0 },
+  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 27.0,
+    48.0 },
 #endif
-  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
+  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 23.0, 56.0 },
   // Image coding (single frame).
-  { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0 },
+  { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0,
+    49.0 },
 };
 
 // Modes with extra params have their own tests.
-const SUPERRES_MODE kSuperresModesWithoutParams[] = { SUPERRES_RANDOM,
-                                                      SUPERRES_AUTO };
+const aom_superres_mode kSuperresModesWithoutParams[] = { AOM_SUPERRES_RANDOM,
+                                                          AOM_SUPERRES_AUTO };
 
 // Superres denominators and superres kf denominators to be tested
 typedef tuple<int, int> SuperresDenominatorPair;
@@ -84,7 +88,8 @@
 
 // Test parameter list:
 //  <[needed for EncoderTest], test_video_param_, superres_mode_>
-typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, SUPERRES_MODE>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+              aom_superres_mode>
     HorzSuperresTestParam;
 
 class HorzSuperresEndToEndTest
@@ -157,16 +162,17 @@
     ASSERT_TRUE(video.get() != NULL);
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr_thresh = (superres_mode_ == AOM_SUPERRES_AUTO)
+                                   ? test_video_param_.psnr_threshold2
+                                   : test_video_param_.psnr_threshold;
     const double psnr = GetAveragePsnr();
-    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
-        << "superres_mode_ = " << superres_mode_;
+    EXPECT_GT(psnr, psnr_thresh);
 
-    EXPECT_EQ(test_video_param_.limit, frame_count_)
-        << "superres_mode_ = " << superres_mode_;
+    EXPECT_EQ(test_video_param_.limit, frame_count_);
   }
 
   TestVideoParam test_video_param_;
-  SUPERRES_MODE superres_mode_;
+  aom_superres_mode superres_mode_;
 
  private:
   double psnr_;
@@ -175,9 +181,9 @@
 
 TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
-                          ::testing::ValuesIn(kTestVideoVectors),
-                          ::testing::ValuesIn(kSuperresModesWithoutParams));
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresEndToEndTest,
+                           ::testing::ValuesIn(kTestVideoVectors),
+                           ::testing::ValuesIn(kSuperresModesWithoutParams));
 
 // Test parameter list:
 //  <[needed for EncoderTest], test_video_param_, tuple(superres_denom_,
@@ -192,7 +198,7 @@
  protected:
   HorzSuperresFixedEndToEndTest()
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
-        superres_mode_(SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
+        superres_mode_(AOM_SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
     SuperresDenominatorPair denoms = GET_PARAM(2);
     superres_denom_ = std::get<0>(denoms);
     superres_kf_denom_ = std::get<1>(denoms);
@@ -275,7 +281,7 @@
   }
 
   TestVideoParam test_video_param_;
-  SUPERRES_MODE superres_mode_;
+  aom_superres_mode superres_mode_;
   int superres_denom_;
   int superres_kf_denom_;
 
@@ -286,9 +292,9 @@
 
 TEST_P(HorzSuperresFixedEndToEndTest, HorzSuperresFixedTestParam) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(HorzSuperresFixedEndToEndTest,
-                          ::testing::ValuesIn(kTestVideoVectors),
-                          ::testing::ValuesIn(kSuperresDenominators));
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresFixedEndToEndTest,
+                           ::testing::ValuesIn(kTestVideoVectors),
+                           ::testing::ValuesIn(kSuperresDenominators));
 
 // Test parameter list:
 //  <[needed for EncoderTest], test_video_param_,
@@ -303,7 +309,7 @@
  protected:
   HorzSuperresQThreshEndToEndTest()
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
-        superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
+        superres_mode_(AOM_SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
     SuperresQThresholdPair qthresholds = GET_PARAM(2);
     superres_qthresh_ = std::get<0>(qthresholds);
     superres_kf_qthresh_ = std::get<1>(qthresholds);
@@ -386,7 +392,7 @@
   }
 
   TestVideoParam test_video_param_;
-  SUPERRES_MODE superres_mode_;
+  aom_superres_mode superres_mode_;
   int superres_qthresh_;
   int superres_kf_qthresh_;
 
@@ -399,8 +405,8 @@
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest,
-                          ::testing::ValuesIn(kTestVideoVectors),
-                          ::testing::ValuesIn(kSuperresQThresholds));
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresQThreshEndToEndTest,
+                           ::testing::ValuesIn(kTestVideoVectors),
+                           ::testing::ValuesIn(kSuperresQThresholds));
 
 }  // namespace
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 779cf9a..b04ab50 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -97,6 +97,63 @@
     }
     ASSERT_EQ(0, error_count);
   }
+  void RunSpeedTest(Pixel *left_col, Pixel *above_data, Pixel *dst,
+                    Pixel *ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    above_row_ = above_data + 16;
+    left_col_ = left_col;
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+    int error_count = 0;
+    const int numIter = 100;
+
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x <= block_width * 2; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+      for (int y = 0; y < block_height; y++) {
+        if (i == 0) {
+          left_col_[y] = mask_;
+        } else {
+          left_col_[y] = rnd.Rand16() & mask_;
+        }
+      }
+
+      aom_usec_timer c_timer_;
+      aom_usec_timer_start(&c_timer_);
+
+      PredictRefSpeedTest(numIter);
+
+      aom_usec_timer_mark(&c_timer_);
+
+      aom_usec_timer simd_timer_;
+      aom_usec_timer_start(&simd_timer_);
+
+      PredictFncSpeedTest(numIter);
+
+      aom_usec_timer_mark(&simd_timer_);
+
+      c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+      simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+      CheckPrediction(i, &error_count);
+    }
+
+    printf(
+        "blockWxH = %d x %d c_time = %d \t simd_time = %d \t Gain = %4.2f \n",
+        block_width, block_height, c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+    ASSERT_EQ(0, error_count);
+  }
 
  protected:
   virtual void SetUp() {
@@ -107,6 +164,9 @@
 
   virtual void Predict() = 0;
 
+  virtual void PredictRefSpeedTest(int num) = 0;
+  virtual void PredictFncSpeedTest(int num) = 0;
+
   void CheckPrediction(int test_case_number, int *error_count) const {
     // For each pixel ensure that the calculated value is the same as reference.
     const int block_width = params_.block_width;
@@ -142,7 +202,21 @@
     ASM_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
   }
+  void PredictRefSpeedTest(int num) {
+    const int bit_depth = params_.bit_depth;
+    for (int i = 0; i < num; i++) {
+      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+    }
+  }
+  void PredictFncSpeedTest(int num) {
+    const int bit_depth = params_.bit_depth;
+    for (int i = 0; i < num; i++) {
+      params_.pred_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+    }
+  }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdIntraPredTest);
+
 #endif
 
 class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
@@ -152,7 +226,18 @@
     ASM_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_));
   }
+  void PredictRefSpeedTest(int num) {
+    for (int i = 0; i < num; i++) {
+      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+    }
+  }
+  void PredictFncSpeedTest(int num) {
+    for (int i = 0; i < num; i++) {
+      params_.pred_fn(dst_, stride_, above_row_, left_col_);
+    }
+  }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(LowbdIntraPredTest);
 
 #if CONFIG_AV1_HIGHBITDEPTH
 // Suppress an unitialized warning. Once there are implementations to test then
@@ -169,19 +254,26 @@
 }
 #endif
 
-// Same issue as above but for arm.
-#if !HAVE_NEON
 TEST_P(LowbdIntraPredTest, Bitexact) {
-  // max block size is 32
-  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
-  DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
   av1_zero(left_col);
   av1_zero(above_data);
   RunTest(left_col, above_data, dst, ref_dst);
 }
-#endif  // !HAVE_NEON
+TEST_P(LowbdIntraPredTest, DISABLED_Speed) {
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
+  av1_zero(left_col);
+  av1_zero(above_data);
+  RunSpeedTest(left_col, above_data, dst, ref_dst);
+}
 
 #if CONFIG_AV1_HIGHBITDEPTH
 // -----------------------------------------------------------------------------
@@ -229,6 +321,25 @@
 
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
+  lowbd_entry(smooth, 4, 4, neon),   lowbd_entry(smooth, 4, 8, neon),
+  lowbd_entry(smooth, 8, 4, neon),   lowbd_entry(smooth, 8, 8, neon),
+  lowbd_entry(smooth, 8, 16, neon),  lowbd_entry(smooth, 16, 8, neon),
+  lowbd_entry(smooth, 16, 16, neon), lowbd_entry(smooth, 16, 32, neon),
+  lowbd_entry(smooth, 32, 16, neon), lowbd_entry(smooth, 32, 32, neon),
+  lowbd_entry(smooth, 32, 64, neon), lowbd_entry(smooth, 64, 32, neon),
+  lowbd_entry(smooth, 64, 64, neon),
+#if !CONFIG_REALTIME_ONLY
+  lowbd_entry(smooth, 4, 16, neon),  lowbd_entry(smooth, 8, 32, neon),
+  lowbd_entry(smooth, 16, 4, neon),  lowbd_entry(smooth, 16, 64, neon),
+  lowbd_entry(smooth, 32, 8, neon),  lowbd_entry(smooth, 64, 16, neon),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorNeon));
+#endif  // HAVE_NEON
+
 #if HAVE_SSSE3
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
   lowbd_intrapred(paeth, ssse3),
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index dd0956d..77839fa 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -133,10 +133,11 @@
   { 1, "invalid-google-142530197-1.ivf", NULL },
   { 4, "invalid-oss-fuzz-9463.ivf", "invalid-oss-fuzz-9463.ivf.res.2" },
   { 1, "invalid-oss-fuzz-9720.ivf", NULL },
-  { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.4" },
   { 1, "invalid-oss-fuzz-11523.ivf", "invalid-oss-fuzz-11523.ivf.res.2" },
   { 4, "invalid-oss-fuzz-15363.ivf", NULL },
-  { 1, "invalid-oss-fuzz-16437.ivf", NULL },
+  { 1, "invalid-oss-fuzz-16437.ivf", "invalid-oss-fuzz-16437.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-24706.ivf", NULL },
 #if CONFIG_AV1_HIGHBITDEPTH
   // These test vectors contain 10-bit or 12-bit video.
   { 1, "invalid-oss-fuzz-9288.ivf", NULL },
@@ -153,7 +154,7 @@
 #endif
 };
 
-AV1_INSTANTIATE_TEST_CASE(InvalidFileTest,
-                          ::testing::ValuesIn(kAV1InvalidFileTests));
+AV1_INSTANTIATE_TEST_SUITE(InvalidFileTest,
+                           ::testing::ValuesIn(kAV1InvalidFileTests));
 
 }  // namespace
diff --git a/test/kf_test.cc b/test/kf_test.cc
new file mode 100644
index 0000000..4a8eb86
--- /dev/null
+++ b/test/kf_test.cc
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+
+#include "aom/aom_codec.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+typedef struct {
+  const unsigned int min_kf_dist;
+  const unsigned int max_kf_dist;
+} kfIntervalParam;
+
+const kfIntervalParam kfTestParams[] = {
+  { 1, 1 }, { 0, 10 }, { 10, 10 }, { 0, 30 }, { 30, 30 }
+};
+
+std::ostream &operator<<(std::ostream &os, const kfIntervalParam &test_arg) {
+  return os << "kfIntervalParam { min_kf_dist:" << test_arg.min_kf_dist
+            << " max_kf_dist:" << test_arg.max_kf_dist << " }";
+}
+
+// This class is used to test the presence of forward key frame.
+class KeyFrameIntervalTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 kfIntervalParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  KeyFrameIntervalTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        kf_dist_param_(GET_PARAM(2)), end_usage_check_(GET_PARAM(3)) {
+    kf_dist_ = -1;
+    is_kf_interval_violated_ = false;
+  }
+  virtual ~KeyFrameIntervalTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = end_usage_check_;
+    cfg_.g_threads = 1;
+    cfg_.kf_min_dist = kf_dist_param_.min_kf_dist;
+    cfg_.kf_max_dist = kf_dist_param_.max_kf_dist;
+    cfg_.g_lag_in_frames = 19;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      int frame_flags = 0;
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_FRAME_FLAGS,
+                                    &frame_flags);
+      if (kf_dist_ != -1) {
+        kf_dist_++;
+        if (kf_dist_ > (int)kf_dist_param_.max_kf_dist) {
+          is_kf_interval_violated_ = true;
+        }
+      }
+      if ((frame_flags & AOM_FRAME_IS_KEY) ==
+          static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY)) {
+        if (kf_dist_ != -1 && kf_dist_ < (int)kf_dist_param_.min_kf_dist) {
+          is_kf_interval_violated_ = true;
+        }
+        kf_dist_ = 0;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const kfIntervalParam kf_dist_param_;
+  int kf_dist_;
+  bool is_kf_interval_violated_;
+  aom_rc_mode end_usage_check_;
+};
+
+TEST_P(KeyFrameIntervalTestLarge, KeyFrameIntervalTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 75);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(is_kf_interval_violated_, false) << kf_dist_param_;
+}
+
+// This class tests for presence and placement of application forced key frames.
+class ForcedKeyTestLarge
+    : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+                                                 int, int, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ForcedKeyTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        auto_alt_ref_(GET_PARAM(2)), fwd_kf_enabled_(GET_PARAM(3)),
+        cpu_used_(GET_PARAM(4)), rc_end_usage_(GET_PARAM(5)) {
+    forced_kf_frame_num_ = 1;
+    frame_num_ = 0;
+    is_kf_placement_violated_ = false;
+  }
+  virtual ~ForcedKeyTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 0;
+    cfg_.kf_max_dist = 30;
+    cfg_.kf_min_dist = 0;
+    cfg_.fwd_kf_enabled = fwd_kf_enabled_;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, auto_alt_ref_);
+#if CONFIG_AV1_ENCODER
+      // override test default for tile columns if necessary.
+      if (GET_PARAM(0) == &libaom_test::kAV1) {
+        encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+      }
+#endif
+    }
+    frame_flags_ =
+        ((int)video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      if ((int)frame_num_ == forced_kf_frame_num_) {
+        aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+        int frame_flags = 0;
+        AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_FRAME_FLAGS,
+                                      &frame_flags);
+        if ((frame_flags & AOM_FRAME_IS_KEY) !=
+            static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY)) {
+          is_kf_placement_violated_ = true;
+        }
+      }
+      ++frame_num_;
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  int auto_alt_ref_;
+  int fwd_kf_enabled_;
+  int cpu_used_;
+  aom_rc_mode rc_end_usage_;
+  int forced_kf_frame_num_;
+  unsigned int frame_num_;
+  bool is_kf_placement_violated_;
+};
+
+// TODO(crbug.com/aomedia/2810): Fix and re-enable the test.
+TEST_P(ForcedKeyTestLarge, DISABLED_Frame1IsKey) {
+  const aom_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  forced_kf_frame_num_ = 1;
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    cfg_.g_lag_in_frames = lag_values[i];
+    is_kf_placement_violated_ = false;
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0,
+                                       fwd_kf_enabled_ ? 60 : 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(is_kf_placement_violated_, false)
+        << "Frame #" << frame_num_ << " isn't a keyframe!";
+  }
+}
+
+// This class checks the presence and placement of application
+// forced key frames.
+// TODO(crbug.com/aomedia/2808): Fix and re-enable the test.
+TEST_P(ForcedKeyTestLarge, DISABLED_ForcedFrameIsKey) {
+  const aom_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = lag_values[i] - 1;
+    cfg_.g_lag_in_frames = lag_values[i];
+    is_kf_placement_violated_ = false;
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0,
+                                       fwd_kf_enabled_ ? 60 : 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(is_kf_placement_violated_, false)
+        << "Frame #" << frame_num_ << " isn't a keyframe!";
+
+    // Two pass and single pass CBR are currently segfaulting for the case when
+    // forced kf is placed after lag in frames.
+    // TODO(anyone): Enable(uncomment) below test once above bug is fixed.
+    //    frame_num_ = 0;
+    //    forced_kf_frame_num_ = lag_values[i] + 1;
+    //    cfg_.g_lag_in_frames = lag_values[i];
+    //    is_kf_placement_violated_ = false;
+    //    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    //    ASSERT_EQ(is_kf_placement_violated_, false)
+    //    << "Frame #" << frame_num_ << " isn't a keyframe!";
+  }
+}
+
+// TODO(crbug.com/aomedia/2809): Fix and re-enable the test.
+TEST_P(ForcedKeyTestLarge, DISABLED_ForcedFrameIsKeyCornerCases) {
+  const aom_rational timebase = { 1, 30 };
+  const int kf_offsets[] = { -2, -1, 1, 2, 0 };
+  cfg_.g_lag_in_frames = 35;
+
+  for (int i = 0; kf_offsets[i] != 0; ++i) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = (int)cfg_.kf_max_dist + kf_offsets[i];
+    forced_kf_frame_num_ = forced_kf_frame_num_ > 0 ? forced_kf_frame_num_ : 1;
+    is_kf_placement_violated_ = false;
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0,
+                                       fwd_kf_enabled_ ? 60 : 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(is_kf_placement_violated_, false)
+        << "Frame #" << frame_num_ << " isn't a keyframe!";
+  }
+}
+
+AV1_INSTANTIATE_TEST_SUITE(KeyFrameIntervalTestLarge,
+                           testing::Values(::libaom_test::kOnePassGood,
+                                           ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(kfTestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+// TODO(anyone): Add CBR to list of rc_modes once forced kf placement after
+// lag in frames bug is fixed.
+AV1_INSTANTIATE_TEST_SUITE(ForcedKeyTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(0, 1), ::testing::Values(0, 1),
+                           ::testing::Values(2, 5),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CQ));
+}  // namespace
diff --git a/test/level_test.cc b/test/level_test.cc
index a9613c5..82ada98 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -80,7 +80,7 @@
 };
 
 TEST_P(LevelTest, TestTargetLevelApi) {
-  static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
+  static aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
   aom_codec_ctx_t enc;
   aom_codec_enc_cfg_t cfg;
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
@@ -121,6 +121,7 @@
                                        30, 1, 0, 40);
     target_level_ = kLevelKeepStats;
     cfg_.rc_target_bitrate = 1000;
+    cfg_.g_limit = 40;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_EQ(level_[0], 0);
   }
@@ -133,8 +134,9 @@
                                        30, 1, 0, 40);
     target_level_ = kLevelKeepStats;
     cfg_.rc_target_bitrate = 4000;
+    cfg_.g_limit = 40;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_EQ(level_[0], 1);
+    ASSERT_EQ(level_[0], 4);
   }
 }
 
@@ -151,7 +153,8 @@
   }
 }
 
-AV1_INSTANTIATE_TEST_CASE(LevelTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood),
-                          ::testing::ValuesIn(kCpuUsedVectors));
+AV1_INSTANTIATE_TEST_SUITE(LevelTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::ValuesIn(kCpuUsedVectors));
 }  // namespace
diff --git a/test/lightfield_test.sh b/test/lightfield_test.sh
index 3de88af..cf1ea73 100755
--- a/test/lightfield_test.sh
+++ b/test/lightfield_test.sh
@@ -42,7 +42,7 @@
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${img_width}" "${img_height}" \
       "${yuv_file}" "${lf_file}" "${lf_width}" \
-      "${lf_height}" "${lf_blocksize}" ${devnull}
+      "${lf_height}" "${lf_blocksize}" ${devnull} || return 1
 
   [ -e "${lf_file}" ] || return 1
 
@@ -73,7 +73,7 @@
   fi
 
   eval "${AOM_TEST_PREFIX}" "${bs_decoder}" "${lf_file}" "${tl_file}" \
-      "${num_references}" "${tl_text_file}" ${devnull}
+      "${num_references}" "${tl_text_file}" ${devnull} || return 1
 
   [ -e "${tl_file}" ] || return 1
 
@@ -86,7 +86,7 @@
   fi
 
   eval "${AOM_TEST_PREFIX}" "${tl_decoder}" "${tl_file}" "${tl_outfile}" \
-      "${num_references}" "${num_tile_lists}" ${devnull}
+      "${num_references}" "${num_tile_lists}" ${devnull} || return 1
 
   [ -e "${tl_outfile}" ] || return 1
 
@@ -99,7 +99,7 @@
   fi
 
   eval "${AOM_TEST_PREFIX}" "${ref_decoder}" "${lf_file}" "${tl_reffile}" \
-      "${num_references}" "${tl_text_file}" ${devnull}
+      "${num_references}" "${tl_text_file}" ${devnull} || return 1
 
   [ -e "${tl_reffile}" ] || return 1
 
diff --git a/test/lossless_test.cc b/test/lossless_test.cc
index 71ae5e7..b56b43a 100644
--- a/test/lossless_test.cc
+++ b/test/lossless_test.cc
@@ -24,18 +24,20 @@
 const int kMaxPsnr = 100;
 
 class LosslessTestLarge
-    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 aom_rc_mode>,
       public ::libaom_test::EncoderTest {
  protected:
   LosslessTestLarge()
       : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
-        encoding_mode_(GET_PARAM(1)) {}
+        encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)) {}
 
   virtual ~LosslessTestLarge() {}
 
   virtual void SetUp() {
     InitializeConfig();
     SetMode(encoding_mode_);
+    cfg_.rc_end_usage = rc_end_usage_;
   }
 
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
@@ -60,10 +62,25 @@
 
   double GetMinPsnr() const { return psnr_; }
 
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_LAST_QUANTIZER,
+                                    &base_qindex_);
+      EXPECT_EQ(base_qindex_, 0)
+          << "Error: Base_qindex is non zero for lossless coding";
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
  private:
   double psnr_;
   unsigned int nframes_;
   libaom_test::TestMode encoding_mode_;
+  aom_rc_mode rc_end_usage_;
+  int base_qindex_;
 };
 
 TEST_P(LosslessTestLarge, TestLossLessEncoding) {
@@ -120,7 +137,8 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-AV1_INSTANTIATE_TEST_CASE(LosslessTestLarge,
-                          ::testing::Values(::libaom_test::kOnePassGood,
-                                            ::libaom_test::kTwoPassGood));
+AV1_INSTANTIATE_TEST_SUITE(LosslessTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
 }  // namespace
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index e8eeceb..a30d02d 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -165,11 +165,15 @@
 
 #if CONFIG_AV1_HIGHBITDEPTH
 typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_hbd);
 typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
     Loop8Test9Param_hbd;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_hbd);
 #endif
 typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_lbd);
 typedef LoopTestParam<dual_loop_op_t, dual_loop_param_t> Loop8Test9Param_lbd;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_lbd);
 
 #define OPCHECK(a, b)                                                          \
   ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index aa4dd83..df7b3f8 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -86,6 +86,7 @@
   MaskedSADFunc maskedSAD_op_;
   MaskedSADFunc ref_maskedSAD_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADTest);
 
 class MaskedSADx4Test : public MaskedSADTestBase,
                         public ::testing::WithParamInterface<MaskedSADx4Param> {
@@ -109,6 +110,7 @@
   MaskedSADx4Func maskedSAD_op_;
   MaskedSADx4Func ref_maskedSAD_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADx4Test);
 
 void MaskedSADTest::runRef(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *ref_ptr[], int ref_stride,
@@ -259,6 +261,8 @@
   HighbdMaskedSADFunc maskedSAD_op_;
   HighbdMaskedSADFunc ref_maskedSAD_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdMaskedSADTest);
+
 void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) {
   unsigned int ref_ret = 0, ret = 1;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -348,12 +352,14 @@
   make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
   make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
   make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c),
   make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c),
   make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c),
   make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c),
   make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c),
   make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
@@ -375,12 +381,14 @@
   make_tuple(&aom_masked_sad64x128x4d_ssse3, &aom_masked_sad64x128x4d_c),
   make_tuple(&aom_masked_sad128x64x4d_ssse3, &aom_masked_sad128x64x4d_c),
   make_tuple(&aom_masked_sad128x128x4d_ssse3, &aom_masked_sad128x128x4d_c),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_masked_sad4x16x4d_ssse3, &aom_masked_sad4x16x4d_c),
   make_tuple(&aom_masked_sad16x4x4d_ssse3, &aom_masked_sad16x4x4d_c),
   make_tuple(&aom_masked_sad8x32x4d_ssse3, &aom_masked_sad8x32x4d_c),
   make_tuple(&aom_masked_sad32x8x4d_ssse3, &aom_masked_sad32x8x4d_c),
   make_tuple(&aom_masked_sad16x64x4d_ssse3, &aom_masked_sad16x64x4d_c),
   make_tuple(&aom_masked_sad64x16x4d_ssse3, &aom_masked_sad64x16x4d_c),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADx4Test,
@@ -407,12 +415,14 @@
              &aom_highbd_masked_sad128x64_c),
   make_tuple(&aom_highbd_masked_sad128x128_ssse3,
              &aom_highbd_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_highbd_masked_sad4x16_ssse3, &aom_highbd_masked_sad4x16_c),
   make_tuple(&aom_highbd_masked_sad16x4_ssse3, &aom_highbd_masked_sad16x4_c),
   make_tuple(&aom_highbd_masked_sad8x32_ssse3, &aom_highbd_masked_sad8x32_c),
   make_tuple(&aom_highbd_masked_sad32x8_ssse3, &aom_highbd_masked_sad32x8_c),
   make_tuple(&aom_highbd_masked_sad16x64_ssse3, &aom_highbd_masked_sad16x64_c),
   make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3, HighbdMaskedSADTest,
@@ -438,12 +448,14 @@
   make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3),
   make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3),
   make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3),
   make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3),
   make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3),
   make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3),
   make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3),
   make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3)
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, MaskedSADTest,
@@ -477,6 +489,7 @@
              &aom_highbd_masked_sad128x64_ssse3),
   make_tuple(&aom_highbd_masked_sad128x128_avx2,
              &aom_highbd_masked_sad128x128_ssse3),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_highbd_masked_sad4x16_avx2, &aom_highbd_masked_sad4x16_ssse3),
   make_tuple(&aom_highbd_masked_sad16x4_avx2, &aom_highbd_masked_sad16x4_ssse3),
   make_tuple(&aom_highbd_masked_sad8x32_avx2, &aom_highbd_masked_sad8x32_ssse3),
@@ -485,6 +498,7 @@
              &aom_highbd_masked_sad16x64_ssse3),
   make_tuple(&aom_highbd_masked_sad64x16_avx2,
              &aom_highbd_masked_sad64x16_ssse3)
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, HighbdMaskedSADTest,
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index bf814ce..afffce9 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -56,6 +56,7 @@
   MaskedSubPixelVarianceFunc opt_func_;
   MaskedSubPixelVarianceFunc ref_func_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSubPixelVarianceTest);
 
 TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
   unsigned int ref_ret, opt_ret;
@@ -193,6 +194,7 @@
   MaskedSubPixelVarianceFunc ref_func_;
   aom_bit_depth_t bit_depth_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdMaskedSubPixelVarianceTest);
 
 TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
   unsigned int ref_ret, opt_ret;
@@ -352,7 +354,7 @@
              &aom_masked_sub_pixel_variance4x8_c),
   make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
              &aom_masked_sub_pixel_variance4x4_c),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_masked_sub_pixel_variance64x16_ssse3,
              &aom_masked_sub_pixel_variance64x16_c),
   make_tuple(&aom_masked_sub_pixel_variance16x64_ssse3,
@@ -365,6 +367,7 @@
              &aom_masked_sub_pixel_variance16x4_c),
   make_tuple(&aom_masked_sub_pixel_variance4x16_ssse3,
              &aom_masked_sub_pixel_variance4x16_c),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
@@ -468,7 +471,7 @@
              &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8),
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_ssse3,
@@ -505,6 +508,7 @@
              &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
diff --git a/test/metadata_test.cc b/test/metadata_test.cc
index 79e08a7..c102b7a 100644
--- a/test/metadata_test.cc
+++ b/test/metadata_test.cc
@@ -193,8 +193,8 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_CASE(MetadataEncodeTest,
-                          ::testing::Values(::libaom_test::kOnePassGood));
+AV1_INSTANTIATE_TEST_SUITE(MetadataEncodeTest,
+                           ::testing::Values(::libaom_test::kOnePassGood));
 
 #endif  // CONFIG_AV1_ENCODER
 }  // namespace
diff --git a/test/monochrome_test.cc b/test/monochrome_test.cc
index ebccba5..4688961 100644
--- a/test/monochrome_test.cc
+++ b/test/monochrome_test.cc
@@ -124,7 +124,8 @@
   }
 }
 
-AV1_INSTANTIATE_TEST_CASE(MonochromeTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood));
+AV1_INSTANTIATE_TEST_SUITE(MonochromeTest,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood));
 
 }  // namespace
diff --git a/test/motion_vector_test.cc b/test/motion_vector_test.cc
index 2636c39..7516a36 100644
--- a/test/motion_vector_test.cc
+++ b/test/motion_vector_test.cc
@@ -100,8 +100,8 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
 
-AV1_INSTANTIATE_TEST_CASE(MotionVectorTestLarge,
-                          ::testing::ValuesIn(kEncodingModeVectors),
-                          ::testing::ValuesIn(kCpuUsedVectors),
-                          ::testing::ValuesIn(kMVTestModes));
+AV1_INSTANTIATE_TEST_SUITE(MotionVectorTestLarge,
+                           ::testing::ValuesIn(kEncodingModeVectors),
+                           ::testing::ValuesIn(kCpuUsedVectors),
+                           ::testing::ValuesIn(kMVTestModes));
 }  // namespace
diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc
index 5b61236..aad8905 100644
--- a/test/noise_model_test.cc
+++ b/test/noise_model_test.cc
@@ -1153,7 +1153,7 @@
 template <typename T>
 class WienerDenoiseTest : public ::testing::Test, public T {
  public:
-  static void SetUpTestCase() { aom_dsp_rtcd(); }
+  static void SetUpTestSuite() { aom_dsp_rtcd(); }
 
  protected:
   void SetUp() {
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
index 6b4382c..a8290b2 100644
--- a/test/obmc_sad_test.cc
+++ b/test/obmc_sad_test.cc
@@ -37,6 +37,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcSadTest);
 
 TEST_P(ObmcSadTest, RandomValues) {
   DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
@@ -152,6 +153,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcSadHBDTest);
 
 TEST_P(ObmcSadHBDTest, RandomValues) {
   DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc
index fc281d7..58d2ad6 100644
--- a/test/obmc_variance_test.cc
+++ b/test/obmc_variance_test.cc
@@ -40,6 +40,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 class ObmcVarianceTest : public FunctionEquivalenceTest<ObmcVarF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceTest);
 
 TEST_P(ObmcVarianceTest, RandomValues) {
   DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
@@ -197,6 +198,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 #if CONFIG_AV1_HIGHBITDEPTH
 class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceHBDTest);
 
 TEST_P(ObmcVarianceHBDTest, RandomValues) {
   DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc
index 9a2c5bc..131e1dd 100644
--- a/test/pickrst_test.cc
+++ b/test/pickrst_test.cc
@@ -75,6 +75,7 @@
   int32_t *flt0_;
   int32_t *flt1_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PixelProjErrorTest);
 
 void PixelProjErrorTest::RunPixelProjErrorTest(int32_t run_times) {
   int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
@@ -188,6 +189,12 @@
                          ::testing::Values(av1_lowbd_pixel_proj_error_avx2));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, PixelProjErrorTest,
+                         ::testing::Values(av1_lowbd_pixel_proj_error_neon));
+#endif  // HAVE_NEON
+
 }  // namespace pickrst_test_lowbd
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -240,6 +247,7 @@
   int32_t *flt0_;
   int32_t *flt1_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PixelProjHighbdErrorTest);
 
 void PixelProjHighbdErrorTest::RunPixelProjErrorTest(int32_t run_times) {
   int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
@@ -356,6 +364,7 @@
 #endif  // HAVE_AVX2
 
 }  // namespace pickrst_test_highbd
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 ////////////////////////////////////////////////////////////////////////////////
 // Get_proj_subspace_Test
@@ -409,6 +418,7 @@
   int32_t *flt0_;
   int32_t *flt1_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTest);
 
 void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) {
   int h_end = run_times != 1
@@ -524,6 +534,12 @@
 
 TEST_P(GetProjSubspaceTest, DISABLED_Speed) { RunGetProjSubspaceTest(200000); }
 
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, GetProjSubspaceTest,
+                         ::testing::Values(av1_calc_proj_params_sse4_1));
+#endif  // HAVE_SSE4_1
+
 #if HAVE_AVX2
 
 INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest,
@@ -531,4 +547,187 @@
 #endif  // HAVE_AVX2
 
 }  // namespace get_proj_subspace_test_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace get_proj_subspace_test_hbd {
+static const int kIterations = 100;
+
+typedef void (*set_get_proj_subspace_hbd)(const uint8_t *src8, int width,
+                                          int height, int src_stride,
+                                          const uint8_t *dat8, int dat_stride,
+                                          int32_t *flt0, int flt0_stride,
+                                          int32_t *flt1, int flt1_stride,
+                                          int64_t H[2][2], int64_t C[2],
+                                          const sgr_params_type *params);
+
+typedef std::tuple<const set_get_proj_subspace_hbd> GetProjSubspaceHBDTestParam;
+
+class GetProjSubspaceTestHBD
+    : public ::testing::TestWithParam<GetProjSubspaceHBDTestParam> {
+ public:
+  virtual void SetUp() {
+    target_func_ = GET_PARAM(0);
+    src_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
+    dgd_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*dgd_)));
+    ASSERT_NE(dgd_, nullptr);
+    flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*flt0_)));
+    ASSERT_NE(flt0_, nullptr);
+    flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*flt1_)));
+    ASSERT_NE(flt1_, nullptr);
+  }
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dgd_);
+    aom_free(flt0_);
+    aom_free(flt1_);
+  }
+  void RunGetProjSubspaceTestHBD(int32_t run_times);
+  void RunGetProjSubspaceTestHBD_ExtremeValues();
+
+ private:
+  set_get_proj_subspace_hbd target_func_;
+  libaom_test::ACMRandom rng_;
+  uint16_t *src_;
+  uint16_t *dgd_;
+  int32_t *flt0_;
+  int32_t *flt1_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTestHBD);
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD(int32_t run_times) {
+  int h_end = run_times != 1
+                  ? 128
+                  : ((rng_.Rand16() % MAX_DATA_BLOCK) &
+                     2147483640);  // We test for widths divisible by 8.
+  int v_end =
+      run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640);
+  const int dgd_stride = MAX_DATA_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int flt0_stride = MAX_DATA_BLOCK;
+  const int flt1_stride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  const int iters = run_times == 1 ? kIterations : 4;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = rng_.Rand16() % 4095;
+      src_[i] = rng_.Rand16() % 4095;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+
+    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+    params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+    params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_calc_proj_params_high_bd_c(src, v_end, h_end, src_stride, dgd,
+                                     dgd_stride, flt0_, flt0_stride, flt1_,
+                                     flt1_stride, H_ref, C_ref, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(src, v_end, h_end, src_stride, dgd, dgd_stride, flt0_,
+                   flt0_stride, flt1_, flt1_stride, H_test, C_test, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0],
+             params.r[1], h_end, v_end, time1, time2, time1 / time2);
+    } else {
+      ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+      ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+      ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+      ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+      ASSERT_EQ(C_ref[0], C_test[0]);
+      ASSERT_EQ(C_ref[1], C_test[1]);
+    }
+  }
+}
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD_ExtremeValues() {
+  const int h_start = 0;
+  int h_end = MAX_DATA_BLOCK;
+  const int v_start = 0;
+  int v_end = MAX_DATA_BLOCK;
+  const int dgd_stride = MAX_DATA_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int flt0_stride = MAX_DATA_BLOCK;
+  const int flt1_stride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  const int iters = kIterations;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = 0;
+      src_[i] = 4095;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    params.r[0] = 1;
+    params.r[1] = 1;
+    params.s[0] = rng_.Rand8() % MAX_RADIUS;
+    params.s[1] = rng_.Rand8() % MAX_RADIUS;
+    uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+    av1_calc_proj_params_high_bd_c(
+        src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride,
+        flt0_, flt0_stride, flt1_, flt1_stride, H_ref, C_ref, &params);
+
+    target_func_(src, h_end - h_start, v_end - v_start, src_stride, dgd,
+                 dgd_stride, flt0_, flt0_stride, flt1_, flt1_stride, H_test,
+                 C_test, &params);
+
+    ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+    ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+    ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+    ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+    ASSERT_EQ(C_ref[0], C_test[0]);
+    ASSERT_EQ(C_ref[1], C_test[1]);
+  }
+}
+
+TEST_P(GetProjSubspaceTestHBD, RandomValues) { RunGetProjSubspaceTestHBD(1); }
+
+TEST_P(GetProjSubspaceTestHBD, ExtremeValues) {
+  RunGetProjSubspaceTestHBD_ExtremeValues();
+}
+
+TEST_P(GetProjSubspaceTestHBD, DISABLED_Speed) {
+  RunGetProjSubspaceTestHBD(200000);
+}
+
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, GetProjSubspaceTestHBD,
+    ::testing::Values(av1_calc_proj_params_high_bd_sse4_1));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTestHBD,
+                         ::testing::Values(av1_calc_proj_params_high_bd_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace get_proj_subspace_test_hbd
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/test/qm_test.cc b/test/qm_test.cc
deleted file mode 100644
index d1dfbb8..0000000
--- a/test/qm_test.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include "config/aom_config.h"
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/i420_video_source.h"
-#include "test/util.h"
-
-namespace {
-
-class QMTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
-      public ::libaom_test::EncoderTest {
- protected:
-  QMTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~QMTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
-  }
-
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
-      encoder->Control(AV1E_SET_ENABLE_QM, 1);
-      encoder->Control(AV1E_SET_QM_MIN, qm_min_);
-      encoder->Control(AV1E_SET_QM_MAX, qm_max_);
-
-      encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
-    }
-  }
-
-  void DoTest(int qm_min, int qm_max) {
-    qm_min_ = qm_min;
-    qm_max_ = qm_max;
-    cfg_.kf_max_dist = 12;
-    cfg_.rc_min_quantizer = 8;
-    cfg_.rc_max_quantizer = 56;
-    cfg_.rc_end_usage = AOM_CBR;
-    cfg_.g_lag_in_frames = 6;
-    cfg_.rc_buf_initial_sz = 500;
-    cfg_.rc_buf_optimal_sz = 500;
-    cfg_.rc_buf_sz = 1000;
-    cfg_.rc_target_bitrate = 300;
-    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
-                                         288, 30, 1, 0, 15);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  }
-
-  int set_cpu_used_;
-  int qm_min_;
-  int qm_max_;
-};
-
-// encodes and decodes without a mismatch.
-TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); }
-
-// encodes and decodes without a mismatch.
-TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); }
-
-// encodes and decodes without a mismatch.
-TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); }
-
-AV1_INSTANTIATE_TEST_CASE(QMTest,
-                          ::testing::Values(::libaom_test::kRealTime,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(5, 9));
-}  // namespace
diff --git a/test/quant_test.cc b/test/quant_test.cc
new file mode 100644
index 0000000..2ef3a76
--- /dev/null
+++ b/test/quant_test.cc
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "av1/encoder/av1_quantize.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+class QMTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  QMTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~QMTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_ENABLE_QM, 1);
+      encoder->Control(AV1E_SET_QM_MIN, qm_min_);
+      encoder->Control(AV1E_SET_QM_MAX, qm_max_);
+
+      encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+    }
+  }
+
+  void DoTest(int qm_min, int qm_max) {
+    qm_min_ = qm_min;
+    qm_max_ = qm_max;
+    cfg_.kf_max_dist = 12;
+    cfg_.rc_min_quantizer = 8;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 6;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_target_bitrate = 300;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 15);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int set_cpu_used_;
+  int qm_min_;
+  int qm_max_;
+};
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); }
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); }
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); }
+
+AV1_INSTANTIATE_TEST_SUITE(QMTest,
+                           ::testing::Values(::libaom_test::kRealTime,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Range(5, 9));
+
+typedef struct {
+  const unsigned int min_q;
+  const unsigned int max_q;
+} QuantParam;
+
+const QuantParam QuantTestParams[] = {
+  { 0, 10 }, { 0, 60 }, { 20, 35 }, { 35, 50 }, { 50, 63 }
+};
+
+std::ostream &operator<<(std::ostream &os, const QuantParam &test_arg) {
+  return os << "QuantParam { min_q:" << test_arg.min_q
+            << " max_q:" << test_arg.max_q << " }";
+}
+
+/*
+ * This class is used to test whether base_qindex is within min
+ * and max quantizer range configured by user.
+ */
+class QuantizerBoundsCheckTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 QuantParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  QuantizerBoundsCheckTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        quant_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
+    quant_bound_violated_ = false;
+  }
+  virtual ~QuantizerBoundsCheckTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.rc_min_quantizer = quant_param_.min_q;
+    cfg_.rc_max_quantizer = quant_param_.max_q;
+    cfg_.g_lag_in_frames = 35;
+    if (rc_end_usage_ != AOM_Q) {
+      cfg_.rc_target_bitrate = 400;
+    }
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_LAST_QUANTIZER,
+                                    &base_qindex_);
+      min_bound_qindex_ = av1_quantizer_to_qindex(cfg_.rc_min_quantizer);
+      max_bound_qindex_ = av1_quantizer_to_qindex(cfg_.rc_max_quantizer);
+      if ((base_qindex_ < min_bound_qindex_ ||
+           base_qindex_ > max_bound_qindex_) &&
+          quant_bound_violated_ == false) {
+        quant_bound_violated_ = true;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const QuantParam quant_param_;
+  int base_qindex_;
+  int min_bound_qindex_;
+  int max_bound_qindex_;
+  bool quant_bound_violated_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(QuantizerBoundsCheckTestLarge, QuantizerBoundsCheckEncodeTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 50);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(quant_bound_violated_, false);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(QuantizerBoundsCheckTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(QuantTestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+}  // namespace
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index b40b38d..3d79cf8 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -78,19 +78,20 @@
 
 const int kTestNum = 1000;
 
-class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+template <typename CoeffType>
+class QuantizeTestBase : public ::testing::TestWithParam<QuantizeParam> {
  protected:
-  QuantizeTest()
+  QuantizeTestBase()
       : quant_ref_(GET_PARAM(0)), quant_(GET_PARAM(1)), tx_size_(GET_PARAM(2)),
         type_(GET_PARAM(3)), bd_(GET_PARAM(4)) {}
 
-  virtual ~QuantizeTest() {}
+  virtual ~QuantizeTestBase() {}
 
   virtual void SetUp() {
     qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_)));
     const int n_coeffs = coeff_num();
-    coeff_ = reinterpret_cast<tran_low_t *>(
-        aom_memalign(32, 6 * n_coeffs * sizeof(tran_low_t)));
+    coeff_ = reinterpret_cast<CoeffType *>(
+        aom_memalign(32, 6 * n_coeffs * sizeof(CoeffType)));
     InitQuantizer();
   }
 
@@ -106,15 +107,24 @@
     av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant);
   }
 
+  virtual void RunQuantizeFunc(
+      const CoeffType *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+      const int16_t *round_ptr, const int16_t *quant_ptr,
+      const int16_t *quant_shift_ptr, CoeffType *qcoeff_ptr,
+      CoeffType *qcoeff_ref_ptr, CoeffType *dqcoeff_ptr,
+      CoeffType *dqcoeff_ref_ptr, const int16_t *dequant_ptr,
+      uint16_t *eob_ref_ptr, uint16_t *eob_ptr, const int16_t *scan,
+      const int16_t *iscan) = 0;
+
   void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) {
-    tran_low_t *coeff_ptr = coeff_;
+    CoeffType *coeff_ptr = coeff_;
     const intptr_t n_coeffs = coeff_num();
 
-    tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs;
-    tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs;
+    CoeffType *qcoeff_ref = coeff_ptr + n_coeffs;
+    CoeffType *dqcoeff_ref = qcoeff_ref + n_coeffs;
 
-    tran_low_t *qcoeff = dqcoeff_ref + n_coeffs;
-    tran_low_t *dqcoeff = qcoeff + n_coeffs;
+    CoeffType *qcoeff = dqcoeff_ref + n_coeffs;
+    CoeffType *dqcoeff = qcoeff + n_coeffs;
     uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
 
     // Testing uses 2-D DCT scan order table
@@ -141,6 +151,10 @@
 
       memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref));
 
+      RunQuantizeFunc(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift,
+                      qcoeff, qcoeff_ref, dqcoeff, dqcoeff_ref, dequant,
+                      &eob[0], &eob[1], sc->scan, sc->iscan);
+
       quant_ref_(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift,
                  qcoeff_ref, dqcoeff_ref, dequant, &eob[0], sc->scan,
                  sc->iscan);
@@ -166,8 +180,8 @@
     }
   }
 
-  void CompareResults(const tran_low_t *buf_ref, const tran_low_t *buf,
-                      int size, const char *text, int q, int number) {
+  void CompareResults(const CoeffType *buf_ref, const CoeffType *buf, int size,
+                      const char *text, int q, int number) {
     int i;
     for (i = 0; i < size; ++i) {
       ASSERT_EQ(buf_ref[i], buf[i]) << text << " mismatch on test: " << number
@@ -177,7 +191,7 @@
 
   int coeff_num() const { return av1_get_max_eob(tx_size_); }
 
-  void FillCoeff(tran_low_t c) {
+  void FillCoeff(CoeffType c) {
     const int n_coeffs = coeff_num();
     for (int i = 0; i < n_coeffs; ++i) {
       coeff_[i] = c;
@@ -203,7 +217,7 @@
   void FillCoeffZero() { FillCoeff(0); }
 
   void FillCoeffConstant() {
-    tran_low_t c = GetRandomCoeff();
+    CoeffType c = GetRandomCoeff();
     FillCoeff(c);
   }
 
@@ -220,22 +234,22 @@
     coeff_[0] = -8191;
   }
 
-  tran_low_t GetRandomCoeff() {
-    tran_low_t coeff;
+  CoeffType GetRandomCoeff() {
+    CoeffType coeff;
     if (bd_ == AOM_BITS_8) {
       coeff =
           clamp(static_cast<int16_t>(rnd_.Rand16()), INT16_MIN + 1, INT16_MAX);
     } else {
-      tran_low_t min = -(1 << (7 + bd_));
-      tran_low_t max = -min - 1;
-      coeff = clamp(static_cast<tran_low_t>(rnd_.Rand31()), min, max);
+      CoeffType min = -(1 << (7 + bd_));
+      CoeffType max = -min - 1;
+      coeff = clamp(static_cast<CoeffType>(rnd_.Rand31()), min, max);
     }
     return coeff;
   }
 
   ACMRandom rnd_;
   QuanTable *qtab_;
-  tran_low_t *coeff_;
+  CoeffType *coeff_;
   QuantizeFunc quant_ref_;
   QuantizeFunc quant_;
   TX_SIZE tx_size_;
@@ -243,24 +257,45 @@
   aom_bit_depth_t bd_;
 };
 
-TEST_P(QuantizeTest, ZeroInput) {
+class FullPrecisionQuantizeTest : public QuantizeTestBase<tran_low_t> {
+  void RunQuantizeFunc(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *qcoeff_ref_ptr,
+                       tran_low_t *dqcoeff_ptr, tran_low_t *dqcoeff_ref_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ref_ptr,
+                       uint16_t *eob_ptr, const int16_t *scan,
+                       const int16_t *iscan) override {
+    quant_ref_(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+               quant_shift_ptr, qcoeff_ref_ptr, dqcoeff_ref_ptr, dequant_ptr,
+               eob_ref_ptr, scan, iscan);
+
+    ASM_REGISTER_STATE_CHECK(quant_(
+        coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+        qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan));
+  }
+};
+
+TEST_P(FullPrecisionQuantizeTest, ZeroInput) {
   FillCoeffZero();
   QuantizeRun(false);
 }
 
-TEST_P(QuantizeTest, LargeNegativeInput) {
+TEST_P(FullPrecisionQuantizeTest, LargeNegativeInput) {
   FillDcLargeNegative();
   QuantizeRun(false, 0, 1);
 }
 
-TEST_P(QuantizeTest, DcOnlyInput) {
+TEST_P(FullPrecisionQuantizeTest, DcOnlyInput) {
   FillDcOnly();
   QuantizeRun(false, 0, 1);
 }
 
-TEST_P(QuantizeTest, RandomInput) { QuantizeRun(true, 0, kTestNum); }
+TEST_P(FullPrecisionQuantizeTest, RandomInput) {
+  QuantizeRun(true, 0, kTestNum);
+}
 
-TEST_P(QuantizeTest, MultipleQ) {
+TEST_P(FullPrecisionQuantizeTest, MultipleQ) {
   for (int q = 0; q < QINDEX_RANGE; ++q) {
     QuantizeRun(true, q, kTestNum);
   }
@@ -268,12 +303,12 @@
 
 // Force the coeff to be half the value of the dequant.  This exposes a
 // mismatch found in av1_quantize_fp_sse2().
-TEST_P(QuantizeTest, CoeffHalfDequant) {
+TEST_P(FullPrecisionQuantizeTest, CoeffHalfDequant) {
   FillCoeff(16);
   QuantizeRun(false, 25, 1);
 }
 
-TEST_P(QuantizeTest, DISABLED_Speed) {
+TEST_P(FullPrecisionQuantizeTest, DISABLED_Speed) {
   tran_low_t *coeff_ptr = coeff_;
   const intptr_t n_coeffs = coeff_num();
 
@@ -408,7 +443,7 @@
              static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8)
 };
 
-INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest,
+INSTANTIATE_TEST_SUITE_P(AVX2, FullPrecisionQuantizeTest,
                          ::testing::ValuesIn(kQParamArrayAvx2));
 #endif  // HAVE_AVX2
 
@@ -499,7 +534,7 @@
              static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8)
 };
 
-INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest,
+INSTANTIATE_TEST_SUITE_P(SSE2, FullPrecisionQuantizeTest,
                          ::testing::ValuesIn(kQParamArraySSE2));
 #endif
 
@@ -514,16 +549,26 @@
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
              static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8),
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
-             static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8)
+             static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_neon,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_neon,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_c, &aom_quantize_b_neon,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_neon,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_neon,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8)
 };
 
-INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest,
+INSTANTIATE_TEST_SUITE_P(NEON, FullPrecisionQuantizeTest,
                          ::testing::ValuesIn(kQParamArrayNEON));
 #endif
 
 #if HAVE_SSSE3 && ARCH_X86_64
 INSTANTIATE_TEST_SUITE_P(
-    SSSE3, QuantizeTest,
+    SSSE3, FullPrecisionQuantizeTest,
     ::testing::Values(
         make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
                    static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
@@ -534,14 +579,14 @@
 
 #endif  // HAVE_SSSE3 && ARCH_X86_64
 
-#if HAVE_AVX && ARCH_X86_64
+#if HAVE_AVX
 INSTANTIATE_TEST_SUITE_P(
-    AVX, QuantizeTest,
+    AVX, FullPrecisionQuantizeTest,
     ::testing::Values(
         make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx,
                    static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
         make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_avx,
                    static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8)));
 
-#endif  // HAVE_AVX && ARCH_X86_64
+#endif  // HAVE_AVX
 }  // namespace
diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc
index 51bec0e..7e440c9 100644
--- a/test/reconinter_test.cc
+++ b/test/reconinter_test.cc
@@ -88,6 +88,7 @@
                     DIFFWTD_MASK_TYPE mask_type);
   libaom_test::ACMRandom rnd_;
 };  // class BuildCompDiffwtdMaskD16Test
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);
 
 void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
     buildcompdiffwtdmaskd16_func test_impl) {
diff --git a/test/resize_test.cc b/test/resize_test.cc
index bcf6794..adc9bb5 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -13,12 +13,14 @@
 #include <vector>
 #include "aom_dsp/aom_dsp_common.h"
 #include "common/tools_common.h"
+#include "av1/encoder/encoder.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/video_source.h"
 #include "test/util.h"
+#include "test/y4m_video_source.h"
 
 // Enable(1) or Disable(0) writing of the compressed bitstream.
 #define WRITE_COMPRESSED_STREAM 0
@@ -298,7 +300,7 @@
 
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
-    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 3.0);
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 4.1);
   }
 
 #if WRITE_COMPRESSED_STREAM
@@ -367,7 +369,9 @@
     : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
       public ::libaom_test::EncoderTest {
  protected:
-  ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
+  ResizeRealtimeTest()
+      : EncoderTest(GET_PARAM(0)), set_scale_mode_(false),
+        set_scale_mode2_(false) {}
   virtual ~ResizeRealtimeTest() {}
 
   virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
@@ -377,8 +381,27 @@
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
     }
+    if (set_scale_mode_) {
+      struct aom_scaling_mode mode;
+      if (video->frame() <= 20)
+        mode = { AOME_ONETWO, AOME_ONETWO };
+      else if (video->frame() <= 40)
+        mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+      else if (video->frame() > 40)
+        mode = { AOME_NORMAL, AOME_NORMAL };
+      encoder->Control(AOME_SET_SCALEMODE, &mode);
+    } else if (set_scale_mode2_) {
+      struct aom_scaling_mode mode;
+      if (video->frame() <= 20)
+        mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+      else if (video->frame() <= 40)
+        mode = { AOME_ONETWO, AOME_ONETWO };
+      else if (video->frame() > 40)
+        mode = { AOME_THREEFOUR, AOME_THREEFOUR };
+      encoder->Control(AOME_SET_SCALEMODE, &mode);
+    }
 
-    if (change_bitrate_ && video->frame() == 120) {
+    if (change_bitrate_ && video->frame() == frame_change_bitrate_) {
       change_bitrate_ = false;
       cfg_.rc_target_bitrate = 500;
       encoder->Config(&cfg_);
@@ -426,22 +449,135 @@
     // the width and height of the frame are swapped
     cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
         AOMMAX(kInitialWidth, kInitialHeight);
+    if (set_scale_mode_ || set_scale_mode2_) {
+      cfg_.rc_dropframe_thresh = 0;
+      cfg_.g_forced_max_frame_width = 1280;
+      cfg_.g_forced_max_frame_height = 1280;
+    }
   }
 
   std::vector<FrameInfo> frame_info_list_;
   int set_cpu_used_;
   bool change_bitrate_;
+  unsigned int frame_change_bitrate_;
   double mismatch_psnr_;
   int mismatch_nframes_;
+  bool set_scale_mode_;
+  bool set_scale_mode2_;
 };
 
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2, then 1/4, and then back up to originsal.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode1) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  set_scale_mode_ = true;
+  set_scale_mode2_ = false;
+  DefaultConfig();
+  change_bitrate_ = false;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const auto frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w = 1280 >> 1;
+    unsigned int expected_h = 720 >> 1;
+    if (frame > 40) {
+      expected_w = 1280;
+      expected_h = 720;
+    } else if (frame > 20 && frame <= 40) {
+      expected_w = 1280 >> 2;
+      expected_h = 720 >> 2;
+    }
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2, then 1/4, and then back up to originsal.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode1QVGA) {
+  ::libaom_test::I420VideoSource video("desktop1.320_180.yuv", 320, 180, 30, 1,
+                                       0, 80);
+  cfg_.g_w = 320;
+  cfg_.g_h = 180;
+  set_scale_mode_ = true;
+  set_scale_mode2_ = false;
+  DefaultConfig();
+  change_bitrate_ = false;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const auto frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w = 320 >> 1;
+    unsigned int expected_h = 180 >> 1;
+    if (frame > 40) {
+      expected_w = 320;
+      expected_h = 180;
+    } else if (frame > 20 && frame <= 40) {
+      expected_w = 320 >> 2;
+      expected_h = 180 >> 2;
+    }
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/4, then 1/2, and then up to 3/4.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode2) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = true;
+  DefaultConfig();
+  change_bitrate_ = false;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const auto frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w = 1280 >> 2;
+    unsigned int expected_h = 720 >> 2;
+    if (frame > 40) {
+      expected_w = (3 * 1280) >> 2;
+      expected_h = (3 * 720) >> 2;
+    } else if (frame > 20 && frame <= 40) {
+      expected_w = 1280 >> 1;
+      expected_h = 720 >> 1;
+    }
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
   video.flag_codec_ = 1;
-  DefaultConfig();
   change_bitrate_ = false;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
+  DefaultConfig();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   // Check we decoded the same number of frames as we attempted to encode
@@ -465,74 +601,35 @@
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
 // Run at low bitrate, with resize_allowed = 1, and verify that we get
 // one resize down event.
-TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDown) {
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
-  DefaultConfig();
-  cfg_.g_w = 352;
-  cfg_.g_h = 288;
+TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
+  ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
   change_bitrate_ = false;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
+  DefaultConfig();
+  // Disable dropped frames.
+  cfg_.rc_dropframe_thresh = 0;
+  // Starting bitrate low.
+  cfg_.rc_target_bitrate = 150;
+  cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+  cfg_.g_forced_max_frame_width = 1280;
+  cfg_.g_forced_max_frame_height = 1280;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   unsigned int last_w = cfg_.g_w;
   unsigned int last_h = cfg_.g_h;
-  int resize_count = 0;
+  int resize_down_count = 0;
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
     if (info->w != last_w || info->h != last_h) {
       // Verify that resize down occurs.
-      ASSERT_LT(info->w, last_w);
-      ASSERT_LT(info->h, last_h);
-      last_w = info->w;
-      last_h = info->h;
-      resize_count++;
-    }
-  }
-
-#if CONFIG_AV1_DECODER
-  // Verify that we get 1 resize down event in this test.
-  ASSERT_EQ(1, resize_count) << "Resizing should occur.";
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-#else
-  printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
-#endif
-}
-
-// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
-// Start at low target bitrate, raise the bitrate in the middle of the clip,
-// scaling-up should occur after bitrate changed.
-TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDownUpChangeBitRate) {
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 359);
-  DefaultConfig();
-  cfg_.g_w = 352;
-  cfg_.g_h = 288;
-  change_bitrate_ = true;
-  mismatch_psnr_ = 0.0;
-  mismatch_nframes_ = 0;
-  // Disable dropped frames.
-  cfg_.rc_dropframe_thresh = 0;
-  // Starting bitrate low.
-  cfg_.rc_target_bitrate = 80;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-
-  unsigned int last_w = cfg_.g_w;
-  unsigned int last_h = cfg_.g_h;
-  int resize_count = 0;
-  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
-       info != frame_info_list_.end(); ++info) {
-    if (info->w != last_w || info->h != last_h) {
-      resize_count++;
-      if (resize_count == 1) {
-        // Verify that resize down occurs.
-        ASSERT_LT(info->w, last_w);
-        ASSERT_LT(info->h, last_h);
-      } else if (resize_count == 2) {
-        // Verify that resize up occurs.
-        ASSERT_GT(info->w, last_w);
-        ASSERT_GT(info->h, last_h);
+      if (info->w < last_w && info->h < last_h) {
+        resize_down_count++;
       }
       last_w = info->w;
       last_h = info->h;
@@ -540,8 +637,69 @@
   }
 
 #if CONFIG_AV1_DECODER
-  // Verify that we get 2 resize events in this test.
-  ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
+  // Verify that we get at lease 1 resize down event in this test.
+  ASSERT_GE(resize_down_count, 1) << "Resizing should occur.";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
+#endif
+}
+
+// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
+// Start at low target bitrate, raise the bitrate in the middle of the clip
+// (at frame# = frame_change_bitrate_), scaling-up should occur after bitrate
+// is increased.
+TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
+  ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
+  change_bitrate_ = true;
+  frame_change_bitrate_ = 120;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  DefaultConfig();
+  // Disable dropped frames.
+  cfg_.rc_dropframe_thresh = 0;
+  // Starting bitrate low.
+  cfg_.rc_target_bitrate = 150;
+  cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+  cfg_.g_forced_max_frame_width = 1280;
+  cfg_.g_forced_max_frame_height = 1280;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  unsigned int last_w = cfg_.g_w;
+  unsigned int last_h = cfg_.g_h;
+  unsigned int frame_number = 0;
+  int resize_down_count = 0;
+  int resize_up_count = 0;
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    if (info->w != last_w || info->h != last_h) {
+      if (frame_number < frame_change_bitrate_) {
+        // Verify that resize down occurs, before bitrate is increased.
+        ASSERT_LT(info->w, last_w);
+        ASSERT_LT(info->h, last_h);
+        resize_down_count++;
+      } else {
+        // Verify that resize up occurs, after bitrate is increased.
+        ASSERT_GT(info->w, last_w);
+        ASSERT_GT(info->h, last_h);
+        resize_up_count++;
+      }
+      last_w = info->w;
+      last_h = info->h;
+    }
+    frame_number++;
+  }
+
+#if CONFIG_AV1_DECODER
+  // Verify that we get at least 2 resize events in this test.
+  ASSERT_GE(resize_up_count, 1) << "Resizing up should occur at lease once.";
+  ASSERT_GE(resize_down_count, 1)
+      << "Resizing down should occur at lease once.";
   EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 #else
   printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
@@ -632,13 +790,70 @@
   }
 }
 
-AV1_INSTANTIATE_TEST_CASE(ResizeTest,
-                          ::testing::Values(::libaom_test::kRealTime));
-AV1_INSTANTIATE_TEST_CASE(ResizeInternalTestLarge,
-                          ::testing::Values(::libaom_test::kOnePassGood));
-AV1_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 9));
-AV1_INSTANTIATE_TEST_CASE(ResizeCspTest,
-                          ::testing::Values(::libaom_test::kRealTime));
+// This class is used to check if there are any fatal
+// failures while encoding with resize-mode > 0
+class ResizeModeTestLarge
+    : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+                                                 int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ResizeModeTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        resize_mode_(GET_PARAM(2)), resize_denominator_(GET_PARAM(3)),
+        resize_kf_denominator_(GET_PARAM(4)), cpu_used_(GET_PARAM(5)) {}
+  virtual ~ResizeModeTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.rc_resize_mode = resize_mode_;
+    cfg_.rc_resize_denominator = resize_denominator_;
+    cfg_.rc_resize_kf_denominator = resize_kf_denominator_;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+    }
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  int resize_mode_;
+  int resize_denominator_;
+  int resize_kf_denominator_;
+  int cpu_used_;
+};
+
+TEST_P(ResizeModeTestLarge, ResizeModeTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 30);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ResizeTest,
+                           ::testing::Values(::libaom_test::kRealTime));
+AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood));
+AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(5, 10));
+AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest,
+                           ::testing::Values(::libaom_test::kRealTime));
+
+// TODO(anyone): Enable below test once resize issues are fixed
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ResizeModeTestLarge);
+// AV1_INSTANTIATE_TEST_SUITE(
+//    ResizeModeTestLarge,
+//    ::testing::Values(::libaom_test::kOnePassGood,
+//    ::libaom_test::kTwoPassGood),
+//    ::testing::Values(1, 2), ::testing::Values(8, 12, 16),
+//    ::testing::Values(8, 12, 16), ::testing::Range(2, 7));
 }  // namespace
diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
index f14d124..fade1e8 100644
--- a/test/rt_end_to_end_test.cc
+++ b/test/rt_end_to_end_test.cc
@@ -32,20 +32,23 @@
 std::unordered_map<std::string,
                    std::unordered_map<int, std::unordered_map<int, double>>>
     kPsnrThreshold = { { "park_joy_90p_8_420.y4m",
-                         { { 5, { { 0, 35.4 }, { 3, 36.4 } } },
+                         { { 5, { { 0, 35.4 }, { 3, 36.3 } } },
                            { 6, { { 0, 35.3 }, { 3, 36.2 } } },
                            { 7, { { 0, 34.9 }, { 3, 35.8 } } },
-                           { 8, { { 0, 35.0 }, { 3, 35.8 } } } } },
+                           { 8, { { 0, 35.0 }, { 3, 35.8 } } },
+                           { 9, { { 0, 34.9 }, { 3, 35.5 } } } } },
                        { "paris_352_288_30.y4m",
                          { { 5, { { 0, 36.2 }, { 3, 36.7 } } },
-                           { 6, { { 0, 36.1 }, { 3, 36.6 } } },
+                           { 6, { { 0, 36.1 }, { 3, 36.5 } } },
                            { 7, { { 0, 35.5 }, { 3, 36.0 } } },
-                           { 8, { { 0, 36.0 }, { 3, 36.5 } } } } },
+                           { 8, { { 0, 36.0 }, { 3, 36.5 } } },
+                           { 9, { { 0, 35.5 }, { 3, 36.1 } } } } },
                        { "niklas_1280_720_30.y4m",
-                         { { 5, { { 0, 34.6 }, { 3, 34.6 } } },
+                         { { 5, { { 0, 34.4 }, { 3, 34.4 } } },
                            { 6, { { 0, 34.2 }, { 3, 34.2 } } },
-                           { 7, { { 0, 33.7 }, { 3, 33.6 } } },
-                           { 8, { { 0, 33.6 }, { 3, 33.4 } } } } } };
+                           { 7, { { 0, 33.6 }, { 3, 33.6 } } },
+                           { 8, { { 0, 33.5 }, { 3, 33.5 } } },
+                           { 9, { { 0, 33.4 }, { 3, 33.4 } } } } } };
 
 typedef struct {
   const char *filename;
@@ -106,12 +109,15 @@
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_);
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
       encoder->Control(AV1E_SET_ROW_MT, 1);
+      encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
     }
   }
 
@@ -161,14 +167,14 @@
 
 TEST_P(RTEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors),
-                          ::testing::Range(5, 9),
-                          ::testing::Values<unsigned int>(0, 3),
-                          ::testing::Values(1), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors),
+                           ::testing::Range(5, 10),
+                           ::testing::Values<unsigned int>(0, 3),
+                           ::testing::Values(1), ::testing::Values(1));
 
-AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestThreaded,
-                          ::testing::ValuesIn(kTestVectors),
-                          ::testing::Range(5, 9),
-                          ::testing::Values<unsigned int>(0, 3),
-                          ::testing::Range(2, 5), ::testing::Range(2, 5));
+AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTestThreaded,
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Range(5, 10),
+                           ::testing::Values<unsigned int>(0, 3),
+                           ::testing::Range(2, 5), ::testing::Range(2, 5));
 }  // namespace
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 0bdbf37..afd84a8 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -31,6 +31,10 @@
                                    const uint8_t *ref_ptr, int ref_stride);
 typedef std::tuple<int, int, SadMxNFunc, int> SadMxNParam;
 
+typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride);
+typedef std::tuple<int, int, SadSkipMxNFunc, int> SadSkipMxNParam;
+
 typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   const uint8_t *second_pred);
@@ -60,6 +64,11 @@
                              uint32_t *sad_array);
 typedef std::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
 
+typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_ptr[], int ref_stride,
+                                 uint32_t *sad_array);
+typedef std::tuple<int, int, SadSkipMxNx4Func, int> SadSkipMxNx4Param;
+
 typedef void (*SadMxNx4AvgFunc)(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *const ref_ptr[], int ref_stride,
                                 const uint8_t *second_pred,
@@ -74,7 +83,7 @@
   SADTestBase(int width, int height, int bit_depth)
       : width_(width), height_(height), bd_(bit_depth) {}
 
-  static void SetUpTestCase() {
+  static void SetUpTestSuite() {
     source_data8_ = reinterpret_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kDataBlockSize));
     reference_data8_ = reinterpret_cast<uint8_t *>(
@@ -97,7 +106,7 @@
         aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
   }
 
-  static void TearDownTestCase() {
+  static void TearDownTestSuite() {
     aom_free(source_data8_);
     source_data8_ = NULL;
     aom_free(reference_data8_);
@@ -182,6 +191,31 @@
     return sad;
   }
 
+  // Sum of Absolute Differences Skip rows. Given two blocks,
+  // calculate the absolute  difference between two pixels in the same
+  // relative location every other row; accumulate and double the result at the
+  // end.
+  unsigned int ReferenceSADSkip(int block_idx) {
+    unsigned int sad = 0;
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+    for (int h = 0; h < height_; h += 2) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          sad += abs(source8[h * source_stride_ + w] -
+                     reference8[h * reference_stride_ + w]);
+        } else {
+          sad += abs(source16[h * source_stride_ + w] -
+                     reference16[h * reference_stride_ + w]);
+        }
+      }
+    }
+    return sad * 2;
+  }
+
   // Sum of Absolute Differences Average. Given two blocks, and a prediction
   // calculate the absolute difference between one pixel and average of the
   // corresponding and predicted pixels; accumulate.
@@ -343,6 +377,50 @@
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
+
+  void SpeedSAD() {
+    int test_count = 2000000;
+    unsigned int exp_sad[4];
+    while (test_count > 0) {
+      SADs(exp_sad);
+      test_count -= 1;
+    }
+  }
+};
+
+class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+                      public SADTestBase {
+ public:
+  SADSkipx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  void SADs(unsigned int *results) {
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(
+        source_data_, source_stride_, references, reference_stride_, results));
+  }
+
+  void CheckSADs() {
+    unsigned int reference_sad, exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 4; ++block) {
+      reference_sad = ReferenceSADSkip(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+
+  void SpeedSAD() {
+    int test_count = 2000000;
+    unsigned int exp_sad[4];
+    while (test_count > 0) {
+      SADs(exp_sad);
+      test_count -= 1;
+    }
+  }
 };
 
 class SADx4AvgTest : public ::testing::WithParamInterface<SadMxNx4AvgParam>,
@@ -412,6 +490,37 @@
   }
 };
 
+class SADSkipTest : public ::testing::WithParamInterface<SadMxNParam>,
+                    public SADTestBase {
+ public:
+  SADSkipTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int SAD(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADSkip(0);
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+
+  void SpeedSAD() {
+    int test_count = 20000000;
+    while (test_count > 0) {
+      SAD(0);
+      test_count -= 1;
+    }
+  }
+};
+
 class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
                    public SADTestBase {
  public:
@@ -501,6 +610,7 @@
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdSADTest);
 
 class DistWtdSADavgTest
     : public ::testing::WithParamInterface<DistWtdSadMxNAvgParam>,
@@ -607,6 +717,62 @@
 }
 #endif
 
+TEST_P(SADSkipTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 2000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    CheckSAD();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
+#if SPEED_TEST
+TEST_P(SADSkipTest, Speed) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  SpeedSAD();
+  source_stride_ = tmp_stride;
+}
+#endif
+
 TEST_P(SADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
@@ -854,6 +1020,101 @@
   source_data_ = tmp_source_data;
 }
 
+#if SPEED_TEST
+TEST_P(SADx4Test, Speed) {
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  SpeedSAD();
+}
+#endif
+
+// SADSkipx4
+TEST_P(SADSkipx4Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 1000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(GetReference(0), reference_stride_);
+    FillRandom(GetReference(1), reference_stride_);
+    FillRandom(GetReference(2), reference_stride_);
+    FillRandom(GetReference(3), reference_stride_);
+    CheckSADs();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, SrcAlignedByWidth) {
+  uint8_t *tmp_source_data = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
+#if SPEED_TEST
+TEST_P(SADSkipx4Test, Speed) {
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  SpeedSAD();
+}
+#endif
+
 using std::make_tuple;
 
 #if SPEED_TEST
@@ -988,6 +1249,7 @@
   make_tuple(4, 8, &aom_highbd_sad4x8_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_c, 12),
 #endif  // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_c, -1),
   make_tuple(16, 64, &aom_sad16x64_c, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1018,9 +1280,112 @@
   make_tuple(16, 4, &aom_highbd_sad16x4_c, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16_c, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
+const SadSkipMxNParam skip_c_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_c, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_c, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_c, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_c, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_c, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_c, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_c, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_c, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_c, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_c, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_c, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16_c, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8_c, -1),
+  make_tuple(8, 4, &aom_sad_skip_8x4_c, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8_c, -1),
+  make_tuple(4, 4, &aom_sad_skip_4x4_c, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16_c, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_c, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_c, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32_c, -1),
+  make_tuple(16, 4, &aom_sad_skip_16x4_c, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16_c, -1),
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 8),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 8),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 8),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 8),
+#endif
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 10),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 10),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 10),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 10),
+#endif
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 12),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 12),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 12),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests));
+
 const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_c, -1),
@@ -1088,6 +1453,7 @@
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12),
 #endif  // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_avg_c, -1),
   make_tuple(16, 64, &aom_sad16x64_avg_c, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1118,6 +1484,7 @@
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
@@ -1140,12 +1507,14 @@
   make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
@@ -1169,12 +1538,14 @@
   make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1),
   make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_c, -1),
   make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_c, -1),
   make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_c, -1),
   make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_c, -1),
   make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_c, -1),
   make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_c, -1),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(C, DistWtdSADavgTest,
@@ -1247,6 +1618,7 @@
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12),
 #endif
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16x4d_c, -1),
   make_tuple(16, 64, &aom_sad16x64x4d_c, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1277,9 +1649,112 @@
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
+const SadMxNx4Param skip_x4d_c_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_c, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_c, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_c, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_c, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_c, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_c, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_c, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_c, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_c, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_c, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_c, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16x4d_c, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_c, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8x4d_c, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_c, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_c, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_c, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32x4d_c, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16x4d_c, -1),
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 8),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 8),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 8),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 8),
+#endif
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 10),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 10),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 10),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 10),
+#endif
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 12),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 12),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 12),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 12),
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_c_tests));
+
 const SadMxNx4AvgParam x4d_avg_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_avg_c, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avg_c, -1),
@@ -1297,12 +1772,14 @@
   make_tuple(8, 4, &aom_sad8x4x4d_avg_c, -1),
   make_tuple(4, 8, &aom_sad4x8x4d_avg_c, -1),
   make_tuple(4, 4, &aom_sad4x4x4d_avg_c, -1),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16x4d_avg_c, -1),
   make_tuple(16, 64, &aom_sad16x64x4d_avg_c, -1),
   make_tuple(32, 8, &aom_sad32x8x4d_avg_c, -1),
   make_tuple(8, 32, &aom_sad8x32x4d_avg_c, -1),
   make_tuple(16, 4, &aom_sad16x4x4d_avg_c, -1),
   make_tuple(4, 16, &aom_sad4x16x4d_avg_c, -1),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(C, SADx4AvgTest, ::testing::ValuesIn(x4d_avg_c_tests));
 
@@ -1310,6 +1787,7 @@
 // ARM functions
 #if HAVE_NEON
 const SadMxNParam neon_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128_neon, -1),
   make_tuple(64, 64, &aom_sad64x64_neon, -1),
   make_tuple(32, 32, &aom_sad32x32_neon, -1),
   make_tuple(16, 16, &aom_sad16x16_neon, -1),
@@ -1326,6 +1804,57 @@
   make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+const SadSkipMxNParam skip_neon_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_neon, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_neon, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_neon, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_neon, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_neon, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_neon, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_neon, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_neon, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_neon, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_neon, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_neon, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16_neon, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8_neon, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16_neon, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_neon, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_neon, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32_neon, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16_neon, -1),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest,
+                         ::testing::ValuesIn(skip_neon_tests));
+
+const SadSkipMxNx4Param skip_x4d_neon_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_neon, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_neon, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_neon, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_neon, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_neon, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_neon, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_neon, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_neon, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_neon, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16x4d_neon, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8x4d_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32x4d_neon, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16x4d_neon, -1),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_neon_tests));
 #endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
@@ -1389,6 +1918,7 @@
   make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12),
 #endif
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64_sse2, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1419,9 +1949,94 @@
   make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
+const SadSkipMxNParam skip_sse2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_sse2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_sse2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_sse2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_sse2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_sse2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_sse2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_sse2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_sse2, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_sse2, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_sse2, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_sse2, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16_sse2, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8_sse2, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8_sse2, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16_sse2, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_sse2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_sse2, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32_sse2, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16_sse2, -1),
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 8),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 8),
+#endif
+
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 10),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 10),
+#endif
+
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 12),
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest,
+                         ::testing::ValuesIn(skip_sse2_tests));
+
 const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1),
@@ -1480,6 +2095,7 @@
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12),
 #endif
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1510,6 +2126,7 @@
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
@@ -1571,6 +2188,7 @@
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12),
 #endif
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16x4d_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64x4d_sse2, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1601,9 +2219,92 @@
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 12),
 #endif
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 
+const SadSkipMxNx4Param skip_x4d_sse2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_sse2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_sse2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_sse2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_sse2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_sse2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_sse2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_sse2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_sse2, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_sse2, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_sse2, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_sse2, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16x4d_sse2, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_sse2, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8x4d_sse2, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_sse2, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_sse2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_sse2, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32x4d_sse2, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16x4d_sse2, -1),
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 8),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 8),
+#endif
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 10),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 10),
+#endif
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 12),
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_sse2_tests));
+
 const SadMxNx4AvgParam x4d_avg_sse2_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_avg_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avg_sse2, -1),
@@ -1621,12 +2322,14 @@
   make_tuple(8, 4, &aom_sad8x4x4d_avg_sse2, -1),
   make_tuple(4, 8, &aom_sad4x8x4d_avg_sse2, -1),
   make_tuple(4, 4, &aom_sad4x4x4d_avg_sse2, -1),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16x4d_avg_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64x4d_avg_sse2, -1),
   make_tuple(32, 8, &aom_sad32x8x4d_avg_sse2, -1),
   make_tuple(8, 32, &aom_sad8x32x4d_avg_sse2, -1),
   make_tuple(16, 4, &aom_sad16x4x4d_avg_sse2, -1),
   make_tuple(4, 16, &aom_sad4x16x4d_avg_sse2, -1),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADx4AvgTest,
                          ::testing::ValuesIn(x4d_avg_sse2_tests));
@@ -1658,13 +2361,14 @@
   make_tuple(32, 8, &aom_sad32xh_sse2, -1),
   make_tuple(16, 64, &aom_sad16xh_sse2, -1),
   make_tuple(64, 16, &aom_sad64xh_sse2, -1),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(16, 64, &aom_sad16xh_sse2, -1),
   make_tuple(64, 16, &aom_sad64xh_sse2, -1),
   make_tuple(8, 32, &aom_sad8xh_sse2, -1),
   make_tuple(32, 8, &aom_sad32xh_sse2, -1),
   make_tuple(4, 16, &aom_sad4xh_sse2, -1),
   make_tuple(16, 4, &aom_sad16xh_sse2, -1),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, DistWtdSADTest,
                          ::testing::ValuesIn(dist_wtd_sad_sse2_tests));
@@ -1694,13 +2398,14 @@
   make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
@@ -1723,13 +2428,14 @@
   make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1),
   make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1),
   make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_ssse3, -1),
   make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_ssse3, -1),
   make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_ssse3, -1),
   make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_ssse3, -1),
   make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_ssse3, -1),
   make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_ssse3, -1),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdSADavgTest,
                          ::testing::ValuesIn(dist_wtd_avg_ssse3_tests));
@@ -1784,6 +2490,7 @@
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 10),
   make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 12),
@@ -1797,9 +2504,66 @@
   make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 10),
   make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 12),
 #endif
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
 
+const SadSkipMxNParam skip_avx2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_avx2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_avx2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_avx2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_avx2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_avx2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_avx2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_avx2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 8),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 10),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 12),
+#endif
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest,
+                         ::testing::ValuesIn(skip_avx2_tests));
+
 const SadMxNAvgParam avg_avx2_tests[] = {
   make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1),
@@ -1844,6 +2608,7 @@
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 10),
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 12),
@@ -1857,20 +2622,96 @@
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 10),
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 12),
 #endif
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
+const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_avx2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_avx2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_avx2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_avx2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_avx2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1),
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 8),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 10),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 8),
+
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 10),
+
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 12),
+#endif
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_avx2_tests));
+
 const SadMxNx4Param x4d_avx2_tests[] = {
   make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
   make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1),
-  make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
   make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1),
   make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1),
   make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
-  make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
   make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
+  make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
+#endif
+
 #if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10),
@@ -1906,6 +2747,7 @@
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 8),
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 10),
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 12),
@@ -1919,6 +2761,7 @@
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 10),
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 12),
 #endif
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 #endif  // HAVE_AVX2
diff --git a/test/sb_multipass_test.cc b/test/sb_multipass_test.cc
index 0ca76ab..d2f56db 100644
--- a/test/sb_multipass_test.cc
+++ b/test/sb_multipass_test.cc
@@ -147,7 +147,7 @@
 
 TEST_P(AV1SBMultipassTest, TwoPassMatchTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(AV1SBMultipassTest, ::testing::Range(0, 6),
-                          ::testing::Bool());
+AV1_INSTANTIATE_TEST_SUITE(AV1SBMultipassTest, ::testing::Range(4, 6),
+                           ::testing::Bool());
 
 }  // namespace
diff --git a/test/scalability_test.cc b/test/scalability_test.cc
index b399188..d38d8ab 100644
--- a/test/scalability_test.cc
+++ b/test/scalability_test.cc
@@ -75,7 +75,7 @@
 
 TEST_P(ScalabilityTest, TestNoMismatch3SpatialLayers) { DoTest(3); }
 
-AV1_INSTANTIATE_TEST_CASE(ScalabilityTest,
-                          ::testing::Values(::libaom_test::kRealTime));
+AV1_INSTANTIATE_TEST_SUITE(ScalabilityTest,
+                           ::testing::Values(::libaom_test::kRealTime));
 
 }  // namespace
diff --git a/test/screen_content_test.cc b/test/screen_content_test.cc
new file mode 100644
index 0000000..ecf2ef5
--- /dev/null
+++ b/test/screen_content_test.cc
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_codec.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
+
+namespace {
+// This class is used to validate if screen_content_tools are turned on
+// appropriately.
+class ScreenContentToolsTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ScreenContentToolsTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        rc_end_usage_(GET_PARAM(2)) {
+    is_screen_content_violated_ = true;
+    tune_content_ = AOM_CONTENT_DEFAULT;
+  }
+  virtual ~ScreenContentToolsTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.g_profile = 0;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      aom_screen_content_tools_info sc_info;
+
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+                                    &sc_info);
+      if (sc_info.allow_screen_content_tools == 1) {
+        is_screen_content_violated_ = false;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  bool is_screen_content_violated_;
+  int tune_content_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(ScreenContentToolsTestLarge, ScreenContentToolsTest) {
+  // force screen content tools on
+  ::libaom_test::Y4mVideoSource video_nonsc("park_joy_90p_8_444.y4m", 0, 1);
+  cfg_.g_profile = 1;
+  tune_content_ = AOM_CONTENT_SCREEN;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_nonsc));
+  ASSERT_EQ(is_screen_content_violated_, false)
+      << "Failed for tune_content_ = AOM_CONTENT_SCREEN";
+
+  // Don't force screen content, however as the input is screen content
+  // allow_screen_content_tools should still be turned on
+  ::libaom_test::Y4mVideoSource video_sc("desktop_credits.y4m", 0, 1);
+  cfg_.g_profile = 1;
+  is_screen_content_violated_ = true;
+  tune_content_ = AOM_CONTENT_DEFAULT;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_sc));
+  ASSERT_EQ(is_screen_content_violated_, false)
+      << "Failed detection of screen content";
+
+  // TODO(anyone): Enable below test once low resolution screen content
+  // detection issues are fixed.
+  // low resolution test
+  //  ::libaom_test::Y4mVideoSource video_sc("screendata.y4m", 0, 1);
+  //  cfg_.g_profile = 0;
+  //  is_screen_content_violated_ = true;
+  //  tune_content_ = AOM_CONTENT_DEFAULT;
+  //  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_sc));
+  //  ASSERT_EQ(is_screen_content_violated_, false)
+  //      << "Failed detection of screen content(lowres)";
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ScreenContentToolsTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(AOM_Q));
+}  // namespace
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index d65cce5..48ec461 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -197,6 +197,7 @@
  private:
   SgrFunc tst_fun_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1SelfguidedFilterTest);
 
 TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
 TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
@@ -388,6 +389,7 @@
  private:
   SgrFunc tst_fun_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdSelfguidedFilterTest);
 
 TEST_P(AV1HighbdSelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
 TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
diff --git a/test/set_maps.sh b/test/set_maps.sh
index 4f59b06..b79357a 100755
--- a/test/set_maps.sh
+++ b/test/set_maps.sh
@@ -36,7 +36,7 @@
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/test/simple_decoder.sh b/test/simple_decoder.sh
index 5f39ad2..9b1aea1 100755
--- a/test/simple_decoder.sh
+++ b/test/simple_decoder.sh
@@ -36,7 +36,7 @@
   fi
 
   eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh
index 5cd6b46..dfb1a1b 100755
--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh
@@ -36,7 +36,7 @@
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 5 \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/test/sse_sum_test.cc b/test/sse_sum_test.cc
new file mode 100644
index 0000000..3e24e89
--- /dev/null
+++ b/test/sse_sum_test.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+namespace {
+const int kNumIterations = 10000;
+
+typedef uint64_t (*SSI16Func)(const int16_t *src, int src_stride, int width,
+                              int height, int *sum);
+typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
+
+class SumSSETest : public ::testing::TestWithParam<TestFuncs> {
+ public:
+  virtual ~SumSSETest() {}
+  virtual void SetUp() {
+    params_ = this->GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src_);
+  }
+  void RunTest(int isRandom);
+  void RunSpeedTest();
+
+  void GenRandomData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit);
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    const int val = rnd_(2) ? limit - 1 : -(limit - 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = val;
+      }
+    }
+  }
+
+ protected:
+  TestFuncs params_;
+  int16_t *src_;
+  ACMRandom rnd_;
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSSETest);
+
+void SumSSETest::RunTest(int isRandom) {
+  for (int k = 0; k < kNumIterations; k++) {
+    const int width = 4 * (rnd_(31) + 1);   // Up to 128x128
+    const int height = 4 * (rnd_(31) + 1);  // Up to 128x128
+    int stride = 4 << rnd_(7);              // Up to 256 stride
+    while (stride < width) {                // Make sure it's valid
+      stride = 4 << rnd_(7);
+    }
+    if (isRandom) {
+      GenRandomData(width, height, stride);
+    } else {
+      GenExtremeData(width, height, stride);
+    }
+    int sum_ref = 0, sum_tst = 0;
+    const uint64_t sse_ref =
+        params_.ref_func(src_, stride, width, height, &sum_ref);
+    const uint64_t sse_tst =
+        params_.tst_func(src_, stride, width, height, &sum_tst);
+
+    EXPECT_EQ(sse_ref, sse_tst)
+        << "Error: SumSSETest [" << width << "x" << height
+        << "] C SSE does not match optimized output.";
+    EXPECT_EQ(sum_ref, sum_tst)
+        << "Error: SumSSETest [" << width << "x" << height
+        << "] C Sum does not match optimized output.";
+  }
+}
+
+void SumSSETest::RunSpeedTest() {
+  for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) {
+    const int width = block_size_wide[block];   // Up to 128x128
+    const int height = block_size_high[block];  // Up to 128x128
+    int stride = 4 << rnd_(7);                  // Up to 256 stride
+    while (stride < width) {                    // Make sure it's valid
+      stride = 4 << rnd_(7);
+    }
+    GenExtremeData(width, height, stride);
+    const int num_loops = 1000000000 / (width + height);
+    int sum_ref = 0, sum_tst = 0;
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      params_.ref_func(src_, stride, width, height, &sum_ref);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("SumSquaresTest C %3dx%-3d: %7.2f ns\n", width, height,
+           1000.0 * elapsed_time / num_loops);
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+    for (int i = 0; i < num_loops; ++i)
+      params_.tst_func(src_, stride, width, height, &sum_tst);
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+    printf("SumSquaresTest Test %3dx%-3d: %7.2f ns\n", width, height,
+           1000.0 * elapsed_time1 / num_loops);
+  }
+}
+
+TEST_P(SumSSETest, OperationCheck) {
+  RunTest(1);  // GenRandomData
+}
+
+TEST_P(SumSSETest, ExtremeValues) {
+  RunTest(0);  // GenExtremeData
+}
+
+TEST_P(SumSSETest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, SumSSETest,
+                         ::testing::Values(TestFuncs(
+                             &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_sse2)));
+
+#endif  // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SumSSETest,
+                         ::testing::Values(TestFuncs(
+                             &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_avx2)));
+#endif  // HAVE_AVX2
+
+}  // namespace
diff --git a/test/still_picture_test.cc b/test/still_picture_test.cc
new file mode 100644
index 0000000..9800480
--- /dev/null
+++ b/test/still_picture_test.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+// This class is used to test the presence of still picture feature.
+class StillPicturePresenceTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  StillPicturePresenceTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        enable_full_header_(GET_PARAM(2)) {
+    still_picture_coding_violated_ = false;
+  }
+  virtual ~StillPicturePresenceTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.g_threads = 1;
+    cfg_.full_still_picture_hdr = enable_full_header_;
+    cfg_.g_limit = 1;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AV1E_SET_FORCE_VIDEO_MODE, 0);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_STILL_PICTURE,
+                                    &still_pic_info_);
+      if (still_pic_info_.is_still_picture != 1) {
+        still_picture_coding_violated_ = true;
+      }
+      if (still_pic_info_.is_reduced_still_picture_hdr == enable_full_header_) {
+        /* If full_still_picture_header is enabled in encoder config but
+         * bitstream contains reduced_still_picture_header set, then set
+         * still_picture_coding_violated_ to true.
+         * Similarly, if full_still_picture_header is disabled in encoder config
+         * but bitstream contains reduced_still_picture_header not set, then set
+         * still_picture_coding_violated_ to true.
+         */
+        still_picture_coding_violated_ = true;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  bool still_picture_coding_violated_;
+  int enable_full_header_;
+  aom_still_picture_info still_pic_info_;
+  aom_rc_mode end_usage_check_;
+};
+
+TEST_P(StillPicturePresenceTest, StillPictureEncodePresenceTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(still_picture_coding_violated_, false);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(StillPicturePresenceTest,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(1, 0));
+}  // namespace
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 4001e8b..9ac56fc 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -159,6 +159,7 @@
   uint8_t *pred_;
   int16_t *diff_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HBDSubtractBlockTest);
 
 void AV1HBDSubtractBlockTest::CheckResult() {
   const int test_num = 100;
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 8845466..4f26a3d 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -85,6 +85,7 @@
   int16_t *src_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest);
 
 void SumSquaresTest::RunTest(int isRandom) {
   int failed = 0;
@@ -165,6 +166,15 @@
 
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SumSquaresTest,
+    ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+                                &aom_sum_squares_2d_i16_neon)));
+
+#endif  // HAVE_NEON
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, SumSquaresTest,
@@ -184,6 +194,7 @@
   static const int kIterations = 1000;
   static const int kMaxSize = 256;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquares1DTest);
 
 TEST_P(SumSquares1DTest, RandomValues) {
   DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
@@ -303,6 +314,7 @@
   uint8_t *ref_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest);
 
 void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
   int failed = 0;
@@ -472,6 +484,7 @@
   int16_t *src_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSE_Sum_Test);
 
 void SSE_Sum_Test::RunTest(int isRandom, int width, int height, int run_times) {
   aom_usec_timer ref_timer, test_timer;
@@ -619,6 +632,7 @@
   uint8_t *src_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Lowbd2dVarTest);
 
 void Lowbd2dVarTest::RunTest(int isRandom) {
   int failed = 0;
@@ -749,6 +763,7 @@
   uint16_t *src_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Highbd2dVarTest);
 
 void Highbd2dVarTest::RunTest(int isRandom) {
   int failed = 0;
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 024a18b..b164286 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -94,6 +94,7 @@
   int n_tile_cols_;
   int n_tile_rows_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SuperframeTest);
 
 TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
   sf_count_max_ = 0;  // early exit on successful test.
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 28e517b..e773511 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <vector>
 #include "config/aom_config.h"
-
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/datarate_test.h"
@@ -20,10 +20,20 @@
 #include "test/y4m_video_source.h"
 #include "aom/aom_codec.h"
 #include "av1/common/enums.h"
+#include "av1/encoder/encoder.h"
 
 namespace datarate_test {
 namespace {
 
+struct FrameInfo {
+  FrameInfo(aom_codec_pts_t _pts, unsigned int _w, unsigned int _h)
+      : pts(_pts), w(_w), h(_h) {}
+
+  aom_codec_pts_t pts;
+  unsigned int w;
+  unsigned int h;
+};
+
 class DatarateTestSVC
     : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
                                                  unsigned int, int>,
@@ -41,6 +51,14 @@
     ResetModel();
   }
 
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t pts) {
+    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+    ++decoded_nframes_;
+  }
+
+  std::vector<FrameInfo> frame_info_list_;
+
   virtual int GetNumSpatialLayers() { return number_spatial_layers_; }
 
   virtual void ResetModel() {
@@ -56,6 +74,13 @@
     memset(&layer_id_, 0, sizeof(aom_svc_layer_id_t));
     memset(&svc_params_, 0, sizeof(aom_svc_params_t));
     memset(&ref_frame_config_, 0, sizeof(aom_svc_ref_frame_config_t));
+    drop_frames_ = 0;
+    for (int i = 0; i < 1000; i++) drop_frames_list_[i] = 1000;
+    decoded_nframes_ = 0;
+    mismatch_nframes_ = 0;
+    mismatch_psnr_ = 0.0;
+    set_frame_level_er_ = 0;
+    multi_ref_ = 0;
   }
 
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
@@ -78,10 +103,16 @@
     }
     // Set the reference/update flags, layer_id, and reference_map
     // buffer index.
-    frame_flags_ = set_layer_pattern(video->frame(), &layer_id_,
-                                     &ref_frame_config_, spatial_layer_id);
+    frame_flags_ =
+        set_layer_pattern(video->frame(), &layer_id_, &ref_frame_config_,
+                          spatial_layer_id, multi_ref_);
     encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
     encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    if (set_frame_level_er_) {
+      int mode =
+          (layer_id_.spatial_layer_id > 0 || layer_id_.temporal_layer_id > 0);
+      encoder->Control(AV1E_SET_ERROR_RESILIENT_MODE, mode);
+    }
     layer_frame_cnt_++;
     DatarateTest::PreEncodeFrameHook(video, encoder);
   }
@@ -107,10 +138,34 @@
     }
   }
 
+  virtual bool DoDecode() const {
+    if (drop_frames_ > 0) {
+      for (unsigned int i = 0; i < drop_frames_; ++i) {
+        if (drop_frames_list_[i] == (unsigned int)superframe_cnt_) {
+          std::cout << "             Skipping decoding frame: "
+                    << drop_frames_list_[i] << "\n";
+          return 0;
+        }
+      }
+    }
+    return 1;
+  }
+
+  virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+    double mismatch_psnr = compute_psnr(img1, img2);
+    mismatch_psnr_ += mismatch_psnr;
+    ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+  unsigned int GetDecodedFrames() { return decoded_nframes_; }
+
   // Layer pattern configuration.
   virtual int set_layer_pattern(int frame_cnt, aom_svc_layer_id_t *layer_id,
                                 aom_svc_ref_frame_config_t *ref_frame_config,
-                                int spatial_layer) {
+                                int spatial_layer, int multi_ref) {
+    int lag_index = 0;
+    int base_count = frame_cnt >> 2;
     layer_id->spatial_layer_id = spatial_layer;
     // Set the referende map buffer idx for the 7 references:
     // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
@@ -129,12 +184,26 @@
       //   1    3   5    7
       //     2        6
       // 0        4        8
+      if (multi_ref) {
+        // Keep golden fixed at slot 3.
+        ref_frame_config->ref_idx[3] = 3;
+        // Cyclically refresh slots 4, 5, 6, 7, for lag altref.
+        lag_index = 4 + (base_count % 4);
+        // Set the altref slot to lag_index.
+        ref_frame_config->ref_idx[6] = lag_index;
+      }
       if (frame_cnt % 4 == 0) {
         // Base layer.
         layer_id->temporal_layer_id = 0;
         // Update LAST on layer 0, reference LAST and GF.
         ref_frame_config->refresh[0] = 1;
         ref_frame_config->reference[3] = 1;
+        if (multi_ref) {
+          // Refresh GOLDEN every x ~10 base layer frames.
+          if (base_count % 10 == 0) ref_frame_config->refresh[3] = 1;
+          // Refresh lag_index slot, needed for lagging altref.
+          ref_frame_config->refresh[lag_index] = 1;
+        }
       } else if ((frame_cnt - 1) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // First top layer: no updates, only reference LAST (TL0).
@@ -150,6 +219,11 @@
         ref_frame_config->ref_idx[0] = 1;
         ref_frame_config->ref_idx[1] = 0;
       }
+      if (multi_ref) {
+        // Every frame can reference GOLDEN AND ALTREF.
+        ref_frame_config->reference[3] = 1;
+        ref_frame_config->reference[6] = 1;
+      }
     } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 2) {
       layer_id->temporal_layer_id = 0;
       if (layer_id->spatial_layer_id == 0) {
@@ -191,11 +265,18 @@
         for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
         ref_frame_config->ref_idx[0] = 2;
         ref_frame_config->refresh[2] = 1;
+        if (multi_ref) {
+          ref_frame_config->ref_idx[6] = 7;
+          ref_frame_config->reference[6] = 1;
+          if (base_count % 10 == 0) ref_frame_config->refresh[7] = 1;
+        }
       }
       // Reference GOLDEN.
       if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
     } else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 3) {
       // 3 spatial and 3 temporal layer.
+      // Overlap in the buffer slot updates: the slots 3 and 4 updated by
+      // first TL2 are reused for update in TL1 superframe.
       if (superframe_cnt_ % 4 == 0) {
         // Base temporal layer.
         layer_id->temporal_layer_id = 0;
@@ -250,56 +331,65 @@
         if (layer_id->spatial_layer_id == 0) {
           // Reference LAST.
           // Set all buffer_idx to 0.
-          // Set GOLDEN to slot 5 and update slot 5.
+          // Set GOLDEN to slot 3 and update slot 3.
           for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[3] = 5;
-          ref_frame_config->refresh[5] = 1;
+          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->refresh[3] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
-          // GOLDEN (and all other refs) to slot 5.
-          // Set LAST2 to slot 6 and update slot 6.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 5;
+          // GOLDEN (and all other refs) to slot 3.
+          // Set LAST2 to slot 4 and update slot 4.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
           ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->ref_idx[2] = 6;
-          ref_frame_config->refresh[6] = 1;
+          ref_frame_config->ref_idx[2] = 4;
+          ref_frame_config->refresh[4] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
-          // GOLDEN (and all other refs) to slot 6.
-          // Set LAST2 to slot 6 and update slot 7.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 6;
+          // GOLDEN (and all other refs) to slot 4.
+          // Set LAST2 to slot 5 and update slot 5.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
           ref_frame_config->ref_idx[0] = 2;
-          ref_frame_config->ref_idx[2] = 7;
-          ref_frame_config->refresh[7] = 1;
+          ref_frame_config->ref_idx[2] = 5;
+          ref_frame_config->refresh[5] = 1;
         }
       } else if ((superframe_cnt_ - 3) % 4 == 0) {
         // Second top temporal enhancement layer.
         layer_id->temporal_layer_id = 2;
         if (layer_id->spatial_layer_id == 0) {
-          // Set LAST to slot 5 and reference LAST.
+          // Set LAST to slot 3 and reference LAST.
           // Set GOLDEN to slot 3 and update slot 3.
           // Set all other buffer_idx to 0.
           for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 5;
+          ref_frame_config->ref_idx[0] = 3;
           ref_frame_config->ref_idx[3] = 3;
           ref_frame_config->refresh[3] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 4,
           // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
           for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 6;
+          ref_frame_config->ref_idx[0] = 4;
           ref_frame_config->ref_idx[3] = 3;
           ref_frame_config->ref_idx[1] = 4;
           ref_frame_config->refresh[4] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 5,
           // GOLDEN to slot 4. No update.
           for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 7;
+          ref_frame_config->ref_idx[0] = 5;
           ref_frame_config->ref_idx[3] = 4;
         }
       }
       // Reference GOLDEN.
       if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
+      // Allow for top spatial layer to use additional temporal reference.
+      // Additional reference is only updated on base temporal layer, every
+      // 10 TL0 frames here.
+      if (multi_ref && layer_id->spatial_layer_id == 2) {
+        ref_frame_config->ref_idx[6] = 7;
+        ref_frame_config->reference[6] = 1;
+        if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
+          ref_frame_config->refresh[7] = 1;
+      }
     }
     return layer_flags;
   }
@@ -368,6 +458,59 @@
       ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
           << " The datarate for the file is greater than target by too much!";
     }
+    // Top temporal layers are non_reference, so exlcude them from
+    // mismatch count, since loopfilter/cdef is not applied for these on
+    // encoder side, but is always applied on decoder.
+    // This means 150 = #frames(300) - #TL2_frames(150).
+    EXPECT_EQ((int)GetMismatchFrames(), 150);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLResizeTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, 400);
+    cfg_.g_w = 640;
+    cfg_.g_h = 480;
+    const int bitrate_array[2] = { 80, 100 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    unsigned int last_w = cfg_.g_w;
+    unsigned int last_h = cfg_.g_h;
+    int resize_down_count = 0;
+    for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+         info != frame_info_list_.end(); ++info) {
+      if (info->w != last_w || info->h != last_h) {
+        // Verify that resize down occurs.
+        ASSERT_LT(info->w, last_w);
+        ASSERT_LT(info->h, last_h);
+        last_w = info->w;
+        last_h = info->h;
+        resize_down_count++;
+      }
+    }
+    // Must be at least one resize down.
+    ASSERT_GE(resize_down_count, 1);
   }
 
   virtual void BasicRateTargetingSVC1TL2SLTest() {
@@ -379,7 +522,7 @@
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
 
     ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
                                          288, 30, 1, 0, 300);
@@ -408,7 +551,7 @@
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
 
     ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
                                          288, 30, 1, 0, 300);
@@ -429,6 +572,37 @@
     }
   }
 
+  virtual void BasicRateTargetingSVC1TL3SLMultiRefTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 500, 1000 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    multi_ref_ = 1;
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 3;
+    target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
   virtual void BasicRateTargetingSVC3TL3SLTest() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 500;
@@ -438,7 +612,7 @@
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
 
     ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
                                          288, 30, 1, 0, 300);
@@ -480,7 +654,7 @@
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
 
     ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
     const int bitrate_array[2] = { 600, 1200 };
@@ -507,7 +681,49 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLHDMultiRefTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    multi_ref_ = 1;
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
           << " The datarate for the file is greater than target by too much!";
     }
   }
@@ -521,7 +737,7 @@
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
     cfg_.kf_mode = AOM_KF_AUTO;
     cfg_.kf_min_dist = cfg_.kf_max_dist = 100;
 
@@ -556,6 +772,231 @@
     }
   }
 
+  virtual void BasicRateTargeting444SVC3TL3SLTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = 1;
+
+    ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLMultiRefDropAllEnhTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // error_resilient can set to off/0, since for SVC the context update
+    // is done per-layer.
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    multi_ref_ = 1;
+    // Drop TL1 and TL2: #frames(300) - #TL0.
+    drop_frames_ = 300 - 300 / 4;
+    int n = 0;
+    for (int i = 0; i < 300; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+      }
+    }
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLDropAllEnhTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // error_resilient can set to off/0, since for SVC the context update
+    // is done per-layer.
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // Drop TL1 and TL2: #frames(300) - #TL0.
+    drop_frames_ = 300 - 300 / 4;
+    int n = 0;
+    for (int i = 0; i < 300; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+      }
+    }
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLDropTL2EnhTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // error_resilient for sequence can be off/0, since dropped frames (TL2)
+    // are non-reference frames.
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // Drop TL2: #frames(300) - (#TL0 + #TL1).
+    drop_frames_ = 300 - 300 / 2;
+    int n = 0;
+    for (int i = 0; i < 300; i++) {
+      if (i % 2 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+      }
+    }
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLDropAllEnhFrameERTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // Set error_resilience at frame level, with codec control,
+    // on/1 for enahancement layers and off/0 for base layer frames.
+    set_frame_level_er_ = 1;
+
+    // Drop TL1 and TL2: #frames(300) - #TL0.
+    drop_frames_ = 300 - 300 / 4;
+    int n = 0;
+    for (int i = 0; i < 300; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+      }
+    }
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
   int layer_frame_cnt_;
   int superframe_cnt_;
   int number_temporal_layers_;
@@ -566,6 +1007,13 @@
   aom_svc_ref_frame_config_t ref_frame_config_;
   aom_svc_layer_id_t layer_id_;
   double effective_datarate_tl[AOM_MAX_LAYERS];
+  unsigned int drop_frames_;
+  unsigned int drop_frames_list_[1000];
+  unsigned int mismatch_nframes_;
+  unsigned int decoded_nframes_;
+  double mismatch_psnr_;
+  int set_frame_level_er_;
+  int multi_ref_;
 };
 
 // Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial.
@@ -573,6 +1021,13 @@
   BasicRateTargetingSVC3TL1SLTest();
 }
 
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial,
+// with dynamic resize on. Encode at very low bitrate and check that
+// there is at least one resize (down) event.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLResize) {
+  BasicRateTargetingSVC3TL1SLResizeTest();
+}
+
 // Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SL) {
   BasicRateTargetingSVC1TL2SLTest();
@@ -583,6 +1038,12 @@
   BasicRateTargetingSVC1TL3SLTest();
 }
 
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal,
+// with additional temporal reference for top spatial layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLMultiRef) {
+  BasicRateTargetingSVC1TL3SLMultiRefTest();
+}
+
 // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SL) {
   BasicRateTargetingSVC3TL3SLTest();
@@ -594,16 +1055,63 @@
 }
 
 // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// with additional temporal reference for top spatial layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiRef) {
+  BasicRateTargetingSVC3TL3SLHDMultiRefTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
 // for auto key frame mode with short key frame period.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLKf) {
   BasicRateTargetingSVC3TL3SLKfTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestSVC,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(7, 9),
-                          ::testing::Range<unsigned int>(0, 4),
-                          ::testing::Values(0, 1));
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for 4:4:4 input.
+TEST_P(DatarateTestSVC, BasicRateTargeting444SVC3TL3SL) {
+  BasicRateTargeting444SVC3TL3SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Check that the base
+// layer (TL0) can still be decodeable (with no mismatch) with the
+// error_resilient flag set to 0. This test used the pattern with multiple
+// references (last, golden, and altref), updated on base layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLMultiRefDropAllEnh) {
+  BasicRateTargetingSVC3TL1SLMultiRefDropAllEnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Check that the base
+// layer (TL0) can still be decodeable (with no mismatch) with the
+// error_resilient flag set to 0.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropAllEnh) {
+  BasicRateTargetingSVC3TL1SLDropAllEnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of the TL2 enhancement layer, which are non-reference
+// (droppble) frames. For the base layer (TL0) and TL1 to still be decodeable
+// (with no mismatch), the error_resilient_flag may be off (set to 0),
+// since TL2 are non-reference frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropTL2Enh) {
+  BasicRateTargetingSVC3TL1SLDropTL2EnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Test that the
+// error_resilient flag can be set at frame level, with on/1 on
+// enhancement layers and off/0 on base layer.
+// This allows for successful decoding after dropping enhancement layer frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropAllEnhFrameER) {
+  BasicRateTargetingSVC3TL1SLDropAllEnhFrameERTest();
+}
+
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestSVC,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(7, 10),
+                           ::testing::Range<unsigned int>(0, 4),
+                           ::testing::Values(0, 1));
 
 }  // namespace
 }  // namespace datarate_test
diff --git a/test/temporal_filter_planewise_test.cc b/test/temporal_filter_planewise_test.cc
deleted file mode 100644
index c3f3e9e..0000000
--- a/test/temporal_filter_planewise_test.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <cmath>
-#include <cstdlib>
-#include <string>
-#include <tuple>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "test/function_equivalence_test.h"
-
-using libaom_test::ACMRandom;
-using libaom_test::FunctionEquivalenceTest;
-using ::testing::Combine;
-using ::testing::Range;
-using ::testing::Values;
-using ::testing::ValuesIn;
-
-#if !CONFIG_REALTIME_ONLY
-namespace {
-
-typedef void (*TemporalFilterPlanewiseFunc)(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_level, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count);
-typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
-    TemporalFilterPlanewiseFuncParam;
-
-typedef std::tuple<TemporalFilterPlanewiseFuncParam, int>
-    TemporalFilterPlanewiseWithParam;
-
-class TemporalFilterPlanewiseTest
-    : public ::testing::TestWithParam<TemporalFilterPlanewiseWithParam> {
- public:
-  virtual ~TemporalFilterPlanewiseTest() {}
-  virtual void SetUp() {
-    params_ = GET_PARAM(0);
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    src1_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
-    src2_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
-
-    ASSERT_TRUE(src1_ != NULL);
-    ASSERT_TRUE(src2_ != NULL);
-  }
-
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src1_);
-    aom_free(src2_);
-  }
-  void RunTest(int isRandom, int width, int height, int run_times);
-
-  void GenRandomData(int width, int height, int stride, int stride2) {
-    for (int ii = 0; ii < height; ii++) {
-      for (int jj = 0; jj < width; jj++) {
-        src1_[ii * stride + jj] = rnd_.Rand8();
-        src2_[ii * stride2 + jj] = rnd_.Rand8();
-      }
-    }
-  }
-
-  void GenExtremeData(int width, int height, int stride, uint8_t *data,
-                      int stride2, uint8_t *data2, uint8_t val) {
-    for (int ii = 0; ii < height; ii++) {
-      for (int jj = 0; jj < width; jj++) {
-        data[ii * stride + jj] = val;
-        data2[ii * stride2 + jj] = (255 - val);
-      }
-    }
-  }
-
- protected:
-  TemporalFilterPlanewiseFuncParam params_;
-  uint8_t *src1_;
-  uint8_t *src2_;
-  ACMRandom rnd_;
-};
-
-void TemporalFilterPlanewiseTest::RunTest(int isRandom, int width, int height,
-                                          int run_times) {
-  aom_usec_timer ref_timer, test_timer;
-  for (int k = 0; k < 3; k++) {
-    const int stride = width;
-    const int stride2 = width;
-    if (isRandom) {
-      GenRandomData(width, height, stride, stride2);
-    } else {
-      const int msb = 8;  // Up to 8 bit input
-      const int limit = (1 << msb) - 1;
-      if (k == 0) {
-        GenExtremeData(width, height, stride, src1_, stride2, src2_, limit);
-      } else {
-        GenExtremeData(width, height, stride, src1_, stride2, src2_, 0);
-      }
-    }
-    double sigma[1] = { 2.1002103677063437 };
-    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
-    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
-    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
-    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
-    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
-    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
-    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
-    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
-
-    assert(width == 32 && height == 32);
-    const BLOCK_SIZE block_size = BLOCK_32X32;
-    const int use_subblock = 0;
-    const int block_mse = 20;
-    const int subblock_mses[4] = { 15, 16, 17, 18 };
-    const int q_factor = 12;
-    const int mb_row = 0;
-    const int mb_col = 0;
-    const int num_planes = 1;
-    YV12_BUFFER_CONFIG *ref_frame =
-        (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-    ref_frame->heights[0] = height;
-    ref_frame->strides[0] = stride;
-    DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
-    ref_frame->buffer_alloc = src;
-    ref_frame->buffers[0] = ref_frame->buffer_alloc;
-    ref_frame->flags = 0;  // Only support low bit-depth test.
-    memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
-
-    MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-    mbd->plane[0].subsampling_y = 0;
-    mbd->plane[0].subsampling_x = 0;
-    mbd->bd = 8;
-
-    params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
-                     src2_, accumulator_ref, count_ref);
-    params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
-                     src2_, accumulator_mod, count_mod);
-
-    if (run_times > 1) {
-      aom_usec_timer_start(&ref_timer);
-      for (int j = 0; j < run_times; j++) {
-        params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, use_subblock, block_mse, subblock_mses,
-                         q_factor, src2_, accumulator_ref, count_ref);
-      }
-      aom_usec_timer_mark(&ref_timer);
-      const int elapsed_time_c =
-          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
-
-      aom_usec_timer_start(&test_timer);
-      for (int j = 0; j < run_times; j++) {
-        params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, use_subblock, block_mse, subblock_mses,
-                         q_factor, src2_, accumulator_mod, count_mod);
-      }
-      aom_usec_timer_mark(&test_timer);
-      const int elapsed_time_simd =
-          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
-
-      printf(
-          "c_time=%d \t simd_time=%d \t "
-          "gain=%f\t width=%d\t height=%d \n",
-          elapsed_time_c, elapsed_time_simd,
-          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
-          height);
-
-    } else {
-      for (int i = 0, l = 0; i < height; i++) {
-        for (int j = 0; j < width; j++, l++) {
-          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
-              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
-              << "] C accumulator does not match optimized accumulator.";
-          EXPECT_EQ(count_ref[l], count_mod[l])
-              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
-              << "] C count does not match optimized count.";
-        }
-      }
-    }
-
-    free(ref_frame);
-    free(mbd);
-  }
-}
-
-TEST_P(TemporalFilterPlanewiseTest, OperationCheck) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(1, height, height, 1);  // GenRandomData
-  }
-}
-
-TEST_P(TemporalFilterPlanewiseTest, ExtremeValues) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(0, height, height, 1);
-  }
-}
-
-TEST_P(TemporalFilterPlanewiseTest, DISABLED_Speed) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(1, height, height, 100000);
-  }
-}
-
-#if HAVE_AVX2
-TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_avx2[] = {
-  TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
-                                   &av1_apply_temporal_filter_planewise_avx2)
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterPlanewiseTest,
-                         Combine(ValuesIn(temporal_filter_planewise_test_avx2),
-                                 Range(64, 65, 4)));
-#endif  // HAVE_AVX2
-
-#if HAVE_SSE2
-TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_sse2[] = {
-  TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
-                                   &av1_apply_temporal_filter_planewise_sse2)
-};
-INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterPlanewiseTest,
-                         Combine(ValuesIn(temporal_filter_planewise_test_sse2),
-                                 Range(64, 65, 4)));
-#endif  // HAVE_SSE2
-
-}  // namespace
-#endif
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
new file mode 100644
index 0000000..d665c85
--- /dev/null
+++ b/test/temporal_filter_test.cc
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if !CONFIG_REALTIME_ONLY
+namespace {
+typedef enum {
+  I400,  // Monochrome
+  I420,  // 4:2:0
+  I422,  // 4:2:2
+  I444,  // 4:4:4
+} ColorFormat;
+static const char *color_fmt_str[] = { "I400", "I420", "I422", "I444" };
+typedef void (*TemporalFilterFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_level, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strenght,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+typedef libaom_test::FuncParam<TemporalFilterFunc> TemporalFilterFuncParam;
+
+typedef std::tuple<TemporalFilterFuncParam, int> TemporalFilterWithParam;
+
+class TemporalFilterTest
+    : public ::testing::TestWithParam<TemporalFilterWithParam> {
+ public:
+  virtual ~TemporalFilterTest() {}
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src1_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(8, sizeof(uint8_t) * MAX_MB_PLANE * BH * BW));
+    src2_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(8, sizeof(uint8_t) * MAX_MB_PLANE * BH * BW));
+
+    ASSERT_TRUE(src1_ != NULL);
+    ASSERT_TRUE(src2_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src1_);
+    aom_free(src2_);
+  }
+  void RunTest(int isRandom, int run_times, ColorFormat color_fmt);
+
+  void GenRandomData(int width, int height, int stride, int stride2,
+                     int num_planes, int subsampling_x, int subsampling_y) {
+    uint8_t *src1p = src1_;
+    uint8_t *src2p = src2_;
+    for (int plane = 0; plane < num_planes; plane++) {
+      int plane_w = plane ? width >> subsampling_x : width;
+      int plane_h = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+      for (int ii = 0; ii < plane_h; ii++) {
+        for (int jj = 0; jj < plane_w; jj++) {
+          src1p[jj] = rnd_.Rand8();
+          src2p[jj] = rnd_.Rand8();
+        }
+        src1p += plane_stride;
+        src2p += plane_stride2;
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, int stride2,
+                      int num_planes, int subsampling_x, int subsampling_y,
+                      uint8_t val) {
+    uint8_t *src1p = src1_;
+    uint8_t *src2p = src2_;
+    for (int plane = 0; plane < num_planes; plane++) {
+      int plane_w = plane ? width >> subsampling_x : width;
+      int plane_h = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+      for (int ii = 0; ii < plane_h; ii++) {
+        for (int jj = 0; jj < plane_w; jj++) {
+          src1p[jj] = val;
+          src2p[jj] = (255 - val);
+        }
+        src1p += plane_stride;
+        src2p += plane_stride2;
+      }
+    }
+  }
+
+ protected:
+  TemporalFilterFuncParam params_;
+  uint8_t *src1_;
+  uint8_t *src2_;
+  ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(TemporalFilterTest);
+
+void TemporalFilterTest::RunTest(int isRandom, int run_times,
+                                 ColorFormat color_fmt) {
+  aom_usec_timer ref_timer, test_timer;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int width = block_size_wide[block_size];
+  const int height = block_size_high[block_size];
+  int num_planes = MAX_MB_PLANE;
+  int subsampling_x = 0;
+  int subsampling_y = 0;
+  if (color_fmt == I420) {
+    subsampling_x = 1;
+    subsampling_y = 1;
+  } else if (color_fmt == I422) {
+    subsampling_x = 1;
+    subsampling_y = 0;
+  } else if (color_fmt == I400) {
+    num_planes = 1;
+  }
+  for (int k = 0; k < 3; k++) {
+    const int stride = width;
+    const int stride2 = width;
+    if (isRandom) {
+      GenRandomData(width, height, stride, stride2, num_planes, subsampling_x,
+                    subsampling_y);
+    } else {
+      const int msb = 8;  // Up to 8 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, stride2, num_planes,
+                       subsampling_x, subsampling_y, limit);
+      } else {
+        GenExtremeData(width, height, stride, stride2, num_planes,
+                       subsampling_x, subsampling_y, 0);
+      }
+    }
+    double sigma[MAX_MB_PLANE] = { 2.1002103677063437, 2.1002103677063437,
+                                   2.1002103677063437 };
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+    assert(width == 32 && height == 32);
+    const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
+    const int subblock_mses[4] = { 15, 16, 17, 18 };
+    const int q_factor = 12;
+    const int filter_strength = 5;
+    const int mb_row = 0;
+    const int mb_col = 0;
+    YV12_BUFFER_CONFIG *ref_frame =
+        (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
+    ref_frame->y_crop_height = 360;
+    ref_frame->y_crop_width = 540;
+    ref_frame->heights[PLANE_TYPE_Y] = height;
+    ref_frame->heights[PLANE_TYPE_UV] = height >> subsampling_y;
+    ref_frame->strides[PLANE_TYPE_Y] = stride;
+    ref_frame->strides[PLANE_TYPE_UV] = stride >> subsampling_x;
+    DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
+    ref_frame->buffer_alloc = src;
+    ref_frame->flags = 0;  // Only support low bit-depth test.
+    memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
+
+    MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
+    mbd->bd = 8;
+    for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+      int plane_height = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      ref_frame->buffers[plane] =
+          ref_frame->buffer_alloc + plane * plane_stride * plane_height;
+      mbd->plane[plane].subsampling_x = plane ? subsampling_x : 0;
+      mbd->plane[plane].subsampling_y = plane ? subsampling_y : 0;
+    }
+
+    params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                     sigma, subblock_mvs, subblock_mses, q_factor,
+                     filter_strength, src2_, accumulator_ref, count_ref);
+    params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                     sigma, subblock_mvs, subblock_mses, q_factor,
+                     filter_strength, src2_, accumulator_mod, count_mod);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                         sigma, subblock_mvs, subblock_mses, q_factor,
+                         filter_strength, src2_, accumulator_ref, count_ref);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                         sigma, subblock_mvs, subblock_mses, q_factor,
+                         filter_strength, src2_, accumulator_mod, count_mod);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%f\t width=%d\t height=%d\t color_format=%s\n",
+          elapsed_time_c, elapsed_time_simd,
+          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+          height, color_fmt_str[color_fmt]);
+
+    } else {
+      for (int i = 0, l = 0; i < height; i++) {
+        for (int j = 0; j < width; j++, l++) {
+          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] " << color_fmt_str[color_fmt]
+              << " C accumulator does not match optimized accumulator.";
+          EXPECT_EQ(count_ref[l], count_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] " << color_fmt_str[color_fmt]
+              << " count does not match optimized count.";
+        }
+      }
+    }
+
+    free(ref_frame);
+    free(mbd);
+  }
+}
+
+TEST_P(TemporalFilterTest, OperationCheck) {
+  RunTest(1, 1, I400);
+  RunTest(1, 1, I420);
+  RunTest(1, 1, I422);
+  RunTest(1, 1, I444);
+}
+
+TEST_P(TemporalFilterTest, ExtremeValues) {
+  RunTest(0, 1, I400);
+  RunTest(0, 1, I420);
+  RunTest(0, 1, I422);
+  RunTest(0, 1, I444);
+}
+
+TEST_P(TemporalFilterTest, DISABLED_Speed) {
+  RunTest(1, 100000, I400);
+  RunTest(1, 100000, I420);
+  RunTest(1, 100000, I422);
+  RunTest(1, 100000, I444);
+}
+
+#if HAVE_AVX2
+TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_avx2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_AVX2
+
+#if HAVE_SSE2
+TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_sse2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_SSE2
+#if CONFIG_AV1_HIGHBITDEPTH
+
+typedef void (*HBDTemporalFilterFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_level, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strenght,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+typedef libaom_test::FuncParam<HBDTemporalFilterFunc>
+    HBDTemporalFilterFuncParam;
+
+typedef std::tuple<HBDTemporalFilterFuncParam, int> HBDTemporalFilterWithParam;
+
+class HBDTemporalFilterTest
+    : public ::testing::TestWithParam<HBDTemporalFilterWithParam> {
+ public:
+  virtual ~HBDTemporalFilterTest() {}
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src1_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_MB_PLANE * BH * BW));
+    src2_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_MB_PLANE * BH * BW));
+
+    ASSERT_TRUE(src1_ != NULL);
+    ASSERT_TRUE(src2_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src1_);
+    aom_free(src2_);
+  }
+  void RunTest(int isRandom, int run_times, int bd, ColorFormat color_fmt);
+
+  void GenRandomData(int width, int height, int stride, int stride2, int bd,
+                     int subsampling_x, int subsampling_y, int num_planes) {
+    uint16_t *src1p = src1_;
+    uint16_t *src2p = src2_;
+    for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+      int plane_w = plane ? width >> subsampling_x : width;
+      int plane_h = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+      const uint16_t max_val = (1 << bd) - 1;
+      for (int ii = 0; ii < plane_h; ii++) {
+        for (int jj = 0; jj < plane_w; jj++) {
+          src1p[jj] = rnd_.Rand16() & max_val;
+          src2p[jj] = rnd_.Rand16() & max_val;
+        }
+        src1p += plane_stride;
+        src2p += plane_stride2;
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, int stride2, int bd,
+                      int subsampling_x, int subsampling_y, int num_planes,
+                      uint16_t val) {
+    uint16_t *src1p = src1_;
+    uint16_t *src2p = src2_;
+    for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+      int plane_w = plane ? width >> subsampling_x : width;
+      int plane_h = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+      uint16_t max_val = (1 << bd) - 1;
+      for (int ii = 0; ii < plane_h; ii++) {
+        for (int jj = 0; jj < plane_w; jj++) {
+          src1p[jj] = val;
+          src2p[jj] = (max_val - val);
+        }
+        src1p += plane_stride;
+        src2p += plane_stride2;
+      }
+    }
+  }
+
+ protected:
+  HBDTemporalFilterFuncParam params_;
+  uint16_t *src1_;
+  uint16_t *src2_;
+  ACMRandom rnd_;
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HBDTemporalFilterTest);
+
+void HBDTemporalFilterTest::RunTest(int isRandom, int run_times, int BD,
+                                    ColorFormat color_fmt) {
+  aom_usec_timer ref_timer, test_timer;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int width = block_size_wide[block_size];
+  const int height = block_size_high[block_size];
+  int num_planes = MAX_MB_PLANE;
+  int subsampling_x = 0;
+  int subsampling_y = 0;
+  if (color_fmt == I420) {
+    subsampling_x = 1;
+    subsampling_y = 1;
+  } else if (color_fmt == I422) {
+    subsampling_x = 1;
+    subsampling_y = 0;
+  } else if (color_fmt == I400) {
+    num_planes = 1;
+  }
+  for (int k = 0; k < 3; k++) {
+    const int stride = width;
+    const int stride2 = width;
+    if (isRandom) {
+      GenRandomData(width, height, stride, stride2, BD, subsampling_x,
+                    subsampling_y, num_planes);
+    } else {
+      const int msb = BD;
+      const uint16_t limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, stride2, BD, subsampling_x,
+                       subsampling_y, num_planes, limit);
+      } else {
+        GenExtremeData(width, height, stride, stride2, BD, subsampling_x,
+                       subsampling_y, num_planes, 0);
+      }
+    }
+    double sigma[MAX_MB_PLANE] = { 2.1002103677063437, 2.1002103677063437,
+                                   2.1002103677063437 };
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+    assert(width == 32 && height == 32);
+    const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
+    const int subblock_mses[4] = { 15, 16, 17, 18 };
+    const int q_factor = 12;
+    const int filter_strength = 5;
+    const int mb_row = 0;
+    const int mb_col = 0;
+    YV12_BUFFER_CONFIG *ref_frame =
+        (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
+    ref_frame->y_crop_height = 360;
+    ref_frame->y_crop_width = 540;
+    ref_frame->heights[PLANE_TYPE_Y] = height;
+    ref_frame->heights[PLANE_TYPE_UV] = height >> subsampling_y;
+    ref_frame->strides[PLANE_TYPE_Y] = stride;
+    ref_frame->strides[PLANE_TYPE_UV] = stride >> subsampling_x;
+    DECLARE_ALIGNED(16, uint16_t, src[1024 * 3]);
+    ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src);
+    ref_frame->flags = YV12_FLAG_HIGHBITDEPTH;  // Only Hihgbd bit-depth test.
+    memcpy(src, src1_, 1024 * 3 * sizeof(uint16_t));
+
+    MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
+    mbd->bd = BD;
+    for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+      int plane_height = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      ref_frame->buffers[plane] =
+          ref_frame->buffer_alloc + plane * plane_stride * plane_height;
+      mbd->plane[plane].subsampling_x = plane ? subsampling_x : 0;
+      mbd->plane[plane].subsampling_y = plane ? subsampling_y : 0;
+    }
+
+    params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                     sigma, subblock_mvs, subblock_mses, q_factor,
+                     filter_strength, CONVERT_TO_BYTEPTR(src2_),
+                     accumulator_ref, count_ref);
+    params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                     sigma, subblock_mvs, subblock_mses, q_factor,
+                     filter_strength, CONVERT_TO_BYTEPTR(src2_),
+                     accumulator_mod, count_mod);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                         sigma, subblock_mvs, subblock_mses, q_factor,
+                         filter_strength, CONVERT_TO_BYTEPTR(src2_),
+                         accumulator_ref, count_ref);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                         sigma, subblock_mvs, subblock_mses, q_factor,
+                         filter_strength, CONVERT_TO_BYTEPTR(src2_),
+                         accumulator_mod, count_mod);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%f\t width=%d\t height=%d\t color_format=%s\n",
+          elapsed_time_c, elapsed_time_simd,
+          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+          height, color_fmt_str[color_fmt]);
+
+    } else {
+      for (int i = 0, l = 0; i < height; i++) {
+        for (int j = 0; j < width; j++, l++) {
+          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] " << color_fmt_str[color_fmt]
+              << " C accumulator does not match optimized accumulator.";
+          EXPECT_EQ(count_ref[l], count_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] " << color_fmt_str[color_fmt]
+              << " C count does not match optimized count.";
+        }
+      }
+    }
+
+    free(ref_frame);
+    free(mbd);
+  }
+}
+
+TEST_P(HBDTemporalFilterTest, OperationCheck) {
+  RunTest(1, 1, 10, I400);
+  RunTest(1, 1, 10, I420);
+  RunTest(1, 1, 10, I422);
+  RunTest(1, 1, 10, I444);
+}
+
+TEST_P(HBDTemporalFilterTest, ExtremeValues) {
+  RunTest(0, 1, 10, I400);
+  RunTest(0, 1, 10, I420);
+  RunTest(0, 1, 10, I422);
+  RunTest(0, 1, 10, I444);
+}
+
+TEST_P(HBDTemporalFilterTest, DISABLED_Speed) {
+  RunTest(1, 100000, 10, I400);
+  RunTest(1, 100000, 10, I420);
+  RunTest(1, 100000, 10, I422);
+  RunTest(1, 100000, 10, I444);
+}
+#if HAVE_SSE2
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = {
+  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+                             &av1_highbd_apply_temporal_filter_sse2)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
+                         Combine(ValuesIn(HBDtemporal_filter_test_sse2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_SSE2
+#if HAVE_AVX2
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
+  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+                             &av1_highbd_apply_temporal_filter_avx2)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
+                         Combine(ValuesIn(HBDtemporal_filter_test_avx2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_AVX2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}  // namespace
+#endif
diff --git a/test/temporal_filter_yuv_test.cc b/test/temporal_filter_yuv_test.cc
deleted file mode 100644
index dc17aaa..0000000
--- a/test/temporal_filter_yuv_test.cc
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <ostream>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/av1_rtcd.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-
-namespace {
-
-using ::libaom_test::ACMRandom;
-
-const int MAX_WIDTH = 32;
-const int MAX_HEIGHT = 32;
-
-typedef void (*TemporalFilterYUVFunc)(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *blk_fw, const uint8_t *pred, uint32_t *accum, uint16_t *count);
-
-struct TemporalFilterWithBd {
-  TemporalFilterWithBd(TemporalFilterYUVFunc func, int bitdepth)
-      : temporal_filter(func), bd(bitdepth) {}
-
-  TemporalFilterYUVFunc temporal_filter;
-  int bd;
-};
-
-std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
-  return os << "Bitdepth: " << tf.bd;
-}
-
-int GetFilterWeight(unsigned int row, unsigned int col,
-                    unsigned int block_height, unsigned int block_width,
-                    const int *const blk_fw, int use_32x32) {
-  if (use_32x32) {
-    return blk_fw[0];
-  }
-
-  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
-}
-
-template <typename PixelType>
-int GetModIndex(int sum_dist, int index, int rounding, int strength,
-                int filter_weight) {
-  int mod = sum_dist * 3 / index;
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-// Lowbitdepth version
-template <>
-int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
-                         int filter_weight) {
-  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
-                                  39322, 32768, 28087, 24576, 21846,
-                                  19661, 17874, 0,     15124 };
-
-  assert(index >= 0 && index <= 13);
-  assert(index_mult[index] != 0);
-
-  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-// Highbitdepth version
-template <>
-int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
-                          int filter_weight) {
-  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
-                             3221225472U, 2576980378U, 2147483648U, 1840700270U,
-                             1610612736U, 1431655766U, 1288490189U, 1171354718U,
-                             0U,          991146300U };
-
-  assert(index >= 0 && index <= 13);
-  assert(index_mult[index] != 0);
-
-  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-template <typename PixelType>
-void SetArray(PixelType *pixel_array, int width, int height, int stride,
-              int val) {
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      pixel_array[col] = val;
-    }
-    pixel_array += stride;
-  }
-}
-
-template <typename PixelType>
-void SetArray(PixelType *pixel_array, int width, int height, int stride,
-              ACMRandom *rnd, int low_val, int high_val) {
-  EXPECT_LE(low_val, high_val);
-
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      const int val =
-          static_cast<int>((*rnd).PseudoUniform(high_val - low_val));
-      pixel_array[col] = low_val + val;
-    }
-    pixel_array += stride;
-  }
-}
-
-template <typename ValueType>
-bool CheckArrayEqual(const ValueType *arr_1, const ValueType *arr_2, int width,
-                     int height, int stride_1, int stride_2) {
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      if (arr_1[col] != arr_2[col]) {
-        return false;
-      }
-    }
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-  return true;
-}
-
-template <typename ValueType>
-void PrintArrayDiff(const ValueType *arr_1, const ValueType *arr_2, int width,
-                    int height, int stride_1, int stride_2) {
-  const ValueType *arr_1_start = arr_1, *arr_2_start = arr_2;
-
-  printf("Array 1:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      if (arr_1[col] != arr_2[col]) {
-        printf("*%3d", arr_1[col]);
-      } else {
-        printf("%4d", arr_1[col]);
-      }
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-
-  arr_1 = arr_1_start;
-  arr_2 = arr_2_start;
-
-  printf("Array 2:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      if (arr_1[col] != arr_2[col]) {
-        printf("*%3d", arr_2[col]);
-      } else {
-        printf("%4d", arr_2[col]);
-      }
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-
-  arr_1 = arr_1_start;
-  arr_2 = arr_2_start;
-  printf("Difference:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      printf("%4d", arr_1[col] - arr_2[col]);
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-}
-
-template <typename PixelType>
-void ApplyReferenceFilter(const PixelType *y_src, const PixelType *y_pre,
-                          const PixelType *u_src, const PixelType *v_src,
-                          const PixelType *u_pre, const PixelType *v_pre,
-                          unsigned int block_width, unsigned int block_height,
-                          int ss_x, int ss_y, int strength,
-                          const int *const blk_fw, int use_32x32,
-                          uint32_t *y_accum, uint16_t *y_count,
-                          uint32_t *u_accum, uint16_t *u_count,
-                          uint32_t *v_accum, uint16_t *v_count) {
-  const int uv_block_width = block_width >> ss_x,
-            uv_block_height = block_height >> ss_y;
-  const int y_src_stride = block_width, y_pre_stride = block_width;
-  const int uv_src_stride = uv_block_width, uv_pre_stride = uv_block_width;
-  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
-  const int y_count_stride = block_width, u_count_stride = uv_block_width,
-            v_count_stride = uv_block_width;
-  const int y_accum_stride = block_width, u_accum_stride = uv_block_width,
-            v_accum_stride = uv_block_width;
-
-  int y_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  int u_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  int v_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  const int rounding = (1 << strength) >> 1;
-
-  // Get the square diffs
-  for (int row = 0; row < (int)block_height; row++) {
-    for (int col = 0; col < (int)block_width; col++) {
-      const int diff =
-          y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col];
-      y_dif[row * y_diff_stride + col] = diff * diff;
-    }
-  }
-
-  for (int row = 0; row < (int)uv_block_height; row++) {
-    for (int col = 0; col < (int)uv_block_width; col++) {
-      const int u_diff =
-          u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col];
-      const int v_diff =
-          v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col];
-      u_dif[row * uv_diff_stride + col] = u_diff * u_diff;
-      v_dif[row * uv_diff_stride + col] = v_diff * v_diff;
-    }
-  }
-
-  // Apply the filter to luma
-  for (int row = 0; row < (int)block_height; row++) {
-    for (int col = 0; col < (int)block_width; col++) {
-      const int uv_row = row >> ss_y;
-      const int uv_col = col >> ss_x;
-      const int filter_weight = GetFilterWeight(row, col, block_height,
-                                                block_width, blk_fw, use_32x32);
-
-      // First we get the modifier for the current y pixel
-      const int y_pixel = y_pre[row * y_pre_stride + col];
-      int y_num_used = 0;
-      int y_mod = 0;
-
-      // Sum the neighboring 3x3 y pixels
-      for (int row_step = -1; row_step <= 1; row_step++) {
-        for (int col_step = -1; col_step <= 1; col_step++) {
-          const int sub_row = row + row_step;
-          const int sub_col = col + col_step;
-
-          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
-              sub_col < (int)block_width) {
-            y_mod += y_dif[sub_row * y_diff_stride + sub_col];
-            y_num_used++;
-          }
-        }
-      }
-
-      // Sum the corresponding uv pixels to the current y modifier
-      // Note we are rounding down instead of rounding to the nearest pixel.
-      y_mod += u_dif[uv_row * uv_diff_stride + uv_col];
-      y_mod += v_dif[uv_row * uv_diff_stride + uv_col];
-
-      y_num_used += 2;
-
-      // Set the modifier
-      y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
-                                     filter_weight);
-
-      // Accumulate the result
-      y_count[row * y_count_stride + col] += y_mod;
-      y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
-    }
-  }
-
-  // Apply the filter to chroma
-  for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) {
-    for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) {
-      const int y_row = uv_row << ss_y;
-      const int y_col = uv_col << ss_x;
-      const int filter_weight = GetFilterWeight(
-          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
-
-      const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
-      const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
-
-      int uv_num_used = 0;
-      int u_mod = 0, v_mod = 0;
-
-      // Sum the neighboring 3x3 chromal pixels to the chroma modifier
-      for (int row_step = -1; row_step <= 1; row_step++) {
-        for (int col_step = -1; col_step <= 1; col_step++) {
-          const int sub_row = uv_row + row_step;
-          const int sub_col = uv_col + col_step;
-
-          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
-              sub_col < uv_block_width) {
-            u_mod += u_dif[sub_row * uv_diff_stride + sub_col];
-            v_mod += v_dif[sub_row * uv_diff_stride + sub_col];
-            uv_num_used++;
-          }
-        }
-      }
-
-      // Sum all the luma pixels associated with the current luma pixel
-      for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
-        for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
-          const int sub_row = y_row + row_step;
-          const int sub_col = y_col + col_step;
-          const int y_diff = y_dif[sub_row * y_diff_stride + sub_col];
-
-          u_mod += y_diff;
-          v_mod += y_diff;
-          uv_num_used++;
-        }
-      }
-
-      // Set the modifier
-      u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
-                                     filter_weight);
-      v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
-                                     filter_weight);
-
-      // Accumulate the result
-      u_count[uv_row * u_count_stride + uv_col] += u_mod;
-      u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
-      v_count[uv_row * v_count_stride + uv_col] += v_mod;
-      v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
-    }
-  }
-}
-
-class TemporalFilterYUVTest
-    : public ::testing::TestWithParam<TemporalFilterWithBd> {
- public:
-  virtual void SetUp() {
-    filter_func_ = GetParam().temporal_filter;
-    bd_ = GetParam().bd;
-    use_highbd_ = (bd_ != 8);
-
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    saturate_test_ = 0;
-    num_repeats_ = 10;
-
-    ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
-  }
-
- protected:
-  template <typename PixelType>
-  void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
-                            int filter_strength, int use_32x32,
-                            const int *filter_weight);
-  template <typename PixelType>
-  void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
-                              int filter_strength, int use_32x32,
-                              const int *filter_weight);
-  template <typename PixelType>
-  void ApplyTestFilter(const PixelType *y_src, int y_src_stride,
-                       const PixelType *y_pre, int y_pre_stride,
-                       const PixelType *u_src, const PixelType *v_src,
-                       int uv_src_stride, const PixelType *u_pre,
-                       const PixelType *v_pre, int uv_pre_stride,
-                       unsigned int block_width, unsigned int block_height,
-                       int ss_x, int ss_y, int strength, const int *blk_fw,
-                       int use_32x32, uint32_t *y_accum, uint16_t *y_count,
-                       uint32_t *u_accumu, uint16_t *u_count, uint32_t *v_accum,
-                       uint16_t *v_count);
-
-  TemporalFilterYUVFunc filter_func_;
-  ACMRandom rnd_;
-  int saturate_test_;
-  int num_repeats_;
-  int use_highbd_;
-  int bd_;
-};
-
-template <>
-void TemporalFilterYUVTest::ApplyTestFilter<uint8_t>(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
-    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
-    uint32_t *v_accum, uint16_t *v_count) {
-  (void)block_width;
-  (void)block_height;
-  (void)y_src_stride;
-  (void)uv_src_stride;
-
-  assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
-  assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
-  const BLOCK_SIZE block_size = BLOCK_32X32;
-  const int num_planes = 3;
-  const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
-  const int mb_row = 0;
-  const int mb_col = 0;
-  const int use_subblock = !(use_32x32);
-
-  YV12_BUFFER_CONFIG *ref_frame =
-      (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-  ref_frame->strides[0] = y_pre_stride;
-  ref_frame->strides[1] = uv_pre_stride;
-  const int alloc_size = MAX_MB_PLANE * mb_pels;
-  DECLARE_ALIGNED(16, uint8_t, src[alloc_size]);
-  ref_frame->buffer_alloc = src;
-  ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels;
-  ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels;
-  ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels;
-  ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0;
-
-  MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-  mbd->plane[0].subsampling_y = 0;
-  mbd->plane[0].subsampling_x = 0;
-  mbd->plane[1].subsampling_y = ss_y;
-  mbd->plane[1].subsampling_x = ss_x;
-  mbd->plane[2].subsampling_y = ss_y;
-  mbd->plane[2].subsampling_x = ss_x;
-
-  DECLARE_ALIGNED(16, uint8_t, pred[alloc_size]);
-  DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]);
-  DECLARE_ALIGNED(16, uint16_t, count[alloc_size]);
-  memcpy(src + 0 * mb_pels, y_src, mb_pels * sizeof(uint8_t));
-  memcpy(src + 1 * mb_pels, u_src, mb_pels * sizeof(uint8_t));
-  memcpy(src + 2 * mb_pels, v_src, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 0 * mb_pels, y_pre, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 1 * mb_pels, u_pre, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 2 * mb_pels, v_pre, mb_pels * sizeof(uint8_t));
-  memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t));
-  memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
-
-  ASM_REGISTER_STATE_CHECK(
-      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                   strength, use_subblock, blk_fw, pred, accum, count));
-
-  memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t));
-
-  free(ref_frame);
-  free(mbd);
-}
-
-template <>
-void TemporalFilterYUVTest::ApplyTestFilter<uint16_t>(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
-    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
-    uint32_t *v_accum, uint16_t *v_count) {
-  (void)block_width;
-  (void)block_height;
-  (void)y_src_stride;
-  (void)uv_src_stride;
-
-  assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
-  assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
-  const BLOCK_SIZE block_size = BLOCK_32X32;
-  const int num_planes = 3;
-  const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
-  const int mb_row = 0;
-  const int mb_col = 0;
-  const int use_subblock = !(use_32x32);
-
-  YV12_BUFFER_CONFIG *ref_frame =
-      (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-  ref_frame->strides[0] = y_pre_stride;
-  ref_frame->strides[1] = uv_pre_stride;
-  const int alloc_size = MAX_MB_PLANE * mb_pels;
-  DECLARE_ALIGNED(16, uint16_t, src16[alloc_size]);
-  ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src16);
-  ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels;
-  ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels;
-  ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels;
-  ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0;
-
-  MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-  mbd->plane[0].subsampling_y = 0;
-  mbd->plane[0].subsampling_x = 0;
-  mbd->plane[1].subsampling_y = ss_y;
-  mbd->plane[1].subsampling_x = ss_x;
-  mbd->plane[2].subsampling_y = ss_y;
-  mbd->plane[2].subsampling_x = ss_x;
-
-  DECLARE_ALIGNED(16, uint16_t, pred16[alloc_size]);
-  DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]);
-  DECLARE_ALIGNED(16, uint16_t, count[alloc_size]);
-  memcpy(src16 + 0 * mb_pels, y_src, mb_pels * sizeof(uint16_t));
-  memcpy(src16 + 1 * mb_pels, u_src, mb_pels * sizeof(uint16_t));
-  memcpy(src16 + 2 * mb_pels, v_src, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 0 * mb_pels, y_pre, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 1 * mb_pels, u_pre, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 2 * mb_pels, v_pre, mb_pels * sizeof(uint16_t));
-  memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t));
-  memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
-  const uint8_t *pred = CONVERT_TO_BYTEPTR(pred16);
-
-  ASM_REGISTER_STATE_CHECK(
-      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                   strength, use_subblock, blk_fw, pred, accum, count));
-
-  memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t));
-
-  free(ref_frame);
-  free(mbd);
-}
-
-template <typename PixelType>
-void TemporalFilterYUVTest::CompareTestWithParam(int width, int height,
-                                                 int ss_x, int ss_y,
-                                                 int filter_strength,
-                                                 int use_32x32,
-                                                 const int *filter_weight) {
-  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
-  const int y_stride = width, uv_stride = uv_width;
-
-  DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, y_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, y_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, y_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, y_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  for (int repeats = 0; repeats < num_repeats_; repeats++) {
-    if (saturate_test_) {
-      const int max_val = (1 << bd_) - 1;
-      SetArray(y_src, width, height, y_stride, max_val);
-      SetArray(y_pre, width, height, y_stride, 0);
-      SetArray(u_src, uv_width, uv_height, uv_stride, max_val);
-      SetArray(u_pre, uv_width, uv_height, uv_stride, 0);
-      SetArray(v_src, uv_width, uv_height, uv_stride, max_val);
-      SetArray(v_pre, uv_width, uv_height, uv_stride, 0);
-    } else {
-      const int max_val = 7 << (bd_ - 8);
-      SetArray(y_src, width, height, y_stride, &rnd_, 0, max_val);
-      SetArray(y_pre, width, height, y_stride, &rnd_, 0, max_val);
-      SetArray(u_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(u_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(v_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(v_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-    }
-
-    ApplyReferenceFilter<PixelType>(
-        y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
-        filter_strength, filter_weight, use_32x32, y_accum_ref, y_count_ref,
-        u_accum_ref, u_count_ref, v_accum_ref, v_count_ref);
-
-    ApplyTestFilter(y_src, y_stride, y_pre, y_stride, u_src, v_src, uv_stride,
-                    u_pre, v_pre, uv_stride, width, height, ss_x, ss_y,
-                    filter_strength, filter_weight, use_32x32, y_accum_tst,
-                    y_count_tst, u_accum_tst, u_count_tst, v_accum_tst,
-                    v_count_tst);
-
-    EXPECT_TRUE(CheckArrayEqual(y_accum_tst, y_accum_ref, width, height,
-                                y_stride, y_stride));
-    EXPECT_TRUE(CheckArrayEqual(y_count_tst, y_count_ref, width, height,
-                                y_stride, y_stride));
-    EXPECT_TRUE(CheckArrayEqual(u_accum_tst, u_accum_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(u_count_tst, u_count_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(v_accum_tst, v_accum_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(v_count_tst, v_count_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-
-    if (HasFailure()) {
-      if (use_32x32) {
-        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
-               filter_strength, *filter_weight);
-      } else {
-        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
-               ss_y, filter_strength, filter_weight[0], filter_weight[1],
-               filter_weight[2], filter_weight[3]);
-      }
-
-      PrintArrayDiff(y_accum_ref, y_accum_tst, width, height, y_stride,
-                     y_stride);
-      PrintArrayDiff(y_count_ref, y_count_tst, width, height, y_stride,
-                     y_stride);
-      PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-
-      return;
-    }
-  }
-}
-
-template <typename PixelType>
-void TemporalFilterYUVTest::RunTestFilterWithParam(int width, int height,
-                                                   int ss_x, int ss_y,
-                                                   int filter_strength,
-                                                   int use_32x32,
-                                                   const int *filter_weight) {
-  PixelType y_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType y_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t y_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t y_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  PixelType u_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType u_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t u_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t u_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  PixelType v_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType v_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t v_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t v_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  SetArray(y_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(y_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(u_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(u_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(v_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(v_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-
-  for (int repeats = 0; repeats < num_repeats_; repeats++) {
-    ApplyTestFilter(y_src, MAX_WIDTH, y_pre, MAX_WIDTH, u_src, v_src, MAX_WIDTH,
-                    u_pre, v_pre, MAX_WIDTH, width, height, ss_x, ss_y,
-                    filter_strength, filter_weight, use_32x32, y_accum, y_count,
-                    u_accum, u_count, v_accum, v_count);
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, Use32x32) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 1;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_strength = 0; filter_strength <= 6;
-           filter_strength += 2) {
-        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
-          if (use_highbd_) {
-            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                           adjusted_strength, use_32x32,
-                                           &filter_weight);
-          } else {
-            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                          filter_strength, use_32x32,
-                                          &filter_weight);
-          }
-          ASSERT_FALSE(HasFailure());
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, Use16x16) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 0;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
-        // Set up the filter
-        int filter_weight[4];
-        int filter_idx_cp = filter_idx;
-        for (int idx = 0; idx < 4; idx++) {
-          filter_weight[idx] = filter_idx_cp % 3;
-          filter_idx_cp /= 3;
-        }
-
-        // Test each parameter
-        for (int filter_strength = 0; filter_strength <= 6;
-             filter_strength += 2) {
-          if (use_highbd_) {
-            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                           adjusted_strength, use_32x32,
-                                           filter_weight);
-          } else {
-            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                          filter_strength, use_32x32,
-                                          filter_weight);
-          }
-
-          ASSERT_FALSE(HasFailure());
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, SaturationTest) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 1;
-  const int filter_weight = 1;
-  saturate_test_ = 1;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_strength = 0; filter_strength <= 6;
-           filter_strength += 2) {
-        if (use_highbd_) {
-          const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-          CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                         adjusted_strength, use_32x32,
-                                         &filter_weight);
-        } else {
-          CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                        filter_strength, use_32x32,
-                                        &filter_weight);
-        }
-
-        ASSERT_FALSE(HasFailure());
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, DISABLED_Speed) {
-  const int width = 32, height = 32;
-  num_repeats_ = 1000;
-
-  for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
-    const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
-    for (int ss_x = 0; ss_x <= 1; ss_x++) {
-      for (int ss_y = 0; ss_y <= 1; ss_y++) {
-        for (int filter_idx = 0; filter_idx < num_filter_weights;
-             filter_idx++) {
-          // Set up the filter
-          int filter_weight[4];
-          int filter_idx_cp = filter_idx;
-          for (int idx = 0; idx < 4; idx++) {
-            filter_weight[idx] = filter_idx_cp % 3;
-            filter_idx_cp /= 3;
-          }
-
-          // Test each parameter
-          for (int filter_strength = 0; filter_strength <= 6;
-               filter_strength += 2) {
-            aom_usec_timer timer;
-            aom_usec_timer_start(&timer);
-
-            if (use_highbd_) {
-              RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                               filter_strength, use_32x32,
-                                               filter_weight);
-            } else {
-              RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                              filter_strength, use_32x32,
-                                              filter_weight);
-            }
-
-            aom_usec_timer_mark(&timer);
-            const int elapsed_time =
-                static_cast<int>(aom_usec_timer_elapsed(&timer));
-
-            printf(
-                "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
-                "%d, Strength: %d, Time: %5d\n",
-                bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
-                elapsed_time);
-          }
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    C, TemporalFilterYUVTest,
-    ::testing::Values(
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 8),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 10),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 12)));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, TemporalFilterYUVTest,
-    ::testing::Values(
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 8),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 10),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 12)));
-#endif  // HAVE_SSE4_1
-
-}  // namespace
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 383ae79..b8cd5ce 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -15,33 +15,30 @@
 91a5bedeb4832c1c2900736cc0f644bb63971bbc *invalid-oss-fuzz-10227.ivf
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10227.ivf.res
 b2d0a29a65879436bf483d04865faca7d11cc2ee *invalid-oss-fuzz-10389.ivf
-9655e6275888547ecd1f14e20e08ce4891372e76 *invalid-oss-fuzz-10389.ivf.res
-e5fe0e8984c42d53d4ff734c3fbfd57d5c5c25cf *invalid-oss-fuzz-10389.ivf.res.2
+f4ce175af1d871ed1603c8936f6b78e968f93c85 *invalid-oss-fuzz-10389.ivf.res.4
 11df8e9a068669c678097d460b63609d3da73828 *invalid-oss-fuzz-10555.ivf
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10555.ivf.res
 cf5945085fe85456a1f74bf4cc7998b88b3f4b62 *invalid-oss-fuzz-10705.ivf
 758671858368ffd2a2c0727898de5661f7cf7d68 *invalid-oss-fuzz-10705.ivf.res
 88e29851122cca3f336824f7fa4d9f757f91110c *invalid-oss-fuzz-10723.ivf
-1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-10723.ivf.res
 64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-10723.ivf.res.2
 0784acc8931090ec24eba752d6c27e359e68fe7d *invalid-oss-fuzz-10779.ivf
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-10779.ivf.res
 7d37be9357f89a100ced694aee1ca5a6fad35ba9 *invalid-oss-fuzz-11477.ivf
 15932651aacfc4622f0910f728f3f95e08e1753d *invalid-oss-fuzz-11477.ivf.res
 1674787c38ddf82a2e5c804203f04f56a304e8e0 *invalid-oss-fuzz-11479.ivf
-1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-11479.ivf.res
 64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-11479.ivf.res.2
 b1a45514f0c59be03c9991cd04882426b9b930fa *invalid-oss-fuzz-11523.ivf
-7c44ac1723c14d98bcb888fbf118c959511519ba *invalid-oss-fuzz-11523.ivf.res
 3198c7af55a7d50173ce3c369c0cf2d9cdfface6 *invalid-oss-fuzz-11523.ivf.res.2
 cb445173be760c3554f1740ce4d119f57a7be043 *invalid-oss-fuzz-15363.ivf
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-15363.ivf.res
 5b697360bf0f02de31bae9b8da78e93570958fa4 *invalid-oss-fuzz-16437.ivf
-09d2af8dd22201dd8d48e5dcfcaed281ff9422c7 *invalid-oss-fuzz-16437.ivf.res
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-16437.ivf.res.2
+e821070cea8eb687be102a1a118e0341c2e9df69 *invalid-oss-fuzz-24706.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-24706.ivf.res
 ccbe4081557eb44820a0e6337c4a094421826b9a *invalid-oss-fuzz-9288.ivf
 67c54283fe1a26ccf02cc991e4f9a1eea3ac5e78 *invalid-oss-fuzz-9288.ivf.res
 c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf
-d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9463.ivf.res
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-9463.ivf.res.2
 f448caf378e250b7eea4fa2d1c3cd7ef4a3211ce *invalid-oss-fuzz-9482.ivf
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-9482.ivf.res
@@ -557,3 +554,4 @@
 c58ccf7ff04711acc559c06f0bfce3c5b14800c3 *av1-1-b8-23-film_grain-50.ivf.md5
 2f883c7e11c21a31f79bd9c809541be90b0c7c4a *av1-1-b10-23-film_grain-50.ivf
 83f2094fca597ad38b4fd623b807de1774c53ffb *av1-1-b10-23-film_grain-50.ivf.md5
+644e05c6bc0418a72b86427aa01e8b4ecea85e03 *desktop1.320_180.yuv
diff --git a/test/test.cmake b/test/test.cmake
index d4d3b29..22e5ce8 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -27,11 +27,9 @@
             "${AOM_ROOT}/test/acm_random.h"
             "${AOM_ROOT}/test/aom_integer_test.cc"
             "${AOM_ROOT}/test/av1_config_test.cc"
-            "${AOM_ROOT}/test/blockd_test.cc"
+            "${AOM_ROOT}/test/block_test.cc"
             "${AOM_ROOT}/test/clear_system_state.h"
             "${AOM_ROOT}/test/codec_factory.h"
-            "${AOM_ROOT}/test/decode_test_driver.cc"
-            "${AOM_ROOT}/test/decode_test_driver.h"
             "${AOM_ROOT}/test/function_equivalence_test.h"
             "${AOM_ROOT}/test/log2_test.cc"
             "${AOM_ROOT}/test/md5_helper.h"
@@ -43,6 +41,12 @@
             "${AOM_ROOT}/test/util.h"
             "${AOM_ROOT}/test/video_source.h")
 
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+              "${AOM_ROOT}/test/decode_test_driver.cc"
+              "${AOM_ROOT}/test/decode_test_driver.h")
+endif()
+
 if(CONFIG_INTERNAL_STATS)
   list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
               "${AOM_ROOT}/test/hbd_metrics_test.cc")
@@ -56,7 +60,6 @@
 
 list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
             "${AOM_ROOT}/test/active_map_test.cc"
-            "${AOM_ROOT}/test/altref_test.cc"
             "${AOM_ROOT}/test/aq_segment_test.cc"
             "${AOM_ROOT}/test/borders_test.cc"
             "${AOM_ROOT}/test/cpu_speed_test.cc"
@@ -64,20 +67,17 @@
             "${AOM_ROOT}/test/datarate_test.h"
             "${AOM_ROOT}/test/svc_datarate_test.cc"
             "${AOM_ROOT}/test/encode_api_test.cc"
+            "${AOM_ROOT}/test/encode_small_width_height_test.cc"
             "${AOM_ROOT}/test/encode_test_driver.cc"
             "${AOM_ROOT}/test/encode_test_driver.h"
             "${AOM_ROOT}/test/end_to_end_test.cc"
-            "${AOM_ROOT}/test/fwd_kf_test.cc"
             "${AOM_ROOT}/test/gf_pyr_height_test.cc"
             "${AOM_ROOT}/test/rt_end_to_end_test.cc"
-            "${AOM_ROOT}/test/error_resilience_test.cc"
             "${AOM_ROOT}/test/frame_size_tests.cc"
             "${AOM_ROOT}/test/horz_superres_test.cc"
             "${AOM_ROOT}/test/i420_video_source.h"
             "${AOM_ROOT}/test/level_test.cc"
-            "${AOM_ROOT}/test/lossless_test.cc"
             "${AOM_ROOT}/test/monochrome_test.cc"
-            "${AOM_ROOT}/test/qm_test.cc"
             "${AOM_ROOT}/test/resize_test.cc"
             "${AOM_ROOT}/test/scalability_test.cc"
             "${AOM_ROOT}/test/y4m_test.cc"
@@ -115,6 +115,7 @@
 
   if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
     list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/altref_test.cc"
                 "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
                 "${AOM_ROOT}/test/av1_ext_tile_test.cc"
                 "${AOM_ROOT}/test/binary_codes_test.cc"
@@ -125,18 +126,25 @@
                 "${AOM_ROOT}/test/divu_small_test.cc"
                 "${AOM_ROOT}/test/dr_prediction_test.cc"
                 "${AOM_ROOT}/test/ec_test.cc"
+                "${AOM_ROOT}/test/error_resilience_test.cc"
                 "${AOM_ROOT}/test/ethread_test.cc"
                 "${AOM_ROOT}/test/film_grain_table_test.cc"
+                "${AOM_ROOT}/test/fwd_kf_test.cc"
+                "${AOM_ROOT}/test/kf_test.cc"
+                "${AOM_ROOT}/test/lossless_test.cc"
+                "${AOM_ROOT}/test/quant_test.cc"
                 "${AOM_ROOT}/test/sb_multipass_test.cc"
+                "${AOM_ROOT}/test/screen_content_test.cc"
                 "${AOM_ROOT}/test/segment_binarization_sync.cc"
+                "${AOM_ROOT}/test/still_picture_test.cc"
                 "${AOM_ROOT}/test/superframe_test.cc"
+                "${AOM_ROOT}/test/tile_config_test.cc"
                 "${AOM_ROOT}/test/tile_independence_test.cc"
-                "${AOM_ROOT}/test/temporal_filter_planewise_test.cc"
-                "${AOM_ROOT}/test/temporal_filter_yuv_test.cc")
+                "${AOM_ROOT}/test/temporal_filter_test.cc")
     if(CONFIG_REALTIME_ONLY)
       list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
                        "${AOM_ROOT}/test/cnn_test.cc"
-                       "${AOM_ROOT}/test/temporal_filter_yuv_test.cc")
+                       "${AOM_ROOT}/test/selfguided_filter_test.cc")
     endif()
     if(NOT CONFIG_AV1_HIGHBITDEPTH)
       list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
@@ -170,7 +178,7 @@
                 "${AOM_ROOT}/test/simd_sse4_test.cc")
   endif()
 
-  if(HAVE_SSE4_1)
+  if(HAVE_SSE4_1 OR HAVE_NEON)
     list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
                 "${AOM_ROOT}/test/filterintra_test.cc")
   endif()
@@ -184,9 +192,7 @@
 
   list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
               "${AOM_ROOT}/test/arf_freq_test.cc"
-              "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
-              "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
-              "${AOM_ROOT}/test/av1_convolve_2d_test_util.h"
+              "${AOM_ROOT}/test/av1_convolve_test.cc"
               "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc"
               "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
               "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
@@ -222,15 +228,34 @@
               "${AOM_ROOT}/test/subtract_test.cc"
               "${AOM_ROOT}/test/reconinter_test.cc"
               "${AOM_ROOT}/test/sum_squares_test.cc"
+              "${AOM_ROOT}/test/sse_sum_test.cc"
               "${AOM_ROOT}/test/variance_test.cc"
               "${AOM_ROOT}/test/wiener_test.cc"
               "${AOM_ROOT}/test/frame_error_test.cc"
               "${AOM_ROOT}/test/warp_filter_test.cc"
               "${AOM_ROOT}/test/warp_filter_test_util.cc"
-              "${AOM_ROOT}/test/warp_filter_test_util.h")
+              "${AOM_ROOT}/test/warp_filter_test_util.h"
+              "${AOM_ROOT}/test/webmenc_test.cc"
+              "${AOM_ROOT}/test/av1_k_means_test.cc")
+
+  if(CONFIG_REALTIME_ONLY)
+    list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+                     "${AOM_ROOT}/test/frame_error_test.cc"
+                     "${AOM_ROOT}/test/obmc_sad_test.cc"
+                     "${AOM_ROOT}/test/obmc_variance_test.cc"
+                     "${AOM_ROOT}/test/pickrst_test.cc"
+                     "${AOM_ROOT}/test/warp_filter_test.cc"
+                     "${AOM_ROOT}/test/warp_filter_test_util.cc"
+                     "${AOM_ROOT}/test/warp_filter_test_util.h"
+                     "${AOM_ROOT}/test/wiener_test.cc")
+  endif()
+
+  if((HAVE_SSE4_1 OR HAVE_NEON))
+    list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+                "${AOM_ROOT}/test/av1_highbd_iht_test.cc")
+  endif()
 
   list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
-              "${AOM_ROOT}/test/av1_highbd_iht_test.cc"
               "${AOM_ROOT}/test/av1_quantize_test.cc"
               "${AOM_ROOT}/test/corner_match_test.cc"
               "${AOM_ROOT}/test/simd_cmp_sse4.cc")
diff --git a/test/test_data_util.cmake b/test/test_data_util.cmake
index 050600e..142a137 100644
--- a/test/test_data_util.cmake
+++ b/test/test_data_util.cmake
@@ -10,6 +10,7 @@
 #
 
 list(APPEND AOM_TEST_DATA_FILE_NAMES
+            "desktop1.320_180.yuv"
             "hantro_collage_w352h288.yuv"
             "hantro_odd.yuv"
             "paris_352_288_30.y4m"
@@ -533,33 +534,30 @@
               "invalid-oss-fuzz-10227.ivf"
               "invalid-oss-fuzz-10227.ivf.res"
               "invalid-oss-fuzz-10389.ivf"
-              "invalid-oss-fuzz-10389.ivf.res"
-              "invalid-oss-fuzz-10389.ivf.res.2"
+              "invalid-oss-fuzz-10389.ivf.res.4"
               "invalid-oss-fuzz-10555.ivf"
               "invalid-oss-fuzz-10555.ivf.res"
               "invalid-oss-fuzz-10705.ivf"
               "invalid-oss-fuzz-10705.ivf.res"
               "invalid-oss-fuzz-10723.ivf"
-              "invalid-oss-fuzz-10723.ivf.res"
               "invalid-oss-fuzz-10723.ivf.res.2"
               "invalid-oss-fuzz-10779.ivf"
               "invalid-oss-fuzz-10779.ivf.res"
               "invalid-oss-fuzz-11477.ivf"
               "invalid-oss-fuzz-11477.ivf.res"
               "invalid-oss-fuzz-11479.ivf"
-              "invalid-oss-fuzz-11479.ivf.res"
               "invalid-oss-fuzz-11479.ivf.res.2"
               "invalid-oss-fuzz-11523.ivf"
-              "invalid-oss-fuzz-11523.ivf.res"
               "invalid-oss-fuzz-11523.ivf.res.2"
               "invalid-oss-fuzz-15363.ivf"
               "invalid-oss-fuzz-15363.ivf.res"
               "invalid-oss-fuzz-16437.ivf"
-              "invalid-oss-fuzz-16437.ivf.res"
+              "invalid-oss-fuzz-16437.ivf.res.2"
+              "invalid-oss-fuzz-24706.ivf"
+              "invalid-oss-fuzz-24706.ivf.res"
               "invalid-oss-fuzz-9288.ivf"
               "invalid-oss-fuzz-9288.ivf.res"
               "invalid-oss-fuzz-9463.ivf"
-              "invalid-oss-fuzz-9463.ivf.res"
               "invalid-oss-fuzz-9463.ivf.res.2"
               "invalid-oss-fuzz-9482.ivf"
               "invalid-oss-fuzz-9482.ivf.res"
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 25c50d0..54bdbcb 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -427,12 +427,14 @@
                 aom_smooth_predictor_4x8_c, aom_smooth_v_predictor_4x8_c,
                 aom_smooth_h_predictor_4x8_c)
 
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(C_3, TX_4X16, aom_dc_predictor_4x16_c,
                 aom_dc_left_predictor_4x16_c, aom_dc_top_predictor_4x16_c,
                 aom_dc_128_predictor_4x16_c, aom_v_predictor_4x16_c,
                 aom_h_predictor_4x16_c, aom_paeth_predictor_4x16_c,
                 aom_smooth_predictor_4x16_c, aom_smooth_v_predictor_4x16_c,
                 aom_smooth_h_predictor_4x16_c)
+#endif
 
 #if HAVE_SSE2
 INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_dc_predictor_4x4_sse2,
@@ -443,10 +445,12 @@
                 aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2,
                 aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2,
                 aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSE2_3, TX_4X16, aom_dc_predictor_4x16_sse2,
                 aom_dc_left_predictor_4x16_sse2, aom_dc_top_predictor_4x16_sse2,
                 aom_dc_128_predictor_4x16_sse2, aom_v_predictor_4x16_sse2,
                 aom_h_predictor_4x16_sse2, NULL, NULL, NULL, NULL)
+#endif
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -458,10 +462,12 @@
                 aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3,
                 aom_smooth_v_predictor_4x8_ssse3,
                 aom_smooth_h_predictor_4x8_ssse3)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSSE3_3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x16_ssse3, aom_smooth_predictor_4x16_ssse3,
                 aom_smooth_v_predictor_4x16_ssse3,
                 aom_smooth_h_predictor_4x16_ssse3)
+#endif
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
@@ -507,12 +513,14 @@
                 aom_smooth_predictor_8x16_c, aom_smooth_v_predictor_8x16_c,
                 aom_smooth_h_predictor_8x16_c)
 
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(C_4, TX_8X32, aom_dc_predictor_8x32_c,
                 aom_dc_left_predictor_8x32_c, aom_dc_top_predictor_8x32_c,
                 aom_dc_128_predictor_8x32_c, aom_v_predictor_8x32_c,
                 aom_h_predictor_8x32_c, aom_paeth_predictor_8x32_c,
                 aom_smooth_predictor_8x32_c, aom_smooth_v_predictor_8x32_c,
                 aom_smooth_h_predictor_8x32_c)
+#endif
 
 #if HAVE_SSE2
 INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_dc_predictor_8x8_sse2,
@@ -527,10 +535,12 @@
                 aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
                 aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2,
                 aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSE2_4, TX_8X32, aom_dc_predictor_8x32_sse2,
                 aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2,
                 aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2,
                 aom_h_predictor_8x32_sse2, NULL, NULL, NULL, NULL)
+#endif
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -546,10 +556,12 @@
                 aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3,
                 aom_smooth_v_predictor_8x16_ssse3,
                 aom_smooth_h_predictor_8x16_ssse3)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x32_ssse3, aom_smooth_predictor_8x32_ssse3,
                 aom_smooth_v_predictor_8x32_ssse3,
                 aom_smooth_h_predictor_8x32_ssse3)
+#endif
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
@@ -595,6 +607,7 @@
                 aom_smooth_predictor_16x32_c, aom_smooth_v_predictor_16x32_c,
                 aom_smooth_h_predictor_16x32_c)
 
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(C_4, TX_16X4, aom_dc_predictor_16x4_c,
                 aom_dc_left_predictor_16x4_c, aom_dc_top_predictor_16x4_c,
                 aom_dc_128_predictor_16x4_c, aom_v_predictor_16x4_c,
@@ -608,6 +621,7 @@
                 aom_h_predictor_16x64_c, aom_paeth_predictor_16x64_c,
                 aom_smooth_predictor_16x64_c, aom_smooth_v_predictor_16x64_c,
                 aom_smooth_h_predictor_16x64_c)
+#endif
 
 #if HAVE_SSE2
 INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_dc_predictor_16x16_sse2,
@@ -624,6 +638,7 @@
                 aom_dc_top_predictor_16x32_sse2,
                 aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
                 aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2,
                 aom_dc_left_predictor_16x64_sse2,
                 aom_dc_top_predictor_16x64_sse2,
@@ -633,6 +648,7 @@
                 aom_dc_left_predictor_16x4_sse2, aom_dc_top_predictor_16x4_sse2,
                 aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2,
                 aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL)
+#endif
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -650,6 +666,7 @@
                 aom_smooth_predictor_16x32_ssse3,
                 aom_smooth_v_predictor_16x32_ssse3,
                 aom_smooth_h_predictor_16x32_ssse3)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x64_ssse3,
                 aom_smooth_predictor_16x64_ssse3,
@@ -659,6 +676,7 @@
                 aom_paeth_predictor_16x4_ssse3, aom_smooth_predictor_16x4_ssse3,
                 aom_smooth_v_predictor_16x4_ssse3,
                 aom_smooth_h_predictor_16x4_ssse3)
+#endif
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
@@ -668,8 +686,10 @@
                 aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL)
 INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(AVX2_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x64_avx2, NULL, NULL, NULL)
+#endif
 #endif  // HAVE_AVX2
 
 #if HAVE_DSPR2
@@ -716,12 +736,14 @@
                 aom_smooth_predictor_32x64_c, aom_smooth_v_predictor_32x64_c,
                 aom_smooth_h_predictor_32x64_c)
 
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(C_4, TX_32X8, aom_dc_predictor_32x8_c,
                 aom_dc_left_predictor_32x8_c, aom_dc_top_predictor_32x8_c,
                 aom_dc_128_predictor_32x8_c, aom_v_predictor_32x8_c,
                 aom_h_predictor_32x8_c, aom_paeth_predictor_32x8_c,
                 aom_smooth_predictor_32x8_c, aom_smooth_v_predictor_32x8_c,
                 aom_smooth_h_predictor_32x8_c)
+#endif
 
 #if HAVE_SSE2
 INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_dc_predictor_32x32_sse2,
@@ -739,10 +761,12 @@
                 aom_dc_top_predictor_32x64_sse2,
                 aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2,
                 aom_h_predictor_32x64_sse2, NULL, NULL, NULL, NULL)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSE2_4, TX_32X8, aom_dc_predictor_32x8_sse2,
                 aom_dc_left_predictor_32x8_sse2, aom_dc_top_predictor_32x8_sse2,
                 aom_dc_128_predictor_32x8_sse2, aom_v_predictor_32x8_sse2,
                 aom_h_predictor_32x8_sse2, NULL, NULL, NULL, NULL)
+#endif
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -761,10 +785,12 @@
                 aom_smooth_predictor_32x64_ssse3,
                 aom_smooth_v_predictor_32x64_ssse3,
                 aom_smooth_h_predictor_32x64_ssse3)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSSE3_4, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x8_ssse3, aom_smooth_predictor_32x8_ssse3,
                 aom_smooth_v_predictor_32x8_ssse3,
                 aom_smooth_h_predictor_32x8_ssse3)
+#endif
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
@@ -818,12 +844,14 @@
                 aom_smooth_predictor_64x32_c, aom_smooth_v_predictor_64x32_c,
                 aom_smooth_h_predictor_64x32_c)
 
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(C_3, TX_64X16, aom_dc_predictor_64x16_c,
                 aom_dc_left_predictor_64x16_c, aom_dc_top_predictor_64x16_c,
                 aom_dc_128_predictor_64x16_c, aom_v_predictor_64x16_c,
                 aom_h_predictor_64x16_c, aom_paeth_predictor_64x16_c,
                 aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c,
                 aom_smooth_h_predictor_64x16_c)
+#endif
 
 #if HAVE_SSE2
 INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2,
@@ -836,12 +864,14 @@
                 aom_dc_top_predictor_64x32_sse2,
                 aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2,
                 aom_h_predictor_64x32_sse2, NULL, NULL, NULL, NULL)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2,
                 aom_dc_left_predictor_64x16_sse2,
                 aom_dc_top_predictor_64x16_sse2,
                 aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2,
                 aom_h_predictor_64x16_sse2, NULL, NULL, NULL, NULL)
 #endif
+#endif
 
 #if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -854,12 +884,14 @@
                 aom_smooth_predictor_64x32_ssse3,
                 aom_smooth_v_predictor_64x32_ssse3,
                 aom_smooth_h_predictor_64x32_ssse3)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_64x16_ssse3,
                 aom_smooth_predictor_64x16_ssse3,
                 aom_smooth_v_predictor_64x16_ssse3,
                 aom_smooth_h_predictor_64x16_ssse3)
 #endif
+#endif
 
 #if HAVE_AVX2
 INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2,
@@ -872,12 +904,14 @@
                 aom_dc_top_predictor_64x32_avx2,
                 aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2,
                 NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
+#if !CONFIG_REALTIME_ONLY
 INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2,
                 aom_dc_left_predictor_64x16_avx2,
                 aom_dc_top_predictor_64x16_avx2,
                 aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
                 NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
 #endif
+#endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
 // -----------------------------------------------------------------------------
@@ -1201,7 +1235,7 @@
     aom_highbd_h_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c,
     aom_highbd_smooth_predictor_4x8_c, aom_highbd_smooth_v_predictor_4x8_c,
     aom_highbd_smooth_h_predictor_4x8_c)
-
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_INTRA_PRED_TEST(
     C_3, TX_4X16, aom_highbd_dc_predictor_4x16_c,
     aom_highbd_dc_left_predictor_4x16_c, aom_highbd_dc_top_predictor_4x16_c,
@@ -1209,7 +1243,7 @@
     aom_highbd_h_predictor_4x16_c, aom_highbd_paeth_predictor_4x16_c,
     aom_highbd_smooth_predictor_4x16_c, aom_highbd_smooth_v_predictor_4x16_c,
     aom_highbd_smooth_h_predictor_4x16_c)
-
+#endif
 #if HAVE_SSE2
 HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_highbd_dc_predictor_4x4_sse2,
                        aom_highbd_dc_left_predictor_4x4_sse2,
@@ -1252,7 +1286,7 @@
     aom_highbd_h_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c,
     aom_highbd_smooth_predictor_8x16_c, aom_highbd_smooth_v_predictor_8x16_c,
     aom_highbd_smooth_h_predictor_8x16_c)
-
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_INTRA_PRED_TEST(
     C_4, TX_8X32, aom_highbd_dc_predictor_8x32_c,
     aom_highbd_dc_left_predictor_8x32_c, aom_highbd_dc_top_predictor_8x32_c,
@@ -1260,7 +1294,7 @@
     aom_highbd_h_predictor_8x32_c, aom_highbd_paeth_predictor_8x32_c,
     aom_highbd_smooth_predictor_8x32_c, aom_highbd_smooth_v_predictor_8x32_c,
     aom_highbd_smooth_h_predictor_8x32_c)
-
+#endif
 #if HAVE_SSE2
 HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_highbd_dc_predictor_8x8_sse2,
                        aom_highbd_dc_left_predictor_8x8_sse2,
@@ -1314,6 +1348,7 @@
     aom_highbd_smooth_predictor_16x32_c, aom_highbd_smooth_v_predictor_16x32_c,
     aom_highbd_smooth_h_predictor_16x32_c)
 
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_INTRA_PRED_TEST(
     C_4, TX_16X4, aom_highbd_dc_predictor_16x4_c,
     aom_highbd_dc_left_predictor_16x4_c, aom_highbd_dc_top_predictor_16x4_c,
@@ -1329,6 +1364,7 @@
     aom_highbd_h_predictor_16x64_c, aom_highbd_paeth_predictor_16x64_c,
     aom_highbd_smooth_predictor_16x64_c, aom_highbd_smooth_v_predictor_16x64_c,
     aom_highbd_smooth_h_predictor_16x64_c)
+#endif
 
 #if HAVE_SSE2
 HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_highbd_dc_predictor_16x16_sse2,
@@ -1396,6 +1432,7 @@
     aom_highbd_smooth_predictor_32x64_c, aom_highbd_smooth_v_predictor_32x64_c,
     aom_highbd_smooth_h_predictor_32x64_c)
 
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_INTRA_PRED_TEST(
     C_4, TX_32X8, aom_highbd_dc_predictor_32x8_c,
     aom_highbd_dc_left_predictor_32x8_c, aom_highbd_dc_top_predictor_32x8_c,
@@ -1403,6 +1440,7 @@
     aom_highbd_h_predictor_32x8_c, aom_highbd_paeth_predictor_32x8_c,
     aom_highbd_smooth_predictor_32x8_c, aom_highbd_smooth_v_predictor_32x8_c,
     aom_highbd_smooth_h_predictor_32x8_c)
+#endif
 
 #if HAVE_SSE2
 HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_highbd_dc_predictor_32x32_sse2,
@@ -1453,6 +1491,7 @@
     aom_highbd_smooth_predictor_64x32_c, aom_highbd_smooth_v_predictor_64x32_c,
     aom_highbd_smooth_h_predictor_64x32_c)
 
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_INTRA_PRED_TEST(
     C_3, TX_64X16, aom_highbd_dc_predictor_64x16_c,
     aom_highbd_dc_left_predictor_64x16_c, aom_highbd_dc_top_predictor_64x16_c,
@@ -1460,6 +1499,7 @@
     aom_highbd_h_predictor_64x16_c, aom_highbd_paeth_predictor_64x16_c,
     aom_highbd_smooth_predictor_64x16_c, aom_highbd_smooth_v_predictor_64x16_c,
     aom_highbd_smooth_h_predictor_64x16_c)
+#endif
 
 // -----------------------------------------------------------------------------
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index eab92b6..5cfcb6f 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -148,7 +148,7 @@
 }
 
 #if CONFIG_AV1_DECODER
-AV1_INSTANTIATE_TEST_CASE(
+AV1_INSTANTIATE_TEST_SUITE(
     TestVectorTest,
     ::testing::Combine(::testing::Values(1),  // Single thread.
                        ::testing::ValuesIn(libaom_test::kAV1TestVectors,
diff --git a/test/tile_config_test.cc b/test/tile_config_test.cc
new file mode 100644
index 0000000..8c92c14
--- /dev/null
+++ b/test/tile_config_test.cc
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+typedef struct {
+  // Superblock size
+  const unsigned int sb_size;
+  // log2(number of tile rows)
+  const unsigned int tile_rows;
+  // log2(number of tile columns)
+  const unsigned int tile_cols;
+} uniformTileConfigParam;
+
+static const uniformTileConfigParam uniformTileConfigParams[] = {
+  { 128, 0, 0 }, { 128, 0, 2 }, { 128, 2, 0 }, { 128, 1, 2 }, { 128, 2, 2 },
+  { 128, 3, 2 }, { 64, 0, 0 },  { 64, 0, 2 },  { 64, 2, 0 },  { 64, 1, 2 },
+  { 64, 2, 2 },  { 64, 3, 3 },  { 64, 4, 4 }
+};
+
+typedef struct {
+  // Superblock size
+  const unsigned int sb_size;
+  // number of tile widths
+  const unsigned int tile_width_count;
+  // list of tile widths
+  int tile_widths[AOM_MAX_TILE_COLS];
+  // number of tile heights
+  const unsigned int tile_height_count;
+  // list of tile heights
+  int tile_heights[AOM_MAX_TILE_ROWS];
+} nonUniformTileConfigParam;
+
+const nonUniformTileConfigParam nonUniformTileConfigParams[] = {
+  { 64, 1, { 3 }, 1, { 3 } },          { 64, 2, { 1, 2 }, 2, { 1, 2 } },
+  { 64, 3, { 2, 3, 4 }, 2, { 2, 3 } }, { 128, 1, { 3 }, 1, { 3 } },
+  { 128, 2, { 1, 2 }, 2, { 1, 2 } },   { 128, 3, { 2, 3, 4 }, 2, { 2, 3 } },
+};
+
+// Find smallest k>=0 such that (blk_size << k) >= target
+static INLINE int tile_log2(int blk_size, int target) {
+  int k;
+  for (k = 0; (blk_size << k) < target; k++) {
+  }
+  return k;
+}
+
+// This class is used to validate tile configuration for uniform spacing.
+class UniformTileConfigTestLarge
+    : public ::libaom_test::CodecTestWith3Params<
+          libaom_test::TestMode, uniformTileConfigParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  UniformTileConfigTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        tile_config_param_(GET_PARAM(2)), end_usage_check_(GET_PARAM(3)) {
+    tile_config_violated_ = false;
+    max_tile_cols_log2_ = tile_log2(1, AOM_MAX_TILE_COLS);
+    max_tile_rows_log2_ = tile_log2(1, AOM_MAX_TILE_ROWS);
+  }
+  virtual ~UniformTileConfigTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = end_usage_check_;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 19;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_TILE_COLUMNS, tile_config_param_.tile_cols);
+      encoder->Control(AV1E_SET_TILE_ROWS, tile_config_param_.tile_rows);
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_SUPERBLOCK_SIZE,
+                       tile_config_param_.sb_size == 64
+                           ? AOM_SUPERBLOCK_SIZE_64X64
+                           : AOM_SUPERBLOCK_SIZE_128X128);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      aom_tile_info tile_info;
+      int config_tile_columns = AOMMIN(1 << (int)tile_config_param_.tile_cols,
+                                       1 << max_tile_cols_log2_);
+      int config_tile_rows = AOMMIN(1 << (int)tile_config_param_.tile_rows,
+                                    1 << max_tile_rows_log2_);
+
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+      if (tile_info.tile_columns != config_tile_columns ||
+          tile_info.tile_rows != config_tile_rows) {
+        tile_config_violated_ = true;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const uniformTileConfigParam tile_config_param_;
+  int max_tile_cols_log2_;
+  int max_tile_rows_log2_;
+  bool tile_config_violated_;
+  aom_rc_mode end_usage_check_;
+};
+
+// This class is used to validate tile configuration for non uniform spacing.
+class NonUniformTileConfigTestLarge
+    : public ::libaom_test::CodecTestWith3Params<
+          libaom_test::TestMode, nonUniformTileConfigParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  NonUniformTileConfigTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        tile_config_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
+    tile_config_violated_ = false;
+  }
+  virtual ~NonUniformTileConfigTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.tile_width_count = tile_config_param_.tile_width_count;
+    memcpy(cfg_.tile_widths, tile_config_param_.tile_widths,
+           sizeof(tile_config_param_.tile_widths[0]) *
+               tile_config_param_.tile_width_count);
+    cfg_.tile_height_count = tile_config_param_.tile_height_count;
+    memcpy(cfg_.tile_heights, tile_config_param_.tile_heights,
+           sizeof(tile_config_param_.tile_heights[0]) *
+               tile_config_param_.tile_height_count);
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_SUPERBLOCK_SIZE,
+                       tile_config_param_.sb_size == 64
+                           ? AOM_SUPERBLOCK_SIZE_64X64
+                           : AOM_SUPERBLOCK_SIZE_128X128);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      aom_tile_info tile_info;
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+
+      // check validity of tile cols
+      int tile_col_idx, tile_col = 0;
+      for (tile_col_idx = 0; tile_col_idx < tile_info.tile_columns - 1;
+           tile_col_idx++) {
+        if (tile_config_param_.tile_widths[tile_col] !=
+            tile_info.tile_widths[tile_col_idx])
+          tile_config_violated_ = true;
+        tile_col = (tile_col + 1) % (int)tile_config_param_.tile_width_count;
+      }
+      // last column may not be able to accommodate config, but if it is
+      // greater than what is configured, there is a violation.
+      if (tile_config_param_.tile_widths[tile_col] <
+          tile_info.tile_widths[tile_col_idx])
+        tile_config_violated_ = true;
+
+      // check validity of tile rows
+      int tile_row_idx, tile_row = 0;
+      for (tile_row_idx = 0; tile_row_idx < tile_info.tile_rows - 1;
+           tile_row_idx++) {
+        if (tile_config_param_.tile_heights[tile_row] !=
+            tile_info.tile_heights[tile_row_idx])
+          tile_config_violated_ = true;
+        tile_row = (tile_row + 1) % (int)tile_config_param_.tile_height_count;
+      }
+      // last row may not be able to accommodate config, but if it is
+      // greater than what is configured, there is a violation.
+      if (tile_config_param_.tile_heights[tile_row] <
+          tile_info.tile_heights[tile_row_idx])
+        tile_config_violated_ = true;
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const nonUniformTileConfigParam tile_config_param_;
+  bool tile_config_violated_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(UniformTileConfigTestLarge, UniformTileConfigTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(video.Begin());
+
+  int max_tiles_cols = video.img()->w / (int)tile_config_param_.sb_size;
+  int max_tiles_rows = video.img()->h / (int)tile_config_param_.sb_size;
+  max_tile_cols_log2_ = tile_log2(1, AOMMIN(max_tiles_cols, AOM_MAX_TILE_COLS));
+  max_tile_rows_log2_ = tile_log2(1, AOMMIN(max_tiles_rows, AOM_MAX_TILE_ROWS));
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(tile_config_violated_, false);
+}
+
+TEST_P(UniformTileConfigTestLarge, UniformTileConfigTestLowRes) {
+  ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(video.Begin());
+
+  int max_tiles_cols = video.img()->w / (int)tile_config_param_.sb_size;
+  int max_tiles_rows = video.img()->h / (int)tile_config_param_.sb_size;
+  max_tile_cols_log2_ = tile_log2(1, AOMMIN(max_tiles_cols, AOM_MAX_TILE_COLS));
+  max_tile_rows_log2_ = tile_log2(1, AOMMIN(max_tiles_rows, AOM_MAX_TILE_ROWS));
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(tile_config_violated_, false);
+}
+
+TEST_P(NonUniformTileConfigTestLarge, NonUniformTileConfigTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(tile_config_violated_, false);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(UniformTileConfigTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(uniformTileConfigParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+AV1_INSTANTIATE_TEST_SUITE(NonUniformTileConfigTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(nonUniformTileConfigParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+typedef struct {
+  // Number of tile groups to set.
+  const int num_tg;
+  // Number of tile rows to set
+  const int num_tile_rows;
+  // Number of tile columns to set
+  const int num_tile_cols;
+} TileGroupConfigParams;
+
+static const TileGroupConfigParams tileGroupTestParams[] = {
+  { 5, 4, 4 }, { 3, 3, 3 }, { 5, 3, 3 }, { 7, 5, 5 }, { 7, 3, 3 }, { 7, 4, 4 }
+};
+
+std::ostream &operator<<(std::ostream &os,
+                         const TileGroupConfigParams &test_arg) {
+  return os << "TileGroupConfigParams { num_tg:" << test_arg.num_tg
+            << " num_tile_rows:" << test_arg.num_tile_rows
+            << " num_tile_cols:" << test_arg.num_tile_cols << " }";
+}
+
+// This class is used to test number of tile groups present in header.
+class TileGroupTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 TileGroupConfigParams>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  TileGroupTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        tile_group_config_params_(GET_PARAM(2)) {
+    tile_group_config_violated_ = false;
+  }
+  virtual ~TileGroupTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.g_threads = 1;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AV1E_SET_NUM_TG, tile_group_config_params_.num_tg);
+      encoder->Control(AV1E_SET_TILE_COLUMNS,
+                       tile_group_config_params_.num_tile_cols);
+      encoder->Control(AV1E_SET_TILE_ROWS,
+                       tile_group_config_params_.num_tile_rows);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_tile_info tile_info;
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+                                    &show_existing_frame_);
+      if (tile_info.num_tile_groups != tile_group_config_params_.num_tg &&
+          !show_existing_frame_)
+        tile_group_config_violated_ = true;
+      EXPECT_EQ(tile_group_config_violated_, false);
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  int show_existing_frame_;
+  bool tile_group_config_violated_;
+  aom_rc_mode end_usage_check_;
+  ::libaom_test::TestMode encoding_mode_;
+  const TileGroupConfigParams tile_group_config_params_;
+};
+
+TEST_P(TileGroupTestLarge, TileGroupCountTest) {
+  libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 5);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(TileGroupTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(tileGroupTestParams));
+}  // namespace
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 4f7c4a4..5f167be 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -139,10 +139,10 @@
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1),
-                          ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
-                          ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceTest, ::testing::Values(0, 1),
+                           ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceTestLarge, ::testing::Values(0, 1),
+                           ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
 
 class TileIndependenceLSTest : public TileIndependenceTest {};
 
@@ -166,8 +166,8 @@
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(6),
-                          ::testing::Values(6), ::testing::Values(1));
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge, ::testing::Values(6),
-                          ::testing::Values(6), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceLSTest, ::testing::Values(6),
+                           ::testing::Values(6), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceLSTestLarge, ::testing::Values(6),
+                           ::testing::Values(6), ::testing::Values(1));
 }  // namespace
diff --git a/test/time_stamp_test.cc b/test/time_stamp_test.cc
index 679e4da..3e2fa97 100644
--- a/test/time_stamp_test.cc
+++ b/test/time_stamp_test.cc
@@ -99,7 +99,7 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_CASE(TimestampTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood));
+AV1_INSTANTIATE_TEST_SUITE(TimestampTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood));
 
 }  // namespace
diff --git a/test/tools_common.sh b/test/tools_common.sh
index c087106..d40709b 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -196,7 +196,7 @@
 
 # Echoes "fast" encode params for use with aomenc.
 aomenc_encode_test_fast_params() {
-  echo "--cpu-used=1
+  echo "--cpu-used=2
         --limit=${AV1_ENCODE_TEST_FRAME_LIMIT}
         --lag-in-frames=0
         --test-decode=fatal"
diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh
index cca44ce..44e7327 100755
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@@ -38,7 +38,7 @@
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" "${limit}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/test/util.h b/test/util.h
index aa4b106..ef9f92d 100644
--- a/test/util.h
+++ b/test/util.h
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <math.h>
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "aom/aom_integer.h"
 #include "aom/aom_image.h"
 #include "aom_ports/aom_timer.h"
 
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 1458ece..5bf040c 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -30,6 +30,8 @@
 
 namespace {
 
+typedef uint64_t (*MseWxH16bitFunc)(uint8_t *dst, int dstride, uint16_t *src,
+                                    int sstride, int w, int h);
 typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         unsigned int *sse);
@@ -49,10 +51,13 @@
     const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
     int b_stride, uint32_t *sse, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param);
+
+#if !CONFIG_REALTIME_ONLY
 typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride,
                                       int xoffset, int yoffset,
                                       const int32_t *wsrc, const int32_t *mask,
                                       unsigned int *sse);
+#endif
 
 using libaom_test::ACMRandom;
 
@@ -275,6 +280,7 @@
   return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
+#if !CONFIG_REALTIME_ONLY
 static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h,
                                          int xoff, int yoff,
                                          const int32_t *wsrc,
@@ -324,6 +330,7 @@
   *sse_ptr = static_cast<uint32_t>(sse);
   return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -407,6 +414,106 @@
 
 // Main class for testing a function type
 template <typename FunctionType>
+class MseWxHTestClass
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(src_)));
+    dst_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(16, block_size() * sizeof(dst_)));
+    ASSERT_TRUE(src_ != NULL);
+    ASSERT_TRUE(dst_ != NULL);
+  }
+
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dst_);
+    src_ = NULL;
+    dst_ = NULL;
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RefMatchTestMse();
+  void SpeedTest();
+
+ protected:
+  ACMRandom rnd_;
+  uint8_t *dst_;
+  uint16_t *src_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  int d_stride() const { return params_.width; }  // stride is same as width
+  int s_stride() const { return params_.width; }  // stride is same as width
+};
+
+template <typename MseWxHFunctionType>
+void MseWxHTestClass<MseWxHFunctionType>::SpeedTest() {
+  aom_usec_timer ref_timer, test_timer;
+  double elapsed_time_c = 0;
+  double elapsed_time_simd = 0;
+  int run_time = 10000000;
+  int w = width();
+  int h = height();
+  int dstride = d_stride();
+  int sstride = s_stride();
+
+  for (int k = 0; k < block_size(); ++k) {
+    dst_[k] = rnd_.Rand8();
+    src_[k] = rnd_.Rand8();
+  }
+  aom_usec_timer_start(&ref_timer);
+  for (int i = 0; i < run_time; i++) {
+    aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  elapsed_time_c = static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+  aom_usec_timer_start(&test_timer);
+  for (int i = 0; i < run_time; i++) {
+    params_.func(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&test_timer);
+  elapsed_time_simd = static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+  printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%lf\n", width(), height(),
+         elapsed_time_c, elapsed_time_simd,
+         (elapsed_time_c / elapsed_time_simd));
+}
+
+template <typename MseWxHFunctionType>
+void MseWxHTestClass<MseWxHFunctionType>::RefMatchTestMse() {
+  uint64_t mse_ref = 0;
+  uint64_t mse_mod = 0;
+  int w = width();
+  int h = height();
+  int dstride = d_stride();
+  int sstride = s_stride();
+
+  for (int i = 0; i < 10; i++) {
+    for (int k = 0; k < block_size(); ++k) {
+      dst_[k] = rnd_.Rand8();
+      src_[k] = rnd_.Rand8();
+    }
+    ASM_REGISTER_STATE_CHECK(
+        mse_ref = aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h));
+    ASM_REGISTER_STATE_CHECK(
+        mse_mod = params_.func(dst_, dstride, src_, sstride, w, h));
+    EXPECT_EQ(mse_ref, mse_mod)
+        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+  }
+}
+
+// Main class for testing a function type
+template <typename FunctionType>
 class MainTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
@@ -915,6 +1022,8 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if !CONFIG_REALTIME_ONLY
+
 static const int kMaskMax = 64;
 
 typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;
@@ -1070,6 +1179,10 @@
          params_.bit_depth, elapsed_time);
 }
 
+#endif  // !CONFIG_REALTIME_ONLY
+
+typedef MseWxHTestClass<MseWxH16bitFunc> MseWxHTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MseWxHTest);
 typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
@@ -1077,10 +1190,15 @@
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
 typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
     AvxDistWtdSubpelAvgVarianceTest;
+#if !CONFIG_REALTIME_ONLY
 typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
+#endif
+typedef TestParams<MseWxH16bitFunc> MseWxHParams;
 
 TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
 TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
+TEST_P(MseWxHTest, RefMse) { RefMatchTestMse(); }
+TEST_P(MseWxHTest, DISABLED_SpeedMse) { SpeedTest(); }
 TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
 TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
 TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
@@ -1095,9 +1213,11 @@
 TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
+#if !CONFIG_REALTIME_ONLY
 TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+#endif
 
 INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
                          ::testing::Values(aom_get_mb_ss_c));
@@ -1115,132 +1235,143 @@
                                            MseParams(3, 3, &aom_mse8x8_c)));
 
 typedef TestParams<VarianceMxNFunc> VarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
-                      VarianceParams(7, 6, &aom_variance128x64_c),
-                      VarianceParams(6, 7, &aom_variance64x128_c),
-                      VarianceParams(6, 6, &aom_variance64x64_c),
-                      VarianceParams(6, 5, &aom_variance64x32_c),
-                      VarianceParams(5, 6, &aom_variance32x64_c),
-                      VarianceParams(5, 5, &aom_variance32x32_c),
-                      VarianceParams(5, 4, &aom_variance32x16_c),
-                      VarianceParams(4, 5, &aom_variance16x32_c),
-                      VarianceParams(4, 4, &aom_variance16x16_c),
-                      VarianceParams(4, 3, &aom_variance16x8_c),
-                      VarianceParams(3, 4, &aom_variance8x16_c),
-                      VarianceParams(3, 3, &aom_variance8x8_c),
-                      VarianceParams(3, 2, &aom_variance8x4_c),
-                      VarianceParams(2, 3, &aom_variance4x8_c),
-                      VarianceParams(2, 2, &aom_variance4x4_c),
-
-                      VarianceParams(6, 4, &aom_variance64x16_c),
-                      VarianceParams(4, 6, &aom_variance16x64_c),
-                      VarianceParams(5, 3, &aom_variance32x8_c),
-                      VarianceParams(3, 5, &aom_variance8x32_c),
-                      VarianceParams(4, 2, &aom_variance16x4_c),
-                      VarianceParams(2, 4, &aom_variance4x16_c)));
+const VarianceParams kArrayVariance_c[] = {
+  VarianceParams(7, 7, &aom_variance128x128_c),
+  VarianceParams(7, 6, &aom_variance128x64_c),
+  VarianceParams(6, 7, &aom_variance64x128_c),
+  VarianceParams(6, 6, &aom_variance64x64_c),
+  VarianceParams(6, 5, &aom_variance64x32_c),
+  VarianceParams(5, 6, &aom_variance32x64_c),
+  VarianceParams(5, 5, &aom_variance32x32_c),
+  VarianceParams(5, 4, &aom_variance32x16_c),
+  VarianceParams(4, 5, &aom_variance16x32_c),
+  VarianceParams(4, 4, &aom_variance16x16_c),
+  VarianceParams(4, 3, &aom_variance16x8_c),
+  VarianceParams(3, 4, &aom_variance8x16_c),
+  VarianceParams(3, 3, &aom_variance8x8_c),
+  VarianceParams(3, 2, &aom_variance8x4_c),
+  VarianceParams(2, 3, &aom_variance4x8_c),
+  VarianceParams(2, 2, &aom_variance4x4_c),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(6, 4, &aom_variance64x16_c),
+  VarianceParams(4, 6, &aom_variance16x64_c),
+  VarianceParams(5, 3, &aom_variance32x8_c),
+  VarianceParams(3, 5, &aom_variance8x32_c),
+  VarianceParams(4, 2, &aom_variance16x4_c),
+  VarianceParams(2, 4, &aom_variance4x16_c),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxVarianceTest,
+                         ::testing::ValuesIn(kArrayVariance_c));
 
 typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
-
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0)));
+const SubpelVarianceParams kArraySubpelVariance_c[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
+  SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_c));
 
 typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0)));
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_c[] = {
+  SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
+  SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
+  SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
+  SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
+  SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
+  SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
+  SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
+  SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
+  SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
+  SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
+  SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
+  SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
+  SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
+  SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
+  SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
+  SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
+  SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
+  SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
+  SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
+  SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
+  SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelAvgVariance_c));
 
 typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxDistWtdSubpelAvgVarianceTest,
-    ::testing::Values(DistWtdSubpelAvgVarianceParams(
-                          6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_c[] = {
+  DistWtdSubpelAvgVarianceParams(
+      6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
+  DistWtdSubpelAvgVarianceParams(4, 3,
+                                 &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
+  DistWtdSubpelAvgVarianceParams(3, 4,
+                                 &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
+  DistWtdSubpelAvgVarianceParams(3, 3,
+                                 &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
+  DistWtdSubpelAvgVarianceParams(3, 2,
+                                 &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
+  DistWtdSubpelAvgVarianceParams(2, 3,
+                                 &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
+  DistWtdSubpelAvgVarianceParams(2, 2,
+                                 &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
 
-                      DistWtdSubpelAvgVarianceParams(
-                          6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_c,
-                          0)));
+  DistWtdSubpelAvgVarianceParams(
+      6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
+  DistWtdSubpelAvgVarianceParams(5, 3,
+                                 &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
+  DistWtdSubpelAvgVarianceParams(3, 5,
+                                 &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
+  DistWtdSubpelAvgVarianceParams(4, 2,
+                                 &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
+  DistWtdSubpelAvgVarianceParams(2, 4,
+                                 &aom_dist_wtd_sub_pixel_avg_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxDistWtdSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_c));
 
+#if !CONFIG_REALTIME_ONLY
 INSTANTIATE_TEST_SUITE_P(
     C, AvxObmcSubpelVarianceTest,
     ::testing::Values(
@@ -1268,14 +1399,126 @@
         ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_c, 0),
         ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_c, 0),
         ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_c, 0)));
+#endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
+typedef uint64_t (*MseHBDWxH16bitFunc)(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h);
+
+template <typename FunctionType>
+class MseHBDWxHTestClass
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(src_)));
+    dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(dst_)));
+    ASSERT_TRUE(src_ != NULL);
+    ASSERT_TRUE(dst_ != NULL);
+  }
+
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dst_);
+    src_ = NULL;
+    dst_ = NULL;
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RefMatchTestMse();
+  void SpeedTest();
+
+ protected:
+  ACMRandom rnd_;
+  uint16_t *dst_;
+  uint16_t *src_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int d_stride() const { return params_.width; }  // stride is same as width
+  int s_stride() const { return params_.width; }  // stride is same as width
+  int height() const { return params_.height; }
+  int mask() const { return params_.mask; }
+};
+
+template <typename MseHBDWxHFunctionType>
+void MseHBDWxHTestClass<MseHBDWxHFunctionType>::SpeedTest() {
+  aom_usec_timer ref_timer, test_timer;
+  double elapsed_time_c = 0;
+  double elapsed_time_simd = 0;
+  int run_time = 10000000;
+  int w = width();
+  int h = height();
+  int dstride = d_stride();
+  int sstride = s_stride();
+  for (int k = 0; k < block_size(); ++k) {
+    dst_[k] = rnd_.Rand16() & mask();
+    src_[k] = rnd_.Rand16() & mask();
+  }
+  aom_usec_timer_start(&ref_timer);
+  for (int i = 0; i < run_time; i++) {
+    aom_mse_wxh_16bit_highbd_c(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  elapsed_time_c = static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+  aom_usec_timer_start(&test_timer);
+  for (int i = 0; i < run_time; i++) {
+    params_.func(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&test_timer);
+  elapsed_time_simd = static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+  printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%lf\n", width(), height(),
+         elapsed_time_c, elapsed_time_simd,
+         (elapsed_time_c / elapsed_time_simd));
+}
+
+template <typename MseHBDWxHFunctionType>
+void MseHBDWxHTestClass<MseHBDWxHFunctionType>::RefMatchTestMse() {
+  uint64_t mse_ref = 0;
+  uint64_t mse_mod = 0;
+  int w = width();
+  int h = height();
+  int dstride = d_stride();
+  int sstride = s_stride();
+  for (int i = 0; i < 10; i++) {
+    for (int k = 0; k < block_size(); ++k) {
+      dst_[k] = rnd_.Rand16() & mask();
+      src_[k] = rnd_.Rand16() & mask();
+    }
+    ASM_REGISTER_STATE_CHECK(mse_ref = aom_mse_wxh_16bit_highbd_c(
+                                 dst_, dstride, src_, sstride, w, h));
+    ASM_REGISTER_STATE_CHECK(
+        mse_mod = params_.func(dst_, dstride, src_, sstride, w, h));
+    EXPECT_EQ(mse_ref, mse_mod)
+        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+  }
+}
+
+typedef TestParams<MseHBDWxH16bitFunc> MseHBDWxHParams;
+typedef MseHBDWxHTestClass<MseHBDWxH16bitFunc> MseHBDWxHTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MseHBDWxHTest);
 typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AvxHBDMseTest);
 typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+#if !CONFIG_REALTIME_ONLY
 typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
+#endif
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AvxHBDObmcSubpelVarianceTest);
 
+TEST_P(MseHBDWxHTest, RefMse) { RefMatchTestMse(); }
+TEST_P(MseHBDWxHTest, DISABLED_SpeedMse) { SpeedTest(); }
 TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); }
 TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); }
 TEST_P(AvxHBDVarianceTest, Zero) { ZeroTest(); }
@@ -1354,7 +1597,7 @@
   VarianceParams(3, 2, &aom_highbd_8_variance8x4_c, 8),
   VarianceParams(2, 3, &aom_highbd_8_variance4x8_c, 8),
   VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8),
-
+#if !CONFIG_REALTIME_ONLY
   VarianceParams(6, 4, &aom_highbd_12_variance64x16_c, 12),
   VarianceParams(4, 6, &aom_highbd_12_variance16x64_c, 12),
   VarianceParams(5, 3, &aom_highbd_12_variance32x8_c, 12),
@@ -1373,6 +1616,7 @@
   VarianceParams(3, 5, &aom_highbd_8_variance8x32_c, 8),
   VarianceParams(4, 2, &aom_highbd_8_variance16x4_c, 8),
   VarianceParams(2, 4, &aom_highbd_8_variance4x16_c, 8),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDVarianceTest,
                          ::testing::ValuesIn(kArrayHBDVariance_c));
@@ -1435,7 +1679,7 @@
   SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12),
   SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12),
   SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12),
-
+#if !CONFIG_REALTIME_ONLY
   SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_c, 8),
   SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_c, 8),
   SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_c, 8),
@@ -1454,6 +1698,7 @@
   SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_c, 12),
   SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_c, 12),
   SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_c, 12),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelVarianceTest,
                          ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
@@ -1535,6 +1780,7 @@
   SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12),
   SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12),
 
+#if !CONFIG_REALTIME_ONLY
   SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_c, 8),
   SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_c, 8),
   SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_c, 8),
@@ -1565,10 +1811,12 @@
                           12),
   SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_c,
                           12),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelAvgVarianceTest,
                          ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
 
+#if !CONFIG_REALTIME_ONLY
 const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
   ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c,
                            8),
@@ -1687,9 +1935,17 @@
 };
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDObmcSubpelVarianceTest,
                          ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c));
+#endif  // !CONFIG_REALTIME_ONLY
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, MseWxHTest,
+    ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_sse2, 8),
+                      MseWxHParams(3, 2, &aom_mse_wxh_16bit_sse2, 8),
+                      MseWxHParams(2, 3, &aom_mse_wxh_16bit_sse2, 8),
+                      MseWxHParams(2, 2, &aom_mse_wxh_16bit_sse2, 8)));
+
 INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest,
                          ::testing::Values(aom_get_mb_ss_sse2));
 
@@ -1699,90 +1955,103 @@
                                            MseParams(3, 4, &aom_mse8x16_sse2),
                                            MseParams(3, 3, &aom_mse8x8_sse2)));
 
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2),
-                      VarianceParams(7, 6, &aom_variance128x64_sse2),
-                      VarianceParams(6, 7, &aom_variance64x128_sse2),
-                      VarianceParams(6, 6, &aom_variance64x64_sse2),
-                      VarianceParams(6, 5, &aom_variance64x32_sse2),
-                      VarianceParams(6, 4, &aom_variance64x16_sse2),
-                      VarianceParams(5, 6, &aom_variance32x64_sse2),
-                      VarianceParams(5, 5, &aom_variance32x32_sse2),
-                      VarianceParams(5, 4, &aom_variance32x16_sse2),
-                      VarianceParams(5, 3, &aom_variance32x8_sse2),
-                      VarianceParams(4, 6, &aom_variance16x64_sse2),
-                      VarianceParams(4, 5, &aom_variance16x32_sse2),
-                      VarianceParams(4, 4, &aom_variance16x16_sse2),
-                      VarianceParams(4, 3, &aom_variance16x8_sse2),
-                      VarianceParams(4, 2, &aom_variance16x4_sse2),
-                      VarianceParams(3, 5, &aom_variance8x32_sse2),
-                      VarianceParams(3, 4, &aom_variance8x16_sse2),
-                      VarianceParams(3, 3, &aom_variance8x8_sse2),
-                      VarianceParams(3, 2, &aom_variance8x4_sse2),
-                      VarianceParams(2, 4, &aom_variance4x16_sse2),
-                      VarianceParams(2, 3, &aom_variance4x8_sse2),
-                      VarianceParams(2, 2, &aom_variance4x4_sse2)));
+const VarianceParams kArrayVariance_sse2[] = {
+  VarianceParams(7, 7, &aom_variance128x128_sse2),
+  VarianceParams(7, 6, &aom_variance128x64_sse2),
+  VarianceParams(6, 7, &aom_variance64x128_sse2),
+  VarianceParams(6, 6, &aom_variance64x64_sse2),
+  VarianceParams(6, 5, &aom_variance64x32_sse2),
+  VarianceParams(5, 6, &aom_variance32x64_sse2),
+  VarianceParams(5, 5, &aom_variance32x32_sse2),
+  VarianceParams(5, 4, &aom_variance32x16_sse2),
+  VarianceParams(4, 5, &aom_variance16x32_sse2),
+  VarianceParams(4, 4, &aom_variance16x16_sse2),
+  VarianceParams(4, 3, &aom_variance16x8_sse2),
+  VarianceParams(3, 4, &aom_variance8x16_sse2),
+  VarianceParams(3, 3, &aom_variance8x8_sse2),
+  VarianceParams(3, 2, &aom_variance8x4_sse2),
+  VarianceParams(2, 3, &aom_variance4x8_sse2),
+  VarianceParams(2, 2, &aom_variance4x4_sse2),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(6, 4, &aom_variance64x16_sse2),
+  VarianceParams(5, 3, &aom_variance32x8_sse2),
+  VarianceParams(4, 6, &aom_variance16x64_sse2),
+  VarianceParams(4, 2, &aom_variance16x4_sse2),
+  VarianceParams(3, 5, &aom_variance8x32_sse2),
+  VarianceParams(2, 4, &aom_variance4x16_sse2),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxVarianceTest,
+                         ::testing::ValuesIn(kArrayVariance_sse2));
 
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0),
+const SubpelVarianceParams kArraySubpelVariance_sse2[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
+  SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_sse2));
 
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2,
-                                0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2,
-                                0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2,
-                                0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2,
-                                0)));
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_sse2[] = {
+  SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2, 0),
+  SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2, 0),
+  SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2, 0),
+  SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0),
+  SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0),
+  SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0),
+  SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0),
+  SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0),
+  SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0),
+  SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0),
+  SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0),
+  SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0),
+  SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
+  SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
+  SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
+  SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0),
+  SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0),
+  SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0),
+  SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0),
+  SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0),
+  SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelAvgVariance_sse2));
 
 #if CONFIG_AV1_HIGHBITDEPTH
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, MseHBDWxHTest,
+    ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_sse2, 10),
+                      MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_sse2, 10),
+                      MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sse2, 10),
+                      MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sse2,
+                                      10)));
+#endif  // HAVE_SSE2
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxSubpelVarianceTest,
@@ -1865,7 +2134,7 @@
   VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
   VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
   VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8),
-
+#if !CONFIG_REALTIME_ONLY
   VarianceParams(6, 4, &aom_highbd_12_variance64x16_sse2, 12),
   VarianceParams(4, 6, &aom_highbd_12_variance16x64_sse2, 12),
   VarianceParams(5, 3, &aom_highbd_12_variance32x8_sse2, 12),
@@ -1882,14 +2151,23 @@
   VarianceParams(4, 6, &aom_highbd_8_variance16x64_sse2, 8),
   VarianceParams(5, 3, &aom_highbd_8_variance32x8_sse2, 8),
   VarianceParams(3, 5, &aom_highbd_8_variance8x32_sse2, 8),
-  // VarianceParams(4, 2, &aom_highbd_8_variance16x4_sse2, 8),
-  // VarianceParams(2, 4, &aom_highbd_8_variance4x16_sse2, 8),
+// VarianceParams(4, 2, &aom_highbd_8_variance16x4_sse2, 8),
+// VarianceParams(2, 4, &aom_highbd_8_variance4x16_sse2, 8),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDVarianceTest,
                          ::testing::ValuesIn(kArrayHBDVariance_sse2));
 
 #if HAVE_AVX2
 
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, MseHBDWxHTest,
+    ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_avx2, 10),
+                      MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_avx2, 10),
+                      MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_avx2, 10),
+                      MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_avx2,
+                                      10)));
+
 const VarianceParams kArrayHBDVariance_avx2[] = {
   VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10),
   VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10),
@@ -1908,6 +2186,25 @@
 
 INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest,
                          ::testing::ValuesIn(kArrayHBDVariance_avx2));
+
+const SubpelVarianceParams kArrayHBDSubpelVariance_avx2[] = {
+  SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_avx2, 10),
+  SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_avx2, 10),
+  SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_avx2, 10),
+  SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_avx2, 10),
+  SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_avx2, 10),
+  SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_avx2, 10),
+  SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_avx2, 10),
+  SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_avx2, 10),
+  SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_avx2, 10),
+  SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_avx2, 10),
+  SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_avx2, 10),
+  SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_avx2, 10),
+  SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_avx2, 10),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDSubpelVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDSubpelVariance_avx2));
 #endif  // HAVE_AVX2
 
 const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
@@ -1953,7 +2250,7 @@
   SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_sse2, 8),
   SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8),
   SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8),
-
+#if !CONFIG_REALTIME_ONLY
   SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_sse2, 12),
   SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_sse2, 12),
   SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_sse2, 12),
@@ -1971,7 +2268,8 @@
   SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_sse2, 8),
   SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_sse2, 8),
   SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_sse2, 8),
-  // SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_sse2, 8),
+// SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_sse2, 8),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelVarianceTest,
                          ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));
@@ -2044,6 +2342,7 @@
   SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_sse2,
                           8),
 
+#if !CONFIG_REALTIME_ONLY
   SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_sse2,
                           12),
   SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_sse2,
@@ -2078,8 +2377,9 @@
                           8),
   SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_sse2,
                           8),
-  // SubpelAvgVarianceParams(2, 4,
-  // &aom_highbd_8_sub_pixel_avg_variance4x16_sse2, 8),
+// SubpelAvgVarianceParams(2, 4,
+// &aom_highbd_8_sub_pixel_avg_variance4x16_sse2, 8),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelAvgVarianceTest,
@@ -2088,124 +2388,119 @@
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0),
+const SubpelVarianceParams kArraySubpelVariance_ssse3[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0),
+  SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSSE3, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_ssse3));
 
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0)));
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_ssse3[] = {
+  SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3, 0),
+  SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3, 0),
+  SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3, 0),
+  SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3, 0),
+  SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3, 0),
+  SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3, 0),
+  SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3, 0),
+  SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3, 0),
+  SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3, 0),
+  SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3, 0),
+  SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0),
+  SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0),
+  SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0),
+  SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0),
+  SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
+  SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3, 0),
+  SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3, 0),
+  SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0),
+  SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0),
+  SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0),
+  SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSSE3, AvxSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelAvgVariance_ssse3));
 
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3,
-                                0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3,
-                                0)));
-
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_ssse3[] = {
+  DistWtdSubpelAvgVarianceParams(
+      7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 7, &aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY
+  DistWtdSubpelAvgVarianceParams(
+      6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0),
+#endif
+};
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, AvxDistWtdSubpelAvgVarianceTest,
-    ::testing::Values(
-        DistWtdSubpelAvgVarianceParams(
-            7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 7, &aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0),
-
-        DistWtdSubpelAvgVarianceParams(
-            6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0)));
+    ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_ssse3));
 #endif  // HAVE_SSSE3
 
 #if HAVE_SSE4_1
+#if !CONFIG_REALTIME_ONLY
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxObmcSubpelVarianceTest,
     ::testing::Values(
@@ -2241,7 +2536,6 @@
                                  0),
         ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1,
                                  0),
-
         ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_sse4_1,
                                  0),
         ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_sse4_1,
@@ -2254,47 +2548,64 @@
                                  0),
         ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_sse4_1,
                                  0)));
+#endif
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, MseWxHTest,
+    ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_avx2, 8),
+                      MseWxHParams(3, 2, &aom_mse_wxh_16bit_avx2, 8),
+                      MseWxHParams(2, 3, &aom_mse_wxh_16bit_avx2, 8),
+                      MseWxHParams(2, 2, &aom_mse_wxh_16bit_avx2, 8)));
+
 INSTANTIATE_TEST_SUITE_P(AVX2, AvxMseTest,
                          ::testing::Values(MseParams(4, 4,
                                                      &aom_mse16x16_avx2)));
 
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2),
-                      VarianceParams(7, 6, &aom_variance128x64_avx2),
-                      VarianceParams(6, 7, &aom_variance64x128_avx2),
-                      VarianceParams(6, 6, &aom_variance64x64_avx2),
-                      VarianceParams(6, 5, &aom_variance64x32_avx2),
-                      VarianceParams(6, 4, &aom_variance64x16_avx2),
-                      VarianceParams(5, 6, &aom_variance32x64_avx2),
-                      VarianceParams(5, 5, &aom_variance32x32_avx2),
-                      VarianceParams(5, 4, &aom_variance32x16_avx2),
-                      VarianceParams(5, 3, &aom_variance32x8_avx2),
-                      VarianceParams(4, 6, &aom_variance16x64_avx2),
-                      VarianceParams(4, 5, &aom_variance16x32_avx2),
-                      VarianceParams(4, 4, &aom_variance16x16_avx2),
-                      VarianceParams(4, 3, &aom_variance16x8_avx2),
-                      VarianceParams(4, 2, &aom_variance16x4_avx2)));
+const VarianceParams kArrayVariance_avx2[] = {
+  VarianceParams(7, 7, &aom_variance128x128_avx2),
+  VarianceParams(7, 6, &aom_variance128x64_avx2),
+  VarianceParams(6, 7, &aom_variance64x128_avx2),
+  VarianceParams(6, 6, &aom_variance64x64_avx2),
+  VarianceParams(6, 5, &aom_variance64x32_avx2),
+  VarianceParams(5, 6, &aom_variance32x64_avx2),
+  VarianceParams(5, 5, &aom_variance32x32_avx2),
+  VarianceParams(5, 4, &aom_variance32x16_avx2),
+  VarianceParams(4, 5, &aom_variance16x32_avx2),
+  VarianceParams(4, 4, &aom_variance16x16_avx2),
+  VarianceParams(4, 3, &aom_variance16x8_avx2),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(6, 4, &aom_variance64x16_avx2),
+  VarianceParams(4, 6, &aom_variance16x64_avx2),
+  VarianceParams(5, 3, &aom_variance32x8_avx2),
+  VarianceParams(4, 2, &aom_variance16x4_avx2),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxVarianceTest,
+                         ::testing::ValuesIn(kArrayVariance_avx2));
 
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0)));
+const SubpelVarianceParams kArraySubpelVariance_avx2[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
+
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_avx2));
 
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AvxSubpelAvgVarianceTest,
@@ -2326,21 +2637,50 @@
     NEON, AvxVarianceTest,
     ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_neon),
                       VarianceParams(6, 6, &aom_variance64x64_neon),
+                      VarianceParams(7, 6, &aom_variance128x64_neon),
+                      VarianceParams(6, 7, &aom_variance64x128_neon),
+                      VarianceParams(6, 6, &aom_variance64x64_neon),
                       VarianceParams(6, 5, &aom_variance64x32_neon),
                       VarianceParams(5, 6, &aom_variance32x64_neon),
                       VarianceParams(5, 5, &aom_variance32x32_neon),
+                      VarianceParams(5, 4, &aom_variance32x16_neon),
+                      VarianceParams(4, 5, &aom_variance16x32_neon),
                       VarianceParams(4, 4, &aom_variance16x16_neon),
                       VarianceParams(4, 3, &aom_variance16x8_neon),
                       VarianceParams(3, 4, &aom_variance8x16_neon),
-                      VarianceParams(3, 3, &aom_variance8x8_neon)));
+                      VarianceParams(3, 3, &aom_variance8x8_neon),
+                      VarianceParams(3, 2, &aom_variance8x4_neon),
+                      VarianceParams(2, 3, &aom_variance4x8_neon),
+                      VarianceParams(2, 2, &aom_variance4x4_neon)));
 
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0)));
+const SubpelVarianceParams kArraySubpelVariance_neon[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_neon, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_neon, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_neon, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_neon, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_neon, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_neon, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_neon, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_neon, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_neon, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_neon, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_neon, 0),
+  SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_neon, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_neon, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_neon, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_neon, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_neon, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_neon, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_neon, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_neon));
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index c5e87f0..1d9dd45 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -56,6 +56,13 @@
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1WarpFilterTest,
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1HighbdWarpFilterTest,
+    libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_avx2));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index bcb0c18..07a2e3f 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -97,9 +97,9 @@
 ::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
     warp_affine_func filter) {
   WarpTestParam params[] = {
-    make_tuple(4, 4, 50000, filter),  make_tuple(8, 8, 50000, filter),
-    make_tuple(64, 64, 1000, filter), make_tuple(4, 16, 20000, filter),
-    make_tuple(32, 8, 10000, filter),
+    make_tuple(4, 4, 5000, filter),  make_tuple(8, 8, 5000, filter),
+    make_tuple(64, 64, 100, filter), make_tuple(4, 16, 2000, filter),
+    make_tuple(32, 8, 1000, filter),
   };
   return ::testing::Combine(::testing::ValuesIn(params),
                             ::testing::Values(0, 1), ::testing::Values(0, 1),
diff --git a/test/webmenc_test.cc b/test/webmenc_test.cc
new file mode 100644
index 0000000..acd795f
--- /dev/null
+++ b/test/webmenc_test.cc
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include "common/webmenc.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+#if CONFIG_WEBM_IO
+
+class WebmencTest : public ::testing::Test {};
+
+// All of these variations on output should be identical.
+TEST(WebmencTest, ExtractEncoderSettingsOutput1) {
+  const char *argv[] = { "aomenc", "-o", "output", "input",
+                         "--target-bitrate=300" };
+  int argc = 5;
+  const std::string expected("version:1.2.3 --target-bitrate=300");
+  char *result = extract_encoder_settings("1.2.3", argv, argc, "input");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsOutput2) {
+  const char *argv[] = { "aomenc", "--output", "bar", "foo", "--cpu-used=3" };
+  int argc = 5;
+  const std::string expected("version:abc --cpu-used=3");
+  char *result = extract_encoder_settings("abc", argv, argc, "foo");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsOutput3) {
+  const char *argv[] = { "aomenc", "--cq-level=63", "--end-usage=q",
+                         "--output=foo", "baz" };
+  int argc = 5;
+  const std::string expected("version:23 --cq-level=63 --end-usage=q");
+  char *result = extract_encoder_settings("23", argv, argc, "baz");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsInput) {
+  // Check that input filename is filtered regardless of position.
+  const char *argv[] = { "aomenc", "-o", "out", "input", "-p", "2" };
+  int argc = 6;
+  const char version[] = "1.0.0";
+  const std::string expected("version:1.0.0 -p 2");
+  char *result = extract_encoder_settings(version, argv, argc, "input");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+
+  const char *argv2[] = { "aomenc", "input", "-o", "out", "-p", "2" };
+  result = extract_encoder_settings(version, argv2, argc, "input");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+}
+
+#endif  // CONFIG_WEBM_IO
+}  // namespace
diff --git a/third_party/googletest/README.libaom b/third_party/googletest/README.libaom
index 9b8a863..7b151af 100644
--- a/third_party/googletest/README.libaom
+++ b/third_party/googletest/README.libaom
@@ -1,5 +1,5 @@
 URL: https://github.com/google/googletest
-Version: 1.10.x
+Version: release-1.10.0-224-g23b2a3b1
 License: BSD
 License File: LICENSE
 
@@ -12,6 +12,14 @@
 generation.
 
 Local Modifications:
-- Replace everything in:
-  third_party/googletest/src/googletest/src/
-  third_party/googletest/src/googletest/include/
+- Remove everything but:
+  googletest/
+   cmake
+   CMakeLists.txt
+   CONTRIBUTORS
+   include
+   LICENSE
+   README.md
+   src
+- Enable kErrorOnUninstantiatedParameterizedTest and
+  kErrorOnUninstantiatedTypeParameterizedTest in gtest.cc
diff --git a/third_party/googletest/src/googletest/CHANGES b/third_party/googletest/src/googletest/CHANGES
deleted file mode 100644
index 0552132..0000000
--- a/third_party/googletest/src/googletest/CHANGES
+++ /dev/null
@@ -1,157 +0,0 @@
-Changes for 1.7.0:
-
-* New feature: death tests are supported on OpenBSD and in iOS
-  simulator now.
-* New feature: Google Test now implements a protocol to allow
-  a test runner to detect that a test program has exited
-  prematurely and report it as a failure (before it would be
-  falsely reported as a success if the exit code is 0).
-* New feature: Test::RecordProperty() can now be used outside of the
-  lifespan of a test method, in which case it will be attributed to
-  the current test case or the test program in the XML report.
-* New feature (potentially breaking): --gtest_list_tests now prints
-  the type parameters and value parameters for each test.
-* Improvement: char pointers and char arrays are now escaped properly
-  in failure messages.
-* Improvement: failure summary in XML reports now includes file and
-  line information.
-* Improvement: the <testsuites> XML element now has a timestamp attribute.
-* Improvement: When --gtest_filter is specified, XML report now doesn't
-  contain information about tests that are filtered out.
-* Fixed the bug where long --gtest_filter flag values are truncated in
-  death tests.
-* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a
-  function instead of a macro in order to work better with Clang.
-* Compatibility fixes with C++ 11 and various platforms.
-* Bug/warning fixes.
-
-Changes for 1.6.0:
-
-* New feature: ADD_FAILURE_AT() for reporting a test failure at the
-  given source location -- useful for writing testing utilities.
-* New feature: the universal value printer is moved from Google Mock
-  to Google Test.
-* New feature: type parameters and value parameters are reported in
-  the XML report now.
-* A gtest_disable_pthreads CMake option.
-* Colored output works in GNU Screen sessions now.
-* Parameters of value-parameterized tests are now printed in the
-  textual output.
-* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are
-  now correctly reported.
-* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to
-  ostream.
-* More complete handling of exceptions.
-* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter
-  name is already used by another library.
-* --gtest_catch_exceptions is now true by default, allowing a test
-  program to continue after an exception is thrown.
-* Value-parameterized test fixtures can now derive from Test and
-  WithParamInterface<T> separately, easing conversion of legacy tests.
-* Death test messages are clearly marked to make them more
-  distinguishable from other messages.
-* Compatibility fixes for Android, Google Native Client, MinGW, HP UX,
-  PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear),
-  IBM XL C++ (Visual Age C++), and C++0x.
-* Bug fixes and implementation clean-ups.
-* Potentially incompatible changes: disables the harmful 'make install'
-  command in autotools.
-
-Changes for 1.5.0:
-
- * New feature: assertions can be safely called in multiple threads
-   where the pthreads library is available.
- * New feature: predicates used inside EXPECT_TRUE() and friends
-   can now generate custom failure messages.
- * New feature: Google Test can now be compiled as a DLL.
- * New feature: fused source files are included.
- * New feature: prints help when encountering unrecognized Google Test flags.
- * Experimental feature: CMake build script (requires CMake 2.6.4+).
- * Experimental feature: the Pump script for meta programming.
- * double values streamed to an assertion are printed with enough precision
-   to differentiate any two different values.
- * Google Test now works on Solaris and AIX.
- * Build and test script improvements.
- * Bug fixes and implementation clean-ups.
-
- Potentially breaking changes:
-
- * Stopped supporting VC++ 7.1 with exceptions disabled.
- * Dropped support for 'make install'.
-
-Changes for 1.4.0:
-
- * New feature: the event listener API
- * New feature: test shuffling
- * New feature: the XML report format is closer to junitreport and can
-   be parsed by Hudson now.
- * New feature: when a test runs under Visual Studio, its failures are
-   integrated in the IDE.
- * New feature: /MD(d) versions of VC++ projects.
- * New feature: elapsed time for the tests is printed by default.
- * New feature: comes with a TR1 tuple implementation such that Boost
-   is no longer needed for Combine().
- * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends.
- * New feature: the Xcode project can now produce static gtest
-   libraries in addition to a framework.
- * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile,
-   Symbian, gcc, and C++Builder.
- * Bug fixes and implementation clean-ups.
-
-Changes for 1.3.0:
-
- * New feature: death tests on Windows, Cygwin, and Mac.
- * New feature: ability to use Google Test assertions in other testing
-   frameworks.
- * New feature: ability to run disabled test via
-   --gtest_also_run_disabled_tests.
- * New feature: the --help flag for printing the usage.
- * New feature: access to Google Test flag values in user code.
- * New feature: a script that packs Google Test into one .h and one
-   .cc file for easy deployment.
- * New feature: support for distributing test functions to multiple
-   machines (requires support from the test runner).
- * Bug fixes and implementation clean-ups.
-
-Changes for 1.2.1:
-
- * Compatibility fixes for Linux IA-64 and IBM z/OS.
- * Added support for using Boost and other TR1 implementations.
- * Changes to the build scripts to support upcoming release of Google C++
-   Mocking Framework.
- * Added Makefile to the distribution package.
- * Improved build instructions in README.
-
-Changes for 1.2.0:
-
- * New feature: value-parameterized tests.
- * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS)
-   macros.
- * Changed the XML report format to match JUnit/Ant's.
- * Added tests to the Xcode project.
- * Added scons/SConscript for building with SCons.
- * Added src/gtest-all.cc for building Google Test from a single file.
- * Fixed compatibility with Solaris and z/OS.
- * Enabled running Python tests on systems with python 2.3 installed,
-   e.g. Mac OS X 10.4.
- * Bug fixes.
-
-Changes for 1.1.0:
-
- * New feature: type-parameterized tests.
- * New feature: exception assertions.
- * New feature: printing elapsed time of tests.
- * Improved the robustness of death tests.
- * Added an Xcode project and samples.
- * Adjusted the output format on Windows to be understandable by Visual Studio.
- * Minor bug fixes.
-
-Changes for 1.0.1:
-
- * Added project files for Visual Studio 7.1.
- * Fixed issues with compiling on Mac OS X.
- * Fixed issues with compiling on Cygwin.
-
-Changes for 1.0.0:
-
- * Initial Open Source release of Google Test
diff --git a/third_party/googletest/src/googletest/CONTRIBUTORS b/third_party/googletest/src/googletest/CONTRIBUTORS
index feae2fc..1e4afe2 100644
--- a/third_party/googletest/src/googletest/CONTRIBUTORS
+++ b/third_party/googletest/src/googletest/CONTRIBUTORS
@@ -17,6 +17,7 @@
 Keir Mierle <mierle@gmail.com>
 Keith Ray <keith.ray@gmail.com>
 Kenton Varda <kenton@google.com>
+Krystian Kuzniarek <krystian.kuzniarek@gmail.com>
 Manuel Klimek <klimek@google.com>
 Markus Heule <markus.heule@gmail.com>
 Mika Raento <mikie@iki.fi>
diff --git a/third_party/googletest/src/googletest/README.md b/third_party/googletest/src/googletest/README.md
index e30fe80..904048f 100644
--- a/third_party/googletest/src/googletest/README.md
+++ b/third_party/googletest/src/googletest/README.md
@@ -6,51 +6,10 @@
 system where to find its headers and source files. The exact way to do it
 depends on which build system you use, and is usually straightforward.
 
-#### Build
+### Build with CMake
 
-Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, create a
-library build target (or a project as called by Visual Studio and Xcode) to
-compile
-
-    ${GTEST_DIR}/src/gtest-all.cc
-
-with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}`
-in the normal header search path. Assuming a Linux-like system and gcc,
-something like the following will do:
-
-    g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \
-        -pthread -c ${GTEST_DIR}/src/gtest-all.cc
-    ar -rv libgtest.a gtest-all.o
-
-(We need `-pthread` as Google Test uses threads.)
-
-Next, you should compile your test source file with `${GTEST_DIR}/include` in
-the system header search path, and link it with gtest and any other necessary
-libraries:
-
-    g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \
-        -o your_test
-
-As an example, the make/ directory contains a Makefile that you can use to build
-Google Test on systems where GNU make is available (e.g. Linux, Mac OS X, and
-Cygwin). It doesn't try to build Google Test's own tests. Instead, it just
-builds the Google Test library and a sample test. You can use it as a starting
-point for your own build script.
-
-If the default settings are correct for your environment, the following commands
-should succeed:
-
-    cd ${GTEST_DIR}/make
-    make
-    ./sample1_unittest
-
-If you see errors, try to tweak the contents of `make/Makefile` to make them go
-away. There are instructions in `make/Makefile` on how to do it.
-
-### Using CMake
-
-Google Test comes with a CMake build script (
-[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
+Google Test comes with a CMake build script
+([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
 that can be used on a wide range of platforms ("C" stands for cross-platform.).
 If you don't have CMake installed already, you can download it for free from
 <http://www.cmake.org/>.
@@ -115,60 +74,64 @@
 
 New file `CMakeLists.txt.in`:
 
-    cmake_minimum_required(VERSION 2.8.2)
+```cmake
+cmake_minimum_required(VERSION 2.8.2)
 
-    project(googletest-download NONE)
+project(googletest-download NONE)
 
-    include(ExternalProject)
-    ExternalProject_Add(googletest
-      GIT_REPOSITORY    https://github.com/google/googletest.git
-      GIT_TAG           master
-      SOURCE_DIR        "${CMAKE_BINARY_DIR}/googletest-src"
-      BINARY_DIR        "${CMAKE_BINARY_DIR}/googletest-build"
-      CONFIGURE_COMMAND ""
-      BUILD_COMMAND     ""
-      INSTALL_COMMAND   ""
-      TEST_COMMAND      ""
-    )
+include(ExternalProject)
+ExternalProject_Add(googletest
+  GIT_REPOSITORY    https://github.com/google/googletest.git
+  GIT_TAG           master
+  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
+  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
+```
 
 Existing build's `CMakeLists.txt`:
 
-    # Download and unpack googletest at configure time
-    configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
-    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
-      RESULT_VARIABLE result
-      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download )
-    if(result)
-      message(FATAL_ERROR "CMake step for googletest failed: ${result}")
-    endif()
-    execute_process(COMMAND ${CMAKE_COMMAND} --build .
-      RESULT_VARIABLE result
-      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download )
-    if(result)
-      message(FATAL_ERROR "Build step for googletest failed: ${result}")
-    endif()
+```cmake
+# Download and unpack googletest at configure time
+configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+execute_process(COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
 
-    # Prevent overriding the parent project's compiler/linker
-    # settings on Windows
-    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
 
-    # Add googletest directly to our build. This defines
-    # the gtest and gtest_main targets.
-    add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src
-                     ${CMAKE_BINARY_DIR}/googletest-build
-                     EXCLUDE_FROM_ALL)
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
+                 ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
+                 EXCLUDE_FROM_ALL)
 
-    # The gtest/gtest_main targets carry header search path
-    # dependencies automatically when using CMake 2.8.11 or
-    # later. Otherwise we have to add them here ourselves.
-    if (CMAKE_VERSION VERSION_LESS 2.8.11)
-      include_directories("${gtest_SOURCE_DIR}/include")
-    endif()
+# The gtest/gtest_main targets carry header search path
+# dependencies automatically when using CMake 2.8.11 or
+# later. Otherwise we have to add them here ourselves.
+if (CMAKE_VERSION VERSION_LESS 2.8.11)
+  include_directories("${gtest_SOURCE_DIR}/include")
+endif()
 
-    # Now simply link against gtest or gtest_main as needed. Eg
-    add_executable(example example.cpp)
-    target_link_libraries(example gtest_main)
-    add_test(NAME example_test COMMAND example)
+# Now simply link against gtest or gtest_main as needed. Eg
+add_executable(example example.cpp)
+target_link_libraries(example gtest_main)
+add_test(NAME example_test COMMAND example)
+```
 
 Note that this approach requires CMake 2.8.2 or later due to its use of the
 `ExternalProject_Add()` command. The above technique is discussed in more detail
@@ -188,47 +151,14 @@
 Enabling this option will make gtest link the runtimes dynamically too, and
 match the project in which it is included.
 
-### Legacy Build Scripts
+#### C++ Standard Version
 
-Before settling on CMake, we have been providing hand-maintained build
-projects/scripts for Visual Studio, Xcode, and Autotools. While we continue to
-provide them for convenience, they are not actively maintained any more. We
-highly recommend that you follow the instructions in the above sections to
-integrate Google Test with your existing build system.
-
-If you still need to use the legacy build scripts, here's how:
-
-The msvc\ folder contains two solutions with Visual C++ projects. Open the
-`gtest.sln` or `gtest-md.sln` file using Visual Studio, and you are ready to
-build Google Test the same way you build any Visual Studio project. Files that
-have names ending with -md use DLL versions of Microsoft runtime libraries (the
-/MD or the /MDd compiler option). Files without that suffix use static versions
-of the runtime libraries (the /MT or the /MTd option). Please note that one must
-use the same option to compile both gtest and the test code. If you use Visual
-Studio 2005 or above, we recommend the -md version as /MD is the default for new
-projects in these versions of Visual Studio.
-
-On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using Xcode.
-Build the "gtest" target. The universal binary framework will end up in your
-selected build directory (selected in the Xcode "Preferences..." -> "Building"
-pane and defaults to xcode/build). Alternatively, at the command line, enter:
-
-    xcodebuild
-
-This will build the "Release" configuration of gtest.framework in your default
-build location. See the "xcodebuild" man page for more information about
-building different configurations and building in different locations.
-
-If you wish to use the Google Test Xcode project with Xcode 4.x and above, you
-need to either:
-
-*   update the SDK configuration options in xcode/Config/General.xconfig.
-    Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If
-    you choose this route you lose the ability to target earlier versions of
-    MacOS X.
-*   Install an SDK for an earlier version. This doesn't appear to be supported
-    by Apple, but has been reported to work
-    (http://stackoverflow.com/questions/5378518).
+An environment that supports C++11 is required in order to successfully build
+Google Test. One way to ensure this is to specify the standard in the top-level
+project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this
+is not feasible, for example in a C project using Google Test for validation,
+then it can be specified by adding it to the options for cmake via the
+`DCMAKE_CXX_FLAGS` option.
 
 ### Tweaking Google Test
 
@@ -239,41 +169,14 @@
 them to either 1 or 0 to enable or disable a certain feature.
 
 We list the most frequently used macros below. For a complete list, see file
-[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/include/gtest/internal/gtest-port.h).
-
-### Choosing a TR1 Tuple Library
-
-Some Google Test features require the C++ Technical Report 1 (TR1) tuple
-library, which is not yet available with all compilers. The good news is that
-Google Test implements a subset of TR1 tuple that's enough for its own need, and
-will automatically use this when the compiler doesn't provide TR1 tuple.
-
-Usually you don't need to care about which tuple library Google Test uses.
-However, if your project already uses TR1 tuple, you need to tell Google Test to
-use the same TR1 tuple library the rest of your project uses, or the two tuple
-implementations will clash. To do that, add
-
-    -DGTEST_USE_OWN_TR1_TUPLE=0
-
-to the compiler flags while compiling Google Test and your tests. If you want to
-force Google Test to use its own tuple library, just add
-
-    -DGTEST_USE_OWN_TR1_TUPLE=1
-
-to the compiler flags instead.
-
-If you don't want Google Test to use tuple at all, add
-
-    -DGTEST_HAS_TR1_TUPLE=0
-
-and all features using tuple will be disabled.
+[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/googletest/include/gtest/internal/gtest-port.h).
 
 ### Multi-threaded Tests
 
 Google Test is thread-safe where the pthread library is available. After
-`#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see
-whether this is the case (yes if the macro is `#defined` to 1, no if it's
-undefined.).
+`#include "gtest/gtest.h"`, you can check the
+`GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is
+`#defined` to 1, no if it's undefined.).
 
 If Google Test doesn't correctly detect whether pthread is available in your
 environment, you can force it with
diff --git a/third_party/googletest/src/googletest/src/gtest.cc b/third_party/googletest/src/googletest/src/gtest.cc
index 5b4037f..021c82e 100644
--- a/third_party/googletest/src/googletest/src/gtest.cc
+++ b/third_party/googletest/src/googletest/src/gtest.cc
@@ -400,8 +400,8 @@
 // inserted to report ether an error or a log message.
 //
 // This configuration bit will likely be removed at some point.
-constexpr bool kErrorOnUninstantiatedParameterizedTest = false;
-constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = false;
+constexpr bool kErrorOnUninstantiatedParameterizedTest = true;
+constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = true;
 
 // A test that fails at a given file/line location with a given message.
 class FailureTest : public Test {
diff --git a/third_party/x86inc/README.libaom b/third_party/x86inc/README.libaom
index 07c4dad..2f3e5c2 100644
--- a/third_party/x86inc/README.libaom
+++ b/third_party/x86inc/README.libaom
@@ -1,5 +1,5 @@
 URL: https://git.videolan.org/git/x264.git
-Version: d23d18655249944c1ca894b451e2c82c7a584c62
+Version: 3e5aed95cc470f37e2db3e6506a8deb89b527720
 License: ISC
 License File: LICENSE
 
@@ -8,13 +8,11 @@
 defines that help automatically allow assembly to work cross-platform.
 
 Local Modifications:
-Get configuration from aom_config.asm.
+Get configuration from config/aom_config.asm.
 Prefix functions with aom by default.
 Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
   exist in libaom.
-Expand PIC default to macho64 and respect CONFIG_PIC from libaom
-Set 'private_extern' visibility for macho targets.
 Copy PIC 'GLOBAL' macros from x86_abi_support.asm
 Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
-Use .text with no alignment for aout
-Only use 'hidden' visibility with Chromium
+Use .text with no alignment for aout.
+Only use 'hidden' visibility with Chromium.
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index adaf2d9..e48d644 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -1,12 +1,12 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
+;* Copyright (C) 2005-2019 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Henrik Gramner <henrik@gramner.com>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
 ;*          Fiona Glaser <fiona@x264.com>
-;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
 ;* purpose with or without fee is hereby granted, provided that the above
@@ -67,19 +67,19 @@
 %endif
 
 %define FORMAT_ELF 0
+%define FORMAT_MACHO 0
 %ifidn __OUTPUT_FORMAT__,elf
     %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf32
     %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf64
     %define FORMAT_ELF 1
-%endif
-
-%define FORMAT_MACHO 0
-%ifidn __OUTPUT_FORMAT__,macho32
-     %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho
+    %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho32
+    %define FORMAT_MACHO 1
 %elifidn __OUTPUT_FORMAT__,macho64
-     %define FORMAT_MACHO 1
+    %define FORMAT_MACHO 1
 %endif
 
 ; Set PREFIX for libaom builds.
@@ -103,7 +103,11 @@
 ; works around the issue. It appears to be specific to the way libaom
 ; handles the tables.
 %macro SECTION_RODATA 0-1 16
-    %ifidn __OUTPUT_FORMAT__,macho32
+    %ifidn __OUTPUT_FORMAT__,win32
+        SECTION .rdata align=%1
+    %elif WIN64
+        SECTION .rdata align=%1
+    %elifidn __OUTPUT_FORMAT__,macho32
         SECTION .text align=%1
         fakegot:
     %elifidn __OUTPUT_FORMAT__,aout
@@ -113,8 +117,7 @@
     %endif
 %endmacro
 
-; PIC macros are copied from aom_ports/x86_abi_support.asm. The "define PIC"
-; from original code is added in for 64bit.
+; PIC macros from aom_ports/x86_abi_support.asm.
 %ifidn __OUTPUT_FORMAT__,elf32
 %define ABI_IS_32BIT 1
 %elifidn __OUTPUT_FORMAT__,macho32
@@ -203,10 +206,24 @@
 %ifndef GET_GOT_DEFINED
     %define GET_GOT_DEFINED 0
 %endif
-; Done with PIC macros
+; End PIC macros from aom_ports/x86_abi_support.asm.
+
+; libaom explicitly sets visibilty in shared object builds. Avoid setting
+; visibility to hidden as it may break builds that split sources on e.g.,
+; directory boundaries.
+%ifdef CHROMIUM
+    %define VISIBILITY hidden
+    %define HAVE_PRIVATE_EXTERN 1
+%else
+    %define VISIBILITY
+    %define HAVE_PRIVATE_EXTERN 0
+%endif
 
 %ifdef __NASM_VER__
     %use smartalign
+    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+        %define HAVE_PRIVATE_EXTERN 0
+    %endif
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -324,6 +341,18 @@
     %define gprsize 4
 %endif
 
+%macro LEA 2
+%if ARCH_X86_64
+    lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPU:s
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
+
 %macro PUSH 1
     push %1
     %ifidn rstk, rsp
@@ -385,6 +414,10 @@
     %endif
 %endmacro
 
+%if ARCH_X86_64 == 0
+    %define movsxd movifnidn
+%endif
+
 %macro movsxdifnidn 2
     %ifnidn %1, %2
         movsxd %1, %2
@@ -433,6 +466,8 @@
 %endmacro
 
 %define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
 
 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
     %ifnum %1
@@ -483,10 +518,18 @@
     %ifnum %1
         %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
             %if %1 > 0
+                ; Reserve an additional register for storing the original stack pointer, but avoid using
+                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                 %assign regs_used (regs_used + 1)
+                %if ARCH_X86_64 && regs_used == 7
+                    %assign regs_used 8
+                %elif ARCH_X86_64 == 0 && regs_used == 1
+                    %assign regs_used 2
+                %endif
             %endif
             %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
-                ; Ensure that we don't clobber any registers containing arguments
+                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                 %assign regs_used 5 + UNIX64 * 3
             %endif
         %endif
@@ -516,10 +559,10 @@
 DECLARE_REG 8,  rsi, 72
 DECLARE_REG 9,  rbx, 80
 DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
 
 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
@@ -538,15 +581,16 @@
 
 %macro WIN64_PUSH_XMM 0
     ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
         movaps [rstk + stack_offset +  8], xmm6
     %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
         movaps [rstk + stack_offset + 24], xmm7
     %endif
-    %if xmm_regs_used > 8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         %assign %%i 8
-        %rep xmm_regs_used-8
+        %rep %%xmm_regs_on_stack
             movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
             %assign %%i %%i+1
         %endrep
@@ -555,53 +599,56 @@
 
 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
-    ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 8
+    ASSERT xmm_regs_used <= 16 + high_mm_regs
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
-        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign %%pad %%xmm_regs_on_stack*16 + 32
         %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
         SUB rsp, stack_size_padded
     %endif
     WIN64_PUSH_XMM
 %endmacro
 
-%macro WIN64_RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 0
     %assign %%pad_size 0
-    %if xmm_regs_used > 8
-        %assign %%i xmm_regs_used
-        %rep xmm_regs_used-8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        %assign %%i xmm_regs_used - high_mm_regs
+        %rep %%xmm_regs_on_stack
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
         %endrep
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
             mov rsp, rstkm
         %else
-            add %1, stack_size_padded
+            add rsp, stack_size_padded
             %assign %%pad_size stack_size_padded
         %endif
     %endif
-    %if xmm_regs_used > 7
-        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %if xmm_regs_used > 7 + high_mm_regs
+        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
     %endif
-    %if xmm_regs_used > 6
-        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+    %if xmm_regs_used > 6 + high_mm_regs
+        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
     %endif
 %endmacro
 
-%macro WIN64_RESTORE_XMM 1
-    WIN64_RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 0
+    WIN64_RESTORE_XMM_INTERNAL
     %assign stack_offset (stack_offset-stack_size_padded)
+    %assign stack_size_padded 0
     %assign xmm_regs_used 0
 %endmacro
 
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
 
 %macro RET 0
-    WIN64_RESTORE_XMM_INTERNAL rsp
+    WIN64_RESTORE_XMM_INTERNAL
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -620,14 +667,15 @@
 DECLARE_REG 8,  R11, 24
 DECLARE_REG 9,  rbx, 32
 DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
+    %assign xmm_regs_used %3
     ASSERT regs_used >= num_args
     SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
@@ -637,7 +685,7 @@
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -648,7 +696,7 @@
         %endif
     %endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -693,7 +741,7 @@
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -704,7 +752,7 @@
         %endif
     %endif
     POP_IF_USED 6, 5, 4, 3
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -715,7 +763,7 @@
 %if WIN64 == 0
     %macro WIN64_SPILL_XMM 1
     %endmacro
-    %macro WIN64_RESTORE_XMM 1
+    %macro WIN64_RESTORE_XMM 0
     %endmacro
     %macro WIN64_PUSH_XMM 0
     %endmacro
@@ -726,7 +774,7 @@
 ; We can automatically detect "follows a branch", but not a branch target.
 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
 %macro REP_RET 0
-    %if has_epilogue
+    %if has_epilogue || cpuflag(ssse3)
         RET
     %else
         rep ret
@@ -758,7 +806,7 @@
 
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
 
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
         RET
@@ -788,35 +836,25 @@
 %endmacro
 %macro cglobal_internal 2-3+
     annotate_function_size
-    %if %1
-        %xdefine %%FUNCTION_PREFIX private_prefix
-        ; libaom explicitly sets visibility in shared object builds. Avoid
-        ; setting visibility to hidden as it may break builds that split
-        ; sources on e.g., directory boundaries.
-        %ifdef CHROMIUM
-            %xdefine %%VISIBILITY hidden
-        %else
-            %xdefine %%VISIBILITY
-        %endif
-    %else
-        %xdefine %%FUNCTION_PREFIX public_prefix
-        %xdefine %%VISIBILITY
-    %endif
     %ifndef cglobaled_%2
-        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+        %if %1
+            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
+        %else
+            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
+        %endif
         %xdefine %2.skip_prologue %2 %+ .skip_prologue
         CAT_XDEFINE cglobaled_, %2, 1
     %endif
     %xdefine current_function %2
     %xdefine current_function_section __SECT__
     %if FORMAT_ELF
-        global %2:function %%VISIBILITY
-    %elif FORMAT_MACHO
-        %ifdef __NASM_VER__
-            global %2
+        %if %1
+            global %2:function VISIBILITY
         %else
-            global %2:private_extern
+            global %2:function
         %endif
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
+        global %2:private_extern
     %else
         global %2
     %endif
@@ -827,12 +865,24 @@
     %assign stack_offset 0      ; stack pointer offset relative to the return address
     %assign stack_size 0        ; amount of stack space that can be freely used inside a function
     %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
-    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
 %endmacro
 
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1
+    %if FORMAT_ELF
+        global current_function %+ %1:function VISIBILITY
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global current_function %+ %1:private_extern
+    %else
+        global current_function %+ %1
+    %endif
+    %1:
+%endmacro
+
 %macro cextern 1
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     CAT_XDEFINE cglobaled_, %1, 1
@@ -851,7 +901,9 @@
 %macro const 1-2+
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     %if FORMAT_ELF
-        global %1:data hidden
+        global %1:data VISIBILITY
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global %1:private_extern
     %else
         global %1
     %endif
@@ -890,24 +942,26 @@
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
-%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
-%assign cpuflags_avx      (1<<11)| cpuflags_sse42
-%assign cpuflags_xop      (1<<12)| cpuflags_avx
-%assign cpuflags_fma4     (1<<13)| cpuflags_avx
-%assign cpuflags_fma3     (1<<14)| cpuflags_avx
-%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
+%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
+%assign cpuflags_gfni     (1<<13)| cpuflags_sse42
+%assign cpuflags_avx      (1<<14)| cpuflags_sse42
+%assign cpuflags_xop      (1<<15)| cpuflags_avx
+%assign cpuflags_fma4     (1<<16)| cpuflags_avx
+%assign cpuflags_fma3     (1<<17)| cpuflags_avx
+%assign cpuflags_bmi1     (1<<18)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<19)| cpuflags_bmi1
+%assign cpuflags_avx2     (1<<20)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
 
-%assign cpuflags_cache32  (1<<16)
-%assign cpuflags_cache64  (1<<17)
-%assign cpuflags_slowctz  (1<<18)
-%assign cpuflags_lzcnt    (1<<19)
-%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom     (1<<21)
-%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
+%assign cpuflags_cache32  (1<<22)
+%assign cpuflags_cache64  (1<<23)
+%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<25)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -950,7 +1004,7 @@
 
     %if ARCH_X86_64 || cpuflag(sse2)
         %ifdef __NASM_VER__
-            ALIGNMODE k8
+            ALIGNMODE p6
         %else
             CPU amdnop
         %endif
@@ -963,11 +1017,12 @@
     %endif
 %endmacro
 
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
 ; m# is a simd register of the currently selected size
 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
 
 %macro CAT_XDEFINE 3
     %xdefine %1%2 %3
@@ -977,69 +1032,99 @@
     %undef %1%2
 %endmacro
 
+%macro DEFINE_MMREGS 1 ; mmtype
+    %assign %%prev_mmregs 0
+    %ifdef num_mmregs
+        %assign %%prev_mmregs num_mmregs
+    %endif
+
+    %assign num_mmregs 8
+    %if ARCH_X86_64 && mmsize >= 16
+        %assign num_mmregs 16
+        %if cpuflag(avx512) || mmsize == 64
+            %assign num_mmregs 32
+        %endif
+    %endif
+
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, %1 %+ %%i
+        CAT_XDEFINE nn%1, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    %if %%prev_mmregs > num_mmregs
+        %rep %%prev_mmregs - num_mmregs
+            CAT_UNDEF m, %%i
+            CAT_UNDEF nn %+ mmtype, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+    %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+    %if ARCH_X86_64 && cpuflag(avx512)
+        %assign %%i %1
+        %rep 16-%1
+            %assign %%i_high %%i+16
+            SWAP %%i, %%i_high
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
 %macro INIT_MMX 0-1+
     %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_MMX %1
     %define mmsize 8
-    %define num_mmregs 8
     %define mova movq
     %define movu movq
     %define movh movd
     %define movnta movntq
-    %assign %%i 0
-    %rep 8
-        CAT_XDEFINE m, %%i, mm %+ %%i
-        CAT_XDEFINE nnmm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
-    %rep 8
-        CAT_UNDEF m, %%i
-        CAT_UNDEF nnmm, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS mm
 %endmacro
 
 %macro INIT_XMM 0-1+
     %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_XMM %1
     %define mmsize 16
-    %define num_mmregs 8
-    %if ARCH_X86_64
-        %define num_mmregs 16
-    %endif
     %define mova movdqa
     %define movu movdqu
     %define movh movq
     %define movnta movntdq
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE m, %%i, xmm %+ %%i
-        CAT_XDEFINE nnxmm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS xmm
+    %if WIN64
+        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+    %endif
 %endmacro
 
 %macro INIT_YMM 0-1+
     %assign avx_enabled 1
     %define RESET_MM_PERMUTATION INIT_YMM %1
     %define mmsize 32
-    %define num_mmregs 8
-    %if ARCH_X86_64
-        %define num_mmregs 16
-    %endif
     %define mova movdqa
     %define movu movdqu
     %undef movh
     %define movnta movntdq
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE m, %%i, ymm %+ %%i
-        CAT_XDEFINE nnymm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS ymm
+    AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_ZMM %1
+    %define mmsize 64
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    INIT_CPUFLAGS %1
+    DEFINE_MMREGS zmm
+    AVX512_MM_PERMUTATION
 %endmacro
 
 INIT_XMM
@@ -1048,18 +1133,26 @@
     %define  mmmm%1   mm%1
     %define  mmxmm%1  mm%1
     %define  mmymm%1  mm%1
+    %define  mmzmm%1  mm%1
     %define xmmmm%1   mm%1
     %define xmmxmm%1 xmm%1
     %define xmmymm%1 xmm%1
+    %define xmmzmm%1 xmm%1
     %define ymmmm%1   mm%1
     %define ymmxmm%1 xmm%1
     %define ymmymm%1 ymm%1
+    %define ymmzmm%1 ymm%1
+    %define zmmmm%1   mm%1
+    %define zmmxmm%1 xmm%1
+    %define zmmymm%1 ymm%1
+    %define zmmzmm%1 zmm%1
     %define xm%1 xmm %+ m%1
     %define ym%1 ymm %+ m%1
+    %define zm%1 zmm %+ m%1
 %endmacro
 
 %assign i 0
-%rep 16
+%rep 32
     DECLARE_MMCAST i
     %assign i i+1
 %endrep
@@ -1129,25 +1222,42 @@
     %endif
     %assign %%i 0
     %rep num_mmregs
-        CAT_XDEFINE %%f, %%i, m %+ %%i
+        %xdefine %%tmp m %+ %%i
+        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
         %assign %%i %%i+1
     %endrep
 %endmacro
 
-%macro LOAD_MM_PERMUTATION 1 ; name to load from
-    %ifdef %1_m0
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %xdefine %%tmp %%f %+ 0
+    %ifnum %%tmp
+        RESET_MM_PERMUTATION
         %assign %%i 0
         %rep num_mmregs
-            CAT_XDEFINE m, %%i, %1_m %+ %%i
-            CAT_XDEFINE nn, m %+ %%i, %%i
+            %xdefine %%tmp %%f %+ %%i
+            CAT_XDEFINE %%m, %%i, m %+ %%tmp
             %assign %%i %%i+1
         %endrep
+        %rep num_mmregs
+            %assign %%i %%i-1
+            CAT_XDEFINE m, %%i, %%m %+ %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
+        %endrep
     %endif
 %endmacro
 
 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
 %macro call 1
-    call_internal %1 %+ SUFFIX, %1
+    %ifid %1
+        call_internal %1 %+ SUFFIX, %1
+    %else
+        call %1
+    %endif
 %endmacro
 %macro call_internal 2
     %xdefine %%i %2
@@ -1190,12 +1300,17 @@
 ;=============================================================================
 
 %assign i 0
-%rep 16
+%rep 32
     %if i < 8
         CAT_XDEFINE sizeofmm, i, 8
+        CAT_XDEFINE regnumofmm, i, i
     %endif
     CAT_XDEFINE sizeofxmm, i, 16
     CAT_XDEFINE sizeofymm, i, 32
+    CAT_XDEFINE sizeofzmm, i, 64
+    CAT_XDEFINE regnumofxmm, i, i
+    CAT_XDEFINE regnumofymm, i, i
+    CAT_XDEFINE regnumofzmm, i, i
     %assign i i+1
 %endrep
 %undef i
@@ -1214,7 +1329,7 @@
 ;%1 == instruction
 ;%2 == minimal instruction set
 ;%3 == 1 if float, 0 if int
-;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
 ;%6+: operands
 %macro RUN_AVX_INSTR 6-9+
@@ -1238,8 +1353,22 @@
         %ifdef cpuname
             %if notcpuflag(%2)
                 %error use of ``%1'' %2 instruction in cpuname function: current_function
-            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
                 %error use of ``%1'' sse2 instruction in cpuname function: current_function
+            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+                %error use of ``%1'' avx2 instruction in cpuname function: current_function
+            %elif __sizeofreg == 16 && notcpuflag(sse)
+                %error use of ``%1'' sse instruction in cpuname function: current_function
+            %elif __sizeofreg == 32 && notcpuflag(avx)
+                %error use of ``%1'' avx instruction in cpuname function: current_function
+            %elif __sizeofreg == 64 && notcpuflag(avx512)
+                %error use of ``%1'' avx512 instruction in cpuname function: current_function
+            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
+                %ifnid %6       ; but sse4 is required for memory operands
+                    %if notcpuflag(sse4)
+                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
+                    %endif
+                %endif
             %endif
         %endif
     %endif
@@ -1247,14 +1376,12 @@
     %if __emulate_avx
         %xdefine __src1 %7
         %xdefine __src2 %8
-        %ifnidn %6, %7
-            %if %0 >= 9
-                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
-            %else
-                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
-            %endif
-            %if %5 && %4 == 0
-                %ifnid %8
+        %if %5 && %4 == 0
+            %ifnidn %6, %7
+                %ifidn %6, %8
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %elifnnum sizeof%8
                     ; 3-operand AVX instructions with a memory arg can only have it in src2,
                     ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                     ; So, if the instruction is commutative with a memory arg, swap them.
@@ -1262,6 +1389,13 @@
                     %xdefine __src2 %7
                 %endif
             %endif
+        %endif
+        %ifnidn %6, __src1
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+            %endif
             %if __sizeofreg == 8
                 MOVQ %6, __src1
             %elif %3
@@ -1278,9 +1412,40 @@
     %elif %0 >= 9
         __instr %6, %7, %8, %9
     %elif %0 == 8
-        __instr %6, %7, %8
+        %if avx_enabled && %5
+            %xdefine __src1 %7
+            %xdefine __src2 %8
+            %ifnum regnumof%7
+                %ifnum regnumof%8
+                    %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+                        ; Most VEX-encoded instructions require an additional byte to encode when
+                        ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+                        ; we can swap src1 and src2 when doing so reduces the instruction length.
+                        %xdefine __src1 %8
+                        %xdefine __src2 %7
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7, %8
+        %endif
     %elif %0 == 7
-        __instr %6, %7
+        %if avx_enabled && %5
+            %xdefine __src1 %6
+            %xdefine __src2 %7
+            %ifnum regnumof%6
+                %ifnum regnumof%7
+                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
+                        %xdefine __src1 %7
+                        %xdefine __src2 %6
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7
+        %endif
     %else
         __instr %6
     %endif
@@ -1289,9 +1454,9 @@
 ;%1 == instruction
 ;%2 == minimal instruction set
 ;%3 == 1 if float, 0 if int
-;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0
     %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
         %ifidn %2, fnord
             RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
@@ -1307,77 +1472,112 @@
     %endmacro
 %endmacro
 
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
 ; Non-destructive instructions are written without parameters
 AVX_INSTR addpd, sse2, 1, 0, 1
 AVX_INSTR addps, sse, 1, 0, 1
-AVX_INSTR addsd, sse2, 1, 0, 1
-AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
 AVX_INSTR addsubpd, sse3, 1, 0, 0
 AVX_INSTR addsubps, sse3, 1, 0, 0
-AVX_INSTR aesdec, fnord, 0, 0, 0
-AVX_INSTR aesdeclast, fnord, 0, 0, 0
-AVX_INSTR aesenc, fnord, 0, 0, 0
-AVX_INSTR aesenclast, fnord, 0, 0, 0
-AVX_INSTR aesimc
-AVX_INSTR aeskeygenassist
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
 AVX_INSTR andnpd, sse2, 1, 0, 0
 AVX_INSTR andnps, sse, 1, 0, 0
 AVX_INSTR andpd, sse2, 1, 0, 1
 AVX_INSTR andps, sse, 1, 0, 1
-AVX_INSTR blendpd, sse4, 1, 0, 0
-AVX_INSTR blendps, sse4, 1, 0, 0
-AVX_INSTR blendvpd, sse4, 1, 0, 0
-AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4 ; can't be emulated
+AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2 1, 0, 1
+AVX_INSTR cmpordps, sse 1, 0, 1
+AVX_INSTR cmpordsd, sse2 1, 0, 0
+AVX_INSTR cmpordss, sse 1, 0, 0
 AVX_INSTR cmppd, sse2, 1, 1, 0
 AVX_INSTR cmpps, sse, 1, 1, 0
 AVX_INSTR cmpsd, sse2, 1, 1, 0
 AVX_INSTR cmpss, sse, 1, 1, 0
-AVX_INSTR comisd, sse2
-AVX_INSTR comiss, sse
-AVX_INSTR cvtdq2pd, sse2
-AVX_INSTR cvtdq2ps, sse2
-AVX_INSTR cvtpd2dq, sse2
-AVX_INSTR cvtpd2ps, sse2
-AVX_INSTR cvtps2dq, sse2
-AVX_INSTR cvtps2pd, sse2
-AVX_INSTR cvtsd2si, sse2
-AVX_INSTR cvtsd2ss, sse2
-AVX_INSTR cvtsi2sd, sse2
-AVX_INSTR cvtsi2ss, sse
-AVX_INSTR cvtss2sd, sse2
-AVX_INSTR cvtss2si, sse
-AVX_INSTR cvttpd2dq, sse2
-AVX_INSTR cvttps2dq, sse2
-AVX_INSTR cvttsd2si, sse2
-AVX_INSTR cvttss2si, sse
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
 AVX_INSTR divpd, sse2, 1, 0, 0
 AVX_INSTR divps, sse, 1, 0, 0
 AVX_INSTR divsd, sse2, 1, 0, 0
 AVX_INSTR divss, sse, 1, 0, 0
 AVX_INSTR dppd, sse4, 1, 1, 0
 AVX_INSTR dpps, sse4, 1, 1, 0
-AVX_INSTR extractps, sse4
+AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
 AVX_INSTR haddpd, sse3, 1, 0, 0
 AVX_INSTR haddps, sse3, 1, 0, 0
 AVX_INSTR hsubpd, sse3, 1, 0, 0
 AVX_INSTR hsubps, sse3, 1, 0, 0
 AVX_INSTR insertps, sse4, 1, 1, 0
 AVX_INSTR lddqu, sse3
-AVX_INSTR ldmxcsr, sse
+AVX_INSTR ldmxcsr, sse, 1
 AVX_INSTR maskmovdqu, sse2
 AVX_INSTR maxpd, sse2, 1, 0, 1
 AVX_INSTR maxps, sse, 1, 0, 1
-AVX_INSTR maxsd, sse2, 1, 0, 1
-AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
 AVX_INSTR minpd, sse2, 1, 0, 1
 AVX_INSTR minps, sse, 1, 0, 1
-AVX_INSTR minsd, sse2, 1, 0, 1
-AVX_INSTR minss, sse, 1, 0, 1
-AVX_INSTR movapd, sse2
-AVX_INSTR movaps, sse
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
 AVX_INSTR movd, mmx
-AVX_INSTR movddup, sse3
+AVX_INSTR movddup, sse3, 1
 AVX_INSTR movdqa, sse2
 AVX_INSTR movdqu, sse2
 AVX_INSTR movhlps, sse, 1, 0, 0
@@ -1386,24 +1586,24 @@
 AVX_INSTR movlhps, sse, 1, 0, 0
 AVX_INSTR movlpd, sse2, 1, 0, 0
 AVX_INSTR movlps, sse, 1, 0, 0
-AVX_INSTR movmskpd, sse2
-AVX_INSTR movmskps, sse
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
 AVX_INSTR movntdq, sse2
 AVX_INSTR movntdqa, sse4
-AVX_INSTR movntpd, sse2
-AVX_INSTR movntps, sse
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
 AVX_INSTR movq, mmx
 AVX_INSTR movsd, sse2, 1, 0, 0
-AVX_INSTR movshdup, sse3
-AVX_INSTR movsldup, sse3
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
 AVX_INSTR movss, sse, 1, 0, 0
-AVX_INSTR movupd, sse2
-AVX_INSTR movups, sse
-AVX_INSTR mpsadbw, sse4
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
 AVX_INSTR mulpd, sse2, 1, 0, 1
 AVX_INSTR mulps, sse, 1, 0, 1
-AVX_INSTR mulsd, sse2, 1, 0, 1
-AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
 AVX_INSTR orpd, sse2, 1, 0, 1
 AVX_INSTR orps, sse, 1, 0, 1
 AVX_INSTR pabsb, ssse3
@@ -1421,14 +1621,18 @@
 AVX_INSTR paddsw, mmx, 0, 0, 1
 AVX_INSTR paddusb, mmx, 0, 0, 1
 AVX_INSTR paddusw, mmx, 0, 0, 1
-AVX_INSTR palignr, ssse3
+AVX_INSTR palignr, ssse3, 0, 1, 0
 AVX_INSTR pand, mmx, 0, 0, 1
 AVX_INSTR pandn, mmx, 0, 0, 0
 AVX_INSTR pavgb, mmx2, 0, 0, 1
 AVX_INSTR pavgw, mmx2, 0, 0, 1
-AVX_INSTR pblendvb, sse4, 0, 0, 0
-AVX_INSTR pblendw, sse4
-AVX_INSTR pclmulqdq
+AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
 AVX_INSTR pcmpestri, sse42
 AVX_INSTR pcmpestrm, sse42
 AVX_INSTR pcmpistri, sse42
@@ -1452,10 +1656,10 @@
 AVX_INSTR phsubw, ssse3, 0, 0, 0
 AVX_INSTR phsubd, ssse3, 0, 0, 0
 AVX_INSTR phsubsw, ssse3, 0, 0, 0
-AVX_INSTR pinsrb, sse4
-AVX_INSTR pinsrd, sse4
-AVX_INSTR pinsrq, sse4
-AVX_INSTR pinsrw, mmx2
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
 AVX_INSTR pmaddwd, mmx, 0, 0, 1
 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
 AVX_INSTR pmaxsb, sse4, 0, 0, 1
@@ -1527,27 +1731,27 @@
 AVX_INSTR punpckldq, mmx, 0, 0, 0
 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
 AVX_INSTR pxor, mmx, 0, 0, 1
-AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpps, sse, 1
 AVX_INSTR rcpss, sse, 1, 0, 0
-AVX_INSTR roundpd, sse4
-AVX_INSTR roundps, sse4
-AVX_INSTR roundsd, sse4
-AVX_INSTR roundss, sse4
-AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4, 1
+AVX_INSTR roundps, sse4, 1
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse, 1
 AVX_INSTR rsqrtss, sse, 1, 0, 0
 AVX_INSTR shufpd, sse2, 1, 1, 0
 AVX_INSTR shufps, sse, 1, 1, 0
-AVX_INSTR sqrtpd, sse2, 1, 0, 0
-AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtpd, sse2, 1
+AVX_INSTR sqrtps, sse, 1
 AVX_INSTR sqrtsd, sse2, 1, 0, 0
 AVX_INSTR sqrtss, sse, 1, 0, 0
-AVX_INSTR stmxcsr, sse
+AVX_INSTR stmxcsr, sse, 1
 AVX_INSTR subpd, sse2, 1, 0, 0
 AVX_INSTR subps, sse, 1, 0, 0
 AVX_INSTR subsd, sse2, 1, 0, 0
 AVX_INSTR subss, sse, 1, 0, 0
-AVX_INSTR ucomisd, sse2
-AVX_INSTR ucomiss, sse
+AVX_INSTR ucomisd, sse2, 1
+AVX_INSTR ucomiss, sse, 1
 AVX_INSTR unpckhpd, sse2, 1, 0, 0
 AVX_INSTR unpckhps, sse, 1, 0, 0
 AVX_INSTR unpcklpd, sse2, 1, 0, 0
@@ -1560,6 +1764,38 @@
 AVX_INSTR pfsub, 3dnow, 1, 0, 0
 AVX_INSTR pfmul, 3dnow, 1, 0, 1
 
+;%1 == instruction
+;%2 == minimal instruction set
+%macro GPR_INSTR 2
+    %macro %1 2-5 fnord, %1, %2
+        %ifdef cpuname
+            %if notcpuflag(%5)
+                %error use of ``%4'' %5 instruction in cpuname function: current_function
+            %endif
+        %endif
+        %ifidn %3, fnord
+            %4 %1, %2
+        %else
+            %4 %1, %2, %3
+        %endif
+    %endmacro
+%endmacro
+
+GPR_INSTR andn, bmi1
+GPR_INSTR bextr, bmi1
+GPR_INSTR blsi, bmi1
+GPR_INSTR blsr, bmi1
+GPR_INSTR blsmsk, bmi1
+GPR_INSTR bzhi, bmi2
+GPR_INSTR mulx, bmi2
+GPR_INSTR pdep, bmi2
+GPR_INSTR pext, bmi2
+GPR_INSTR popcnt, sse42
+GPR_INSTR rorx, bmi2
+GPR_INSTR sarx, bmi2
+GPR_INSTR shlx, bmi2
+GPR_INSTR shrx, bmi2
+
 ; base-4 constants for shuffles
 %assign i 0
 %rep 256
@@ -1610,7 +1846,7 @@
                 v%5%6 %1, %2, %3, %4
             %elifidn %1, %2
                 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
-                %ifid %3
+                %ifnum sizeof%3
                     v%{5}213%6 %2, %3, %4
                 %else
                     v%{5}132%6 %2, %4, %3
@@ -1635,15 +1871,53 @@
 FMA4_INSTR fnmadd,   pd, ps, sd, ss
 FMA4_INSTR fnmsub,   pd, ps, sd, ss
 
-; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
-%ifdef __YASM_VER__
-    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
-        %macro vpbroadcastq 2
-            %if sizeof%1 == 16
-                movddup %1, %2
-            %else
-                vbroadcastsd %1, %2
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+    %macro %1 2-7 fnord, fnord, %1, %2, %3
+        %ifidn %3, fnord
+            %define %%args %1, %2
+        %elifidn %4, fnord
+            %define %%args %1, %2, %3
+        %else
+            %define %%args %1, %2, %3, %4
+        %endif
+        %assign %%evex_required cpuflag(avx512) & %7
+        %ifnum regnumof%1
+            %if regnumof%1 >= 16 || sizeof%1 > 32
+                %assign %%evex_required 1
             %endif
-        %endmacro
-    %endif
-%endif
+        %endif
+        %ifnum regnumof%2
+            %if regnumof%2 >= 16 || sizeof%2 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %ifnum regnumof%3
+            %if regnumof%3 >= 16 || sizeof%3 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %if %%evex_required
+            %6 %%args
+        %else
+            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+        %endif
+    %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128,   vextractf32x4
+EVEX_INSTR vextracti128,   vextracti32x4
+EVEX_INSTR vinsertf128,    vinsertf32x4
+EVEX_INSTR vinserti128,    vinserti32x4
+EVEX_INSTR vmovdqa,        vmovdqa32
+EVEX_INSTR vmovdqu,        vmovdqu32
+EVEX_INSTR vpand,          vpandd
+EVEX_INSTR vpandn,         vpandnd
+EVEX_INSTR vpor,           vpord
+EVEX_INSTR vpxor,          vpxord
+EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss,         vrcp14ss,   1
+EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1
diff --git a/tools/aom_entropy_optimizer.c b/tools/aom_entropy_optimizer.c
index 9f529d9..fa7bf7e 100644
--- a/tools/aom_entropy_optimizer.c
+++ b/tools/aom_entropy_optimizer.c
@@ -572,9 +572,9 @@
   /* Skip flag */
   cts_each_dim[0] = SKIP_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_cdf_table(&fc.skip[0][0], probsfile, 2, cts_each_dim,
+  optimize_cdf_table(&fc.skip_txfm[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
-                     "default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]");
+                     "default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]");
 
   /* Skip mode flag */
   cts_each_dim[0] = SKIP_MODE_CONTEXTS;
diff --git a/tools/auto_refactor/auto_refactor.py b/tools/auto_refactor/auto_refactor.py
new file mode 100644
index 0000000..0e9c289
--- /dev/null
+++ b/tools/auto_refactor/auto_refactor.py
@@ -0,0 +1,874 @@
+from __future__ import print_function
+import sys
+import os
+import operator
+from pycparser import c_parser, c_ast, parse_file
+from math import *
+
+from inspect import currentframe, getframeinfo
+from collections import deque
+
+
+def debug_print(frameinfo):
+  print('******** ERROR:', frameinfo.filename, frameinfo.lineno, '********')
+
+
+class StructItem():
+
+  def __init__(self,
+               typedef_name=None,
+               struct_name=None,
+               struct_node=None,
+               is_union=False):
+    self.typedef_name = typedef_name
+    self.struct_name = struct_name
+    self.struct_node = struct_node
+    self.is_union = is_union
+    self.child_decl_map = None
+
+  def __str__(self):
+    return str(self.typedef_name) + ' ' + str(self.struct_name) + ' ' + str(
+        self.is_union)
+
+  def compute_child_decl_map(self, struct_info):
+    self.child_decl_map = {}
+    if self.struct_node != None and self.struct_node.decls != None:
+      for decl_node in self.struct_node.decls:
+        if decl_node.name == None:
+          for sub_decl_node in decl_node.type.decls:
+            sub_decl_status = parse_decl_node(struct_info, sub_decl_node)
+            self.child_decl_map[sub_decl_node.name] = sub_decl_status
+        else:
+          decl_status = parse_decl_node(struct_info, decl_node)
+          self.child_decl_map[decl_status.name] = decl_status
+
+  def get_child_decl_status(self, decl_name):
+    if self.child_decl_map == None:
+      debug_print(getframeinfo(currentframe()))
+      print('child_decl_map is None')
+      return None
+    if decl_name not in self.child_decl_map:
+      debug_print(getframeinfo(currentframe()))
+      print(decl_name, 'does not exist ')
+      return None
+    return self.child_decl_map[decl_name]
+
+
+class StructInfo():
+
+  def __init__(self):
+    self.struct_name_dic = {}
+    self.typedef_name_dic = {}
+    self.enum_value_dic = {}  # enum value -> enum_node
+    self.enum_name_dic = {}  # enum name -> enum_node
+    self.struct_item_list = []
+
+  def get_struct_by_typedef_name(self, typedef_name):
+    if typedef_name in self.typedef_name_dic:
+      return self.typedef_name_dic[typedef_name]
+    else:
+      return None
+
+  def get_struct_by_struct_name(self, struct_name):
+    if struct_name in self.struct_name_dic:
+      return self.struct_name_dic[struct_name]
+    else:
+      debug_print(getframeinfo(currentframe()))
+      print('Cant find', struct_name)
+      return None
+
+  def update_struct_item_list(self):
+    # Collect all struct_items from struct_name_dic and typedef_name_dic
+    # Compute child_decl_map for each struct item.
+    for struct_name in self.struct_name_dic.keys():
+      struct_item = self.struct_name_dic[struct_name]
+      struct_item.compute_child_decl_map(self)
+      self.struct_item_list.append(struct_item)
+
+    for typedef_name in self.typedef_name_dic.keys():
+      struct_item = self.typedef_name_dic[typedef_name]
+      if struct_item.struct_name not in self.struct_name_dic:
+        struct_item.compute_child_decl_map(self)
+        self.struct_item_list.append(struct_item)
+
+  def update_enum(self, enum_node):
+    if enum_node.name != None:
+      self.enum_name_dic[enum_node.name] = enum_node
+
+    if enum_node.values != None:
+      enumerator_list = enum_node.values.enumerators
+      for enumerator in enumerator_list:
+        self.enum_value_dic[enumerator.name] = enum_node
+
+  def update(self,
+             typedef_name=None,
+             struct_name=None,
+             struct_node=None,
+             is_union=False):
+    """T: typedef_name S: struct_name N: struct_node
+
+           T S N
+   case 1: o o o
+   typedef struct P {
+    int u;
+   } K;
+           T S N
+   case 2: o o x
+   typedef struct P K;
+
+           T S N
+   case 3: x o o
+   struct P {
+    int u;
+   };
+
+           T S N
+   case 4: o x o
+   typedef struct {
+    int u;
+   } K;
+    """
+    struct_item = None
+
+    # Check whether struct_name or typedef_name is already in the dictionary
+    if struct_name in self.struct_name_dic:
+      struct_item = self.struct_name_dic[struct_name]
+
+    if typedef_name in self.typedef_name_dic:
+      struct_item = self.typedef_name_dic[typedef_name]
+
+    if struct_item == None:
+      struct_item = StructItem(typedef_name, struct_name, struct_node, is_union)
+
+    if struct_node.decls != None:
+      struct_item.struct_node = struct_node
+
+    if struct_name != None:
+      self.struct_name_dic[struct_name] = struct_item
+
+    if typedef_name != None:
+      self.typedef_name_dic[typedef_name] = struct_item
+
+
+class StructDefVisitor(c_ast.NodeVisitor):
+
+  def __init__(self):
+    self.struct_info = StructInfo()
+
+  def visit_Struct(self, node):
+    if node.decls != None:
+      self.struct_info.update(None, node.name, node)
+    self.generic_visit(node)
+
+  def visit_Union(self, node):
+    if node.decls != None:
+      self.struct_info.update(None, node.name, node, True)
+    self.generic_visit(node)
+
+  def visit_Enum(self, node):
+    self.struct_info.update_enum(node)
+    self.generic_visit(node)
+
+  def visit_Typedef(self, node):
+    if node.type.__class__.__name__ == 'TypeDecl':
+      typedecl = node.type
+      if typedecl.type.__class__.__name__ == 'Struct':
+        struct_node = typedecl.type
+        typedef_name = node.name
+        struct_name = struct_node.name
+        self.struct_info.update(typedef_name, struct_name, struct_node)
+      elif typedecl.type.__class__.__name__ == 'Union':
+        union_node = typedecl.type
+        typedef_name = node.name
+        union_name = union_node.name
+        self.struct_info.update(typedef_name, union_name, union_node, True)
+      # TODO(angiebird): Do we need to deal with enum here?
+    self.generic_visit(node)
+
+
+def build_struct_info(ast):
+  v = StructDefVisitor()
+  v.visit(ast)
+  struct_info = v.struct_info
+  struct_info.update_struct_item_list()
+  return v.struct_info
+
+
+class DeclStatus():
+
+  def __init__(self, name, struct_item=None, is_ptr_decl=False):
+    self.name = name
+    self.struct_item = struct_item
+    self.is_ptr_decl = is_ptr_decl
+
+  def get_child_decl_status(self, decl_name):
+    if self.struct_item != None:
+      return self.struct_item.get_child_decl_status(decl_name)
+    else:
+      #TODO(angiebird): 2. Investigage the situation when a struct's definition can't be found.
+      return None
+
+  def __str__(self):
+    return str(self.struct_item) + ' ' + str(self.name) + ' ' + str(
+        self.is_ptr_decl)
+
+
+def peel_ptr_decl(decl_type_node):
+  """ Remove PtrDecl and ArrayDecl layer """
+  is_ptr_decl = False
+  peeled_decl_type_node = decl_type_node
+  while peeled_decl_type_node.__class__.__name__ == 'PtrDecl' or peeled_decl_type_node.__class__.__name__ == 'ArrayDecl':
+    is_ptr_decl = True
+    peeled_decl_type_node = peeled_decl_type_node.type
+  return is_ptr_decl, peeled_decl_type_node
+
+
+def parse_peeled_decl_type_node(struct_info, node):
+  struct_item = None
+  if node.__class__.__name__ == 'TypeDecl':
+    if node.type.__class__.__name__ == 'IdentifierType':
+      identifier_type_node = node.type
+      typedef_name = identifier_type_node.names[0]
+      struct_item = struct_info.get_struct_by_typedef_name(typedef_name)
+    elif node.type.__class__.__name__ == 'Struct':
+      struct_node = node.type
+      if struct_node.name != None:
+        struct_item = struct_info.get_struct_by_struct_name(struct_node.name)
+      else:
+        struct_item = StructItem(None, None, struct_node, False)
+        struct_item.compute_child_decl_map(struct_info)
+    elif node.type.__class__.__name__ == 'Union':
+      # TODO(angiebird): Special treatment for Union?
+      struct_node = node.type
+      if struct_node.name != None:
+        struct_item = struct_info.get_struct_by_struct_name(struct_node.name)
+      else:
+        struct_item = StructItem(None, None, struct_node, True)
+        struct_item.compute_child_decl_map(struct_info)
+    elif node.type.__class__.__name__ == 'Enum':
+      # TODO(angiebird): Special treatment for Union?
+      struct_node = node.type
+      struct_item = None
+    else:
+      print('Unrecognized peeled_decl_type_node.type',
+            node.type.__class__.__name__)
+  else:
+    # debug_print(getframeinfo(currentframe()))
+    # print(node.__class__.__name__)
+    #TODO(angiebird): Do we need to take care of this part?
+    pass
+
+  return struct_item
+
+
+def parse_decl_node(struct_info, decl_node):
+  # struct_item is None if this decl_node is not a struct_item
+  decl_node_name = decl_node.name
+  decl_type_node = decl_node.type
+  is_ptr_decl, peeled_decl_type_node = peel_ptr_decl(decl_type_node)
+  struct_item = parse_peeled_decl_type_node(struct_info, peeled_decl_type_node)
+  return DeclStatus(decl_node_name, struct_item, is_ptr_decl)
+
+
+def get_lvalue_lead(lvalue_node):
+  """return '&' or '*' of lvalue if available"""
+  if lvalue_node.__class__.__name__ == 'UnaryOp' and lvalue_node.op == '&':
+    return '&'
+  elif lvalue_node.__class__.__name__ == 'UnaryOp' and lvalue_node.op == '*':
+    return '*'
+  return None
+
+
+def parse_lvalue(lvalue_node):
+  """get id_chain from lvalue"""
+  id_chain = parse_lvalue_recursive(lvalue_node, [])
+  return id_chain
+
+
+def parse_lvalue_recursive(lvalue_node, id_chain):
+  """cpi->rd->u -> (cpi->rd)->u"""
+  if lvalue_node.__class__.__name__ == 'ID':
+    id_chain.append(lvalue_node.name)
+    id_chain.reverse()
+    return id_chain
+  elif lvalue_node.__class__.__name__ == 'StructRef':
+    id_chain.append(lvalue_node.field.name)
+    return parse_lvalue_recursive(lvalue_node.name, id_chain)
+  elif lvalue_node.__class__.__name__ == 'ArrayRef':
+    return parse_lvalue_recursive(lvalue_node.name, id_chain)
+  elif lvalue_node.__class__.__name__ == 'UnaryOp' and lvalue_node.op == '&':
+    return parse_lvalue_recursive(lvalue_node.expr, id_chain)
+  elif lvalue_node.__class__.__name__ == 'UnaryOp' and lvalue_node.op == '*':
+    return parse_lvalue_recursive(lvalue_node.expr, id_chain)
+  else:
+    return None
+
+
+class FuncDefVisitor(c_ast.NodeVisitor):
+  func_dictionary = {}
+
+  def visit_FuncDef(self, node):
+    func_name = node.decl.name
+    self.func_dictionary[func_name] = node
+
+
+def build_func_dictionary(ast):
+  v = FuncDefVisitor()
+  v.visit(ast)
+  return v.func_dictionary
+
+
+def get_func_start_coord(func_node):
+  return func_node.coord
+
+
+def find_end_node(node):
+  node_list = []
+  for c in node:
+    node_list.append(c)
+  if len(node_list) == 0:
+    return node
+  else:
+    return find_end_node(node_list[-1])
+
+
+def get_func_end_coord(func_node):
+  return find_end_node(func_node).coord
+
+
+def get_func_size(func_node):
+  start_coord = get_func_start_coord(func_node)
+  end_coord = get_func_end_coord(func_node)
+  if start_coord.file == end_coord.file:
+    return end_coord.line - start_coord.line + 1
+  else:
+    return None
+
+
+def save_object(obj, filename):
+  with open(filename, 'wb') as obj_fp:
+    pickle.dump(obj, obj_fp, protocol=-1)
+
+
+def load_object(filename):
+  obj = None
+  with open(filename, 'rb') as obj_fp:
+    obj = pickle.load(obj_fp)
+  return obj
+
+
+def get_av1_ast(gen_ast=False):
+  # TODO(angiebird): Generalize this path
+  c_filename = './av1_pp.c'
+  print('generate ast')
+  ast = parse_file(c_filename)
+  #save_object(ast, ast_file)
+  print('finished generate ast')
+  return ast
+
+
+def get_func_param_id_map(func_def_node):
+  param_id_map = {}
+  func_decl = func_def_node.decl.type
+  param_list = func_decl.args.params
+  for decl in param_list:
+    param_id_map[decl.name] = decl
+  return param_id_map
+
+
+class IDTreeStack():
+
+  def __init__(self, global_id_tree):
+    self.stack = deque()
+    self.global_id_tree = global_id_tree
+
+  def add_link_node(self, node, link_id_chain):
+    link_node = self.add_id_node(link_id_chain)
+    node.link_node = link_node
+    node.link_id_chain = link_id_chain
+
+  def push_id_tree(self, id_tree=None):
+    if id_tree == None:
+      id_tree = IDStatusNode()
+    self.stack.append(id_tree)
+    return id_tree
+
+  def pop_id_tree(self):
+    return self.stack.pop()
+
+  def add_id_seed_node(self, id_seed, decl_status):
+    return self.stack[-1].add_child(id_seed, decl_status)
+
+  def get_id_seed_node(self, id_seed):
+    idx = len(self.stack) - 1
+    while idx >= 0:
+      id_node = self.stack[idx].get_child(id_seed)
+      if id_node != None:
+        return id_node
+      idx -= 1
+
+    id_node = self.global_id_tree.get_child(id_seed)
+    if id_node != None:
+      return id_node
+    return None
+
+  def add_id_node(self, id_chain):
+    id_seed = id_chain[0]
+    id_seed_node = self.get_id_seed_node(id_seed)
+    if id_seed_node == None:
+      return None
+    if len(id_chain) == 1:
+      return id_seed_node
+    return id_seed_node.add_descendant(id_chain[1:])
+
+  def get_id_node(self, id_chain):
+    id_seed = id_chain[0]
+    id_seed_node = self.get_id_seed_node(id_seed)
+    if id_seed_node == None:
+      return None
+    if len(id_chain) == 1:
+      return id_seed_node
+    return id_seed_node.get_descendant(id_chain[1:])
+
+  def top(self):
+    return self.stack[-1]
+
+
+class IDStatusNode():
+
+  def __init__(self, name=None, root=None):
+    if root is None:
+      self.root = self
+    else:
+      self.root = root
+
+    self.name = name
+
+    self.parent = None
+    self.children = {}
+
+    self.assign = False
+    self.refer = False
+
+    self.decl_status = None
+
+    self.link_id_chain = None
+    self.link_node = None
+
+    self.visit = False
+
+  def set_link_id_chain(self, link_id_chain):
+    self.set_assign(False)
+    self.link_id_chain = link_id_chain
+    self.link_node = self.root.get_descendant(link_id_chain)
+
+  def set_link_node(self, link_node):
+    self.set_assign(False)
+    self.link_id_chain = ['*']
+    self.link_node = link_node
+
+  def get_link_id_chain(self):
+    return self.link_id_chain
+
+  def get_concrete_node(self):
+    if self.visit == True:
+      # return None when there is a loop
+      return None
+    self.visit = True
+    if self.link_node == None:
+      self.visit = False
+      return self
+    else:
+      concrete_node = self.link_node.get_concrete_node()
+      self.visit = False
+      if concrete_node == None:
+        return self
+      return concrete_node
+
+  def set_assign(self, assign):
+    concrete_node = self.get_concrete_node()
+    concrete_node.assign = assign
+
+  def get_assign(self):
+    concrete_node = self.get_concrete_node()
+    return concrete_node.assign
+
+  def set_refer(self, refer):
+    concrete_node = self.get_concrete_node()
+    concrete_node.refer = refer
+
+  def get_refer(self):
+    concrete_node = self.get_concrete_node()
+    return concrete_node.refer
+
+  def set_parent(self, parent):
+    concrete_node = self.get_concrete_node()
+    concrete_node.parent = parent
+
+  def add_child(self, name, decl_status=None):
+    concrete_node = self.get_concrete_node()
+    if name not in concrete_node.children:
+      child_id_node = IDStatusNode(name, concrete_node.root)
+      concrete_node.children[name] = child_id_node
+      if decl_status == None:
+        # Check if the child decl_status can be inferred from its parent's
+        # decl_status
+        if self.decl_status != None:
+          decl_status = self.decl_status.get_child_decl_status(name)
+      child_id_node.set_decl_status(decl_status)
+    return concrete_node.children[name]
+
+  def get_child(self, name):
+    concrete_node = self.get_concrete_node()
+    if name in concrete_node.children:
+      return concrete_node.children[name]
+    else:
+      return None
+
+  def add_descendant(self, id_chain):
+    current_node = self.get_concrete_node()
+    for name in id_chain:
+      current_node.add_child(name)
+      parent_node = current_node
+      current_node = current_node.get_child(name)
+      current_node.set_parent(parent_node)
+    return current_node
+
+  def get_descendant(self, id_chain):
+    current_node = self.get_concrete_node()
+    for name in id_chain:
+      current_node = current_node.get_child(name)
+      if current_node == None:
+        return None
+    return current_node
+
+  def get_children(self):
+    current_node = self.get_concrete_node()
+    return current_node.children
+
+  def set_decl_status(self, decl_status):
+    current_node = self.get_concrete_node()
+    current_node.decl_status = decl_status
+
+  def get_decl_status(self):
+    current_node = self.get_concrete_node()
+    return current_node.decl_status
+
+  def __str__(self):
+    if self.link_id_chain is None:
+      return str(self.name) + ' a: ' + str(int(self.assign)) + ' r: ' + str(
+          int(self.refer))
+    else:
+      return str(self.name) + ' -> ' + ' '.join(self.link_id_chain)
+
+  def show(self, id_chain=None):
+    if id_chain == None:
+      id_chain = []
+    id_chain.append(self.name)
+    if self.assign or self.refer:
+      print(' '.join(id_chain[1:]), 'a:', int(self.assign), 'r:',
+            int(self.refer))
+    for c in self.children:
+      self.children[c].show(id_chain)
+    id_chain.pop()
+
+
+class FuncInOutVisitor(c_ast.NodeVisitor):
+
+  def __init__(self,
+               func_def_node,
+               struct_info,
+               func_dictionary,
+               keep_body_id_tree=True,
+               call_param_map=None,
+               global_id_tree=None,
+               func_history=None,
+               unknown=None):
+    self.func_dictionary = func_dictionary
+    self.struct_info = struct_info
+    self.param_id_map = get_func_param_id_map(func_def_node)
+    self.parent_node = None
+    self.global_id_tree = global_id_tree
+    self.body_id_tree = None
+    self.keep_body_id_tree = keep_body_id_tree
+    if func_history == None:
+      self.func_history = {}
+    else:
+      self.func_history = func_history
+
+    if unknown == None:
+      self.unknown = []
+    else:
+      self.unknown = unknown
+
+    self.id_tree_stack = IDTreeStack(global_id_tree)
+    self.id_tree_stack.push_id_tree()
+
+    #TODO move this part into a function
+    for param in self.param_id_map:
+      decl_node = self.param_id_map[param]
+      decl_status = parse_decl_node(self.struct_info, decl_node)
+      descendant = self.id_tree_stack.add_id_seed_node(decl_status.name,
+                                                       decl_status)
+      if call_param_map is not None and param in call_param_map:
+        # This is a function call.
+        # Map the input parameter to the caller's nodes
+        # TODO(angiebird): Can we use add_link_node here?
+        descendant.set_link_node(call_param_map[param])
+
+  def get_id_tree_stack(self):
+    return self.id_tree_stack
+
+  def generic_visit(self, node):
+    prev_parent = self.parent_node
+    self.parent_node = node
+    for c in node:
+      self.visit(c)
+    self.parent_node = prev_parent
+
+  # TODO rename
+  def add_new_id_tree(self, node):
+    self.id_tree_stack.push_id_tree()
+    self.generic_visit(node)
+    id_tree = self.id_tree_stack.pop_id_tree()
+    if self.parent_node == None and self.keep_body_id_tree == True:
+      # this is function body
+      self.body_id_tree = id_tree
+
+  def visit_For(self, node):
+    self.add_new_id_tree(node)
+
+  def visit_Compound(self, node):
+    self.add_new_id_tree(node)
+
+  def visit_Decl(self, node):
+    if node.type.__class__.__name__ != 'FuncDecl':
+      decl_status = parse_decl_node(self.struct_info, node)
+      descendant = self.id_tree_stack.add_id_seed_node(decl_status.name,
+                                                       decl_status)
+      if node.init is not None:
+        init_id_chain = self.process_lvalue(node.init)
+        if init_id_chain != None:
+          if decl_status.struct_item is None:
+            init_descendant = self.id_tree_stack.add_id_node(init_id_chain)
+            if init_descendant != None:
+              init_descendant.set_refer(True)
+            else:
+              self.unknown.append(node)
+            descendant.set_assign(True)
+          else:
+            self.id_tree_stack.add_link_node(descendant, init_id_chain)
+        else:
+          self.unknown.append(node)
+      else:
+        descendant.set_assign(True)
+      self.generic_visit(node)
+
+  def is_lvalue(self, node):
+    if self.parent_node is None:
+      # TODO(angiebird): Do every lvalue has parent_node != None?
+      return False
+    if self.parent_node.__class__.__name__ == 'StructRef':
+      return False
+    if self.parent_node.__class__.__name__ == 'ArrayRef' and node == self.parent_node.name:
+      # if node == self.parent_node.subscript, the node could be lvalue
+      return False
+    if self.parent_node.__class__.__name__ == 'UnaryOp' and self.parent_node.op == '&':
+      return False
+    if self.parent_node.__class__.__name__ == 'UnaryOp' and self.parent_node.op == '*':
+      return False
+    return True
+
+  def process_lvalue(self, node):
+    id_chain = parse_lvalue(node)
+    if id_chain == None:
+      return id_chain
+    elif id_chain[0] in self.struct_info.enum_value_dic:
+      return None
+    else:
+      return id_chain
+
+  def process_possible_lvalue(self, node):
+    if self.is_lvalue(node):
+      id_chain = self.process_lvalue(node)
+      lead_char = get_lvalue_lead(node)
+      # make sure the id is not an enum value
+      if id_chain == None:
+        self.unknown.append(node)
+        return
+      descendant = self.id_tree_stack.add_id_node(id_chain)
+      if descendant == None:
+        self.unknown.append(node)
+        return
+      decl_status = descendant.get_decl_status()
+      if decl_status == None:
+        descendant.set_assign(True)
+        descendant.set_refer(True)
+        self.unknown.append(node)
+        return
+      if self.parent_node.__class__.__name__ == 'Assignment':
+        if node is self.parent_node.lvalue:
+          if decl_status.struct_item != None:
+            if len(id_chain) > 1:
+              descendant.set_assign(True)
+            elif len(id_chain) == 1:
+              if lead_char == '*':
+                descendant.set_assign(True)
+              else:
+                right_id_chain = self.process_lvalue(self.parent_node.rvalue)
+                if right_id_chain != None:
+                  self.id_tree_stack.add_link_node(descendant, right_id_chain)
+                else:
+                  #TODO(angiebird): 1.Find a better way to deal with this case.
+                  descendant.set_assign(True)
+            else:
+              debug_print(getframeinfo(currentframe()))
+          else:
+            descendant.set_assign(True)
+        elif node is self.parent_node.rvalue:
+          if decl_status.struct_item is None:
+            descendant.set_refer(True)
+            if lead_char == '&':
+              descendant.set_assign(True)
+          else:
+            left_id_chain = self.process_lvalue(self.parent_node.lvalue)
+            left_lead_char = get_lvalue_lead(self.parent_node.lvalue)
+            if left_id_chain != None:
+              if len(left_id_chain) > 1:
+                descendant.set_refer(True)
+              elif len(left_id_chain) == 1:
+                if left_lead_char == '*':
+                  descendant.set_refer(True)
+                else:
+                  #TODO(angiebird): Check whether the other node is linked to this node.
+                  pass
+              else:
+                self.unknown.append(self.parent_node.lvalue)
+                debug_print(getframeinfo(currentframe()))
+            else:
+              self.unknown.append(self.parent_node.lvalue)
+              debug_print(getframeinfo(currentframe()))
+        else:
+          debug_print(getframeinfo(currentframe()))
+      elif self.parent_node.__class__.__name__ == 'UnaryOp':
+        # TODO(angiebird): Consider +=, *=, -=, /= etc
+        if self.parent_node.op == '--' or self.parent_node.op == '++' or\
+        self.parent_node.op == 'p--' or self.parent_node.op == 'p++':
+          descendant.set_assign(True)
+          descendant.set_refer(True)
+        else:
+          descendant.set_refer(True)
+      elif self.parent_node.__class__.__name__ == 'Decl':
+        #The logic is at visit_Decl
+        pass
+      elif self.parent_node.__class__.__name__ == 'ExprList':
+        #The logic is at visit_FuncCall
+        pass
+      else:
+        descendant.set_refer(True)
+
+  def visit_ID(self, node):
+    # If the parent is a FuncCall, this ID is a function name.
+    if self.parent_node.__class__.__name__ != 'FuncCall':
+      self.process_possible_lvalue(node)
+    self.generic_visit(node)
+
+  def visit_StructRef(self, node):
+    self.process_possible_lvalue(node)
+    self.generic_visit(node)
+
+  def visit_ArrayRef(self, node):
+    self.process_possible_lvalue(node)
+    self.generic_visit(node)
+
+  def visit_UnaryOp(self, node):
+    if node.op == '&' or node.op == '*':
+      self.process_possible_lvalue(node)
+    self.generic_visit(node)
+
+  def visit_FuncCall(self, node):
+    if node.name.__class__.__name__ == 'ID':
+      if node.name.name in self.func_dictionary:
+        if node.name.name not in self.func_history:
+          self.func_history[node.name.name] = True
+          func_def_node = self.func_dictionary[node.name.name]
+          call_param_map = self.process_func_call(node, func_def_node)
+
+          visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                                     self.func_dictionary, False,
+                                     call_param_map, self.global_id_tree,
+                                     self.func_history, self.unknown)
+          visitor.visit(func_def_node.body)
+    else:
+      self.unknown.append(node)
+    self.generic_visit(node)
+
+  def process_func_call(self, func_call_node, func_def_node):
+    # set up a refer/assign for func parameters
+    # return call_param_map
+    call_param_ls = func_call_node.args.exprs
+    call_param_map = {}
+
+    func_decl = func_def_node.decl.type
+    decl_param_ls = func_decl.args.params
+    for param_node, decl_node in zip(call_param_ls, decl_param_ls):
+      id_chain = self.process_lvalue(param_node)
+      if id_chain != None:
+        descendant = self.id_tree_stack.add_id_node(id_chain)
+        if descendant == None:
+          self.unknown.append(param_node)
+        else:
+          decl_status = descendant.get_decl_status()
+          if decl_status != None:
+            if decl_status.struct_item == None:
+              if decl_status.is_ptr_decl == True:
+                descendant.set_assign(True)
+                descendant.set_refer(True)
+              else:
+                descendant.set_refer(True)
+            else:
+              call_param_map[decl_node.name] = descendant
+          else:
+            self.unknown.append(param_node)
+      else:
+        self.unknown.append(param_node)
+    return call_param_map
+
+
+def build_global_id_tree(ast, struct_info):
+  global_id_tree = IDStatusNode()
+  for node in ast.ext:
+    if node.__class__.__name__ == 'Decl':
+      # id tree is for tracking assign/refer status
+      # we don't care about function id because they can't be changed
+      if node.type.__class__.__name__ != 'FuncDecl':
+        decl_status = parse_decl_node(struct_info, node)
+        descendant = global_id_tree.add_child(decl_status.name, decl_status)
+  return global_id_tree
+
+
+class FuncAnalyzer():
+
+  def __init__(self):
+    self.ast = get_av1_ast()
+    self.struct_info = build_struct_info(self.ast)
+    self.func_dictionary = build_func_dictionary(self.ast)
+    self.global_id_tree = build_global_id_tree(self.ast, self.struct_info)
+
+  def analyze(self, func_name):
+    if func_name in self.func_dictionary:
+      func_def_node = self.func_dictionary[func_name]
+      visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                                 self.func_dictionary, True, None,
+                                 self.global_id_tree)
+      visitor.visit(func_def_node.body)
+      root = visitor.get_id_tree_stack()
+      root.top().show()
+    else:
+      print(func_name, "doesn't exist")
+
+
+if __name__ == '__main__':
+  fa = FuncAnalyzer()
+  fa.analyze('tpl_get_satd_cost')
+  pass
diff --git a/tools/auto_refactor/av1_preprocess.py b/tools/auto_refactor/av1_preprocess.py
new file mode 100644
index 0000000..d7afd69
--- /dev/null
+++ b/tools/auto_refactor/av1_preprocess.py
@@ -0,0 +1,96 @@
+import os
+import sys
+
+
+def is_code_file(filename):
+  return filename.endswith(".c") or filename.endswith(".h")
+
+
+def is_simd_file(filename):
+  simd_keywords = [
+      "avx2", "sse2", "sse3", "ssse3", "sse4", "dspr2", "neon", "msa", "simd",
+      "x86"
+  ]
+  for keyword in simd_keywords:
+    if filename.find(keyword) >= 0:
+      return True
+  return False
+
+
+def get_code_file_list(path, exclude_file_set):
+  code_file_list = []
+  for cur_dir, sub_dir, file_list in os.walk(path):
+    for filename in file_list:
+      if is_code_file(filename) and not is_simd_file(
+          filename) and filename not in exclude_file_set:
+        file_path = os.path.join(cur_dir, filename)
+        code_file_list.append(file_path)
+  return code_file_list
+
+
+def av1_exclude_file_set():
+  exclude_file_set = {
+      "cfl_ppc.c",
+      "ppc_cpudetect.c",
+  }
+  return exclude_file_set
+
+
+def get_av1_pp_command(fake_header_dir, code_file_list):
+  pre_command = "gcc -w -nostdinc -E -I./ -I../ -I" + fake_header_dir + (" "
+                                                                         "-D'ATTRIBUTE_PACKED='"
+                                                                         " "
+                                                                         "-D'__attribute__(x)='"
+                                                                         " "
+                                                                         "-D'__inline__='"
+                                                                         " "
+                                                                         "-D'float_t=float'"
+                                                                         " "
+                                                                         "-D'DECLARE_ALIGNED(n,"
+                                                                         " typ,"
+                                                                         " "
+                                                                         "val)=typ"
+                                                                         " val'"
+                                                                         " "
+                                                                         "-D'volatile='"
+                                                                         " "
+                                                                         "-D'AV1_K_MEANS_DIM=2'"
+                                                                         " "
+                                                                         "-D'INLINE='")
+  return pre_command + " " + " ".join(code_file_list)
+
+
+def modify_av1_rtcd(build_dir):
+  av1_rtcd = os.path.join(build_dir, "config/av1_rtcd.h")
+  fp = open(av1_rtcd)
+  string = fp.read()
+  fp.close()
+  new_string = string.replace("#ifdef RTCD_C", "#if 0")
+  fp = open(av1_rtcd, "w")
+  fp.write(new_string)
+  fp.close()
+
+
+def preprocess_av1(aom_dir, build_dir, fake_header_dir):
+  cur_dir = os.getcwd()
+  output = os.path.join(cur_dir, "av1_pp.c")
+  path_list = [
+      os.path.join(aom_dir, "av1/encoder"),
+      os.path.join(aom_dir, "av1/common")
+  ]
+  code_file_list = []
+  for path in path_list:
+    path = os.path.realpath(path)
+    code_file_list.extend(get_code_file_list(path, av1_exclude_file_set()))
+  modify_av1_rtcd(build_dir)
+  cmd = get_av1_pp_command(fake_header_dir, code_file_list) + " >" + output
+  os.chdir(build_dir)
+  os.system(cmd)
+  os.chdir(cur_dir)
+
+
+if __name__ == "__main__":
+  aom_dir = sys.argv[1]
+  build_dir = sys.argv[2]
+  fake_header_dir = sys.argv[3]
+  preprocess_av1(aom_dir, build_dir, fake_header_dir)
diff --git a/tools/auto_refactor/c_files/decl_status_code.c b/tools/auto_refactor/c_files/decl_status_code.c
new file mode 100644
index 0000000..4c7afba
--- /dev/null
+++ b/tools/auto_refactor/c_files/decl_status_code.c
@@ -0,0 +1,20 @@
+typedef struct S1 {
+  int x;
+} T1;
+
+int parse_decl_node_2() { int arr[3]; }
+
+int parse_decl_node_3() { int *a; }
+
+int parse_decl_node_4() { T1 t1[3]; }
+
+int parse_decl_node_5() { T1 *t2[3]; }
+
+int parse_decl_node_6() { T1 t3[3][3]; }
+
+int main() {
+  int a;
+  T1 t1;
+  struct S1 s1;
+  T1 *t2;
+}
diff --git a/tools/auto_refactor/c_files/func_in_out.c b/tools/auto_refactor/c_files/func_in_out.c
new file mode 100644
index 0000000..8c14edc
--- /dev/null
+++ b/tools/auto_refactor/c_files/func_in_out.c
@@ -0,0 +1,197 @@
+typedef struct XD {
+  int u;
+  int v;
+} XD;
+
+typedef struct RD {
+  XD *xd;
+  int u;
+  int v;
+} RD;
+
+typedef struct VP9_COMP {
+  int y;
+  RD *rd;
+  RD rd2;
+  int arr[3];
+  union {
+    int z;
+  };
+  struct {
+    int w;
+  };
+} VP9_COMP;
+
+int sub_func(VP9_COMP *cpi, int b) {
+  int d;
+  cpi->y += 1;
+  cpi->y -= b;
+  d = cpi->y * 2;
+  return d;
+}
+
+int func_id_forrest_show(VP9_COMP *cpi, int b) {
+  int c = 2;
+  int x = cpi->y + c * 2 + 1;
+  int y;
+  RD *rd = cpi->rd;
+  y = cpi->rd->u;
+  return x + y;
+}
+
+int func_link_id_chain_1(VP9_COMP *cpi) {
+  RD *rd = cpi->rd;
+  rd->u = 0;
+}
+
+int func_link_id_chain_2(VP9_COMP *cpi) {
+  RD *rd = cpi->rd;
+  XD *xd = rd->xd;
+  xd->u = 0;
+}
+
+int func_assign_refer_status_1(VP9_COMP *cpi) { RD *rd = cpi->rd; }
+
+int func_assign_refer_status_2(VP9_COMP *cpi) {
+  RD *rd2;
+  rd2 = cpi->rd;
+}
+
+int func_assign_refer_status_3(VP9_COMP *cpi) {
+  int a;
+  a = cpi->y;
+}
+
+int func_assign_refer_status_4(VP9_COMP *cpi) {
+  int *b;
+  b = &cpi->y;
+}
+
+int func_assign_refer_status_5(VP9_COMP *cpi) {
+  RD *rd5;
+  rd5 = &cpi->rd2;
+}
+
+int func_assign_refer_status_6(VP9_COMP *cpi, VP9_COMP *cpi2) {
+  cpi->rd = cpi2->rd;
+}
+
+int func_assign_refer_status_7(VP9_COMP *cpi, VP9_COMP *cpi2) {
+  cpi->arr[3] = 0;
+}
+
+int func_assign_refer_status_8(VP9_COMP *cpi, VP9_COMP *cpi2) {
+  int x = cpi->arr[3];
+}
+
+int func_assign_refer_status_9(VP9_COMP *cpi) {
+  {
+    RD *rd = cpi->rd;
+    { rd->u = 0; }
+  }
+}
+
+int func_assign_refer_status_10(VP9_COMP *cpi) { cpi->arr[cpi->rd->u] = 0; }
+
+int func_assign_refer_status_11(VP9_COMP *cpi) {
+  RD *rd11 = &cpi->rd2;
+  rd11->v = 1;
+}
+
+int func_assign_refer_status_12(VP9_COMP *cpi, VP9_COMP *cpi2) {
+  *cpi->rd = *cpi2->rd;
+}
+
+int func_assign_refer_status_13(VP9_COMP *cpi) {
+  cpi->z = 0;
+  cpi->w = 0;
+}
+
+int func(VP9_COMP *cpi, int x) {
+  int a;
+  cpi->y = 4;
+  a = 3 + cpi->y;
+  a = a * x;
+  cpi->y *= 4;
+  RD *ref_rd = cpi->rd;
+  ref_rd->u = 0;
+  cpi->rd2.v = 1;
+  cpi->rd->v = 1;
+  RD *ref_rd2 = &cpi->rd2;
+  RD **ref_rd3 = &(&cpi->rd2);
+  int b = sub_func(cpi, a);
+  cpi->rd->v++;
+  return b;
+}
+
+int func_sub_call_1(VP9_COMP *cpi2, int x) { cpi2->y = 4; }
+
+int func_call_1(VP9_COMP *cpi, int y) { func_sub_call_1(cpi, y); }
+
+int func_sub_call_2(VP9_COMP *cpi2, RD *rd, int x) { rd->u = 0; }
+
+int func_call_2(VP9_COMP *cpi, int y) { func_sub_call_2(cpi, &cpi->rd, y); }
+
+int func_sub_call_3(VP9_COMP *cpi2, int x) {}
+
+int func_call_3(VP9_COMP *cpi, int y) { func_sub_call_3(cpi, ++cpi->y); }
+
+int func_sub_sub_call_4(VP9_COMP *cpi3, XD *xd) {
+  cpi3->rd.u = 0;
+  xd->u = 0;
+}
+
+int func_sub_call_4(VP9_COMP *cpi2, RD *rd) {
+  func_sub_sub_call_4(cpi2, rd->xd);
+}
+
+int func_call_4(VP9_COMP *cpi, int y) { func_sub_call_4(cpi, &cpi->rd); }
+
+int func_sub_call_5(VP9_COMP *cpi) {
+  cpi->y = 2;
+  func_call_5(cpi);
+}
+
+int func_call_5(VP9_COMP *cpi) { func_sub_call_5(cpi); }
+
+int func_compound_1(VP9_COMP *cpi) {
+  for (int i = 0; i < 10; ++i) {
+    cpi->y++;
+  }
+}
+
+int func_compound_2(VP9_COMP *cpi) {
+  for (int i = 0; i < cpi->y; ++i) {
+    cpi->rd->u = i;
+  }
+}
+
+int func_compound_3(VP9_COMP *cpi) {
+  int i = 3;
+  while (i > 0) {
+    cpi->rd->u = i;
+    i--;
+  }
+}
+
+int func_compound_4(VP9_COMP *cpi) {
+  while (cpi->y-- >= 0) {
+  }
+}
+
+int func_compound_5(VP9_COMP *cpi) {
+  do {
+  } while (cpi->y-- >= 0);
+}
+
+int func_compound_6(VP9_COMP *cpi) {
+  for (int i = 0; i < 10; ++i) cpi->y--;
+}
+
+int main() {
+  int x;
+  VP9_COMP cpi;
+  RD rd;
+  cpi->rd = rd;
+  func(&cpi, x);
+}
diff --git a/tools/auto_refactor/c_files/global_variable.c b/tools/auto_refactor/c_files/global_variable.c
new file mode 100644
index 0000000..1934e20
--- /dev/null
+++ b/tools/auto_refactor/c_files/global_variable.c
@@ -0,0 +1,16 @@
+extern const int global_a[13];
+
+const int global_b = 0;
+
+typedef struct S1 {
+  int x;
+} T1;
+
+struct S3 {
+  int x;
+} s3;
+
+int func_global_1(int *a) {
+  *a = global_a[3];
+  return 0;
+}
diff --git a/tools/auto_refactor/c_files/parse_lvalue.c b/tools/auto_refactor/c_files/parse_lvalue.c
new file mode 100644
index 0000000..093ab55
--- /dev/null
+++ b/tools/auto_refactor/c_files/parse_lvalue.c
@@ -0,0 +1,35 @@
+typedef struct RD {
+  int u;
+  int v;
+  int arr[3];
+} RD;
+
+typedef struct VP9_COMP {
+  int y;
+  RD *rd;
+  RD rd2;
+  RD rd3[2];
+} VP9_COMP;
+
+int parse_lvalue_2(VP9_COMP *cpi) { RD *rd2 = &cpi->rd2; }
+
+int func(VP9_COMP *cpi, int x) {
+  cpi->rd->u = 0;
+
+  int y;
+  y = 0;
+
+  cpi->rd2.v = 0;
+
+  cpi->rd->arr[2] = 0;
+
+  cpi->rd3[1]->arr[2] = 0;
+
+  return 0;
+}
+
+int main() {
+  int x = 0;
+  VP9_COMP cpi;
+  func(&cpi, x);
+}
diff --git a/tools/auto_refactor/c_files/simple_code.c b/tools/auto_refactor/c_files/simple_code.c
new file mode 100644
index 0000000..330fc3a
--- /dev/null
+++ b/tools/auto_refactor/c_files/simple_code.c
@@ -0,0 +1,53 @@
+typedef struct S {
+  int x;
+  int y;
+  int z;
+} S;
+
+typedef struct T {
+  S s;
+} T;
+
+int d(S *s) {
+  ++s->x;
+  s->x--;
+  s->y = s->y + 1;
+  int *c = &s->x;
+  S ss;
+  ss.x = 1;
+  ss.x += 2;
+  ss.z *= 2;
+  return 0;
+}
+int b(S *s) {
+  d(s);
+  return 0;
+}
+int c(int x) {
+  if (x) {
+    c(x - 1);
+  } else {
+    S s;
+    d(&s);
+  }
+  return 0;
+}
+int a(S *s) {
+  b(s);
+  c(1);
+  return 0;
+}
+int e() {
+  c(0);
+  return 0;
+}
+int main() {
+  int p = 3;
+  S s;
+  s.x = p + 1;
+  s.y = 2;
+  s.z = 3;
+  a(&s);
+  T t;
+  t.s.x = 3;
+}
diff --git a/tools/auto_refactor/c_files/struct_code.c b/tools/auto_refactor/c_files/struct_code.c
new file mode 100644
index 0000000..62b9d7a
--- /dev/null
+++ b/tools/auto_refactor/c_files/struct_code.c
@@ -0,0 +1,38 @@
+typedef struct S1 {
+  int x;
+} T1;
+
+struct S3 {
+  int x;
+};
+
+typedef struct {
+  int x;
+  struct S3 s3;
+} T4;
+
+typedef union U5 {
+  int x;
+  double y;
+} T5;
+
+typedef struct S6 {
+  struct {
+    int x;
+  };
+  union {
+    int y;
+    int z;
+  };
+} T6;
+
+typedef struct S7 {
+  struct {
+    int x;
+  } y;
+  union {
+    int w;
+  } z;
+} T7;
+
+int main() {}
diff --git a/tools/auto_refactor/test_auto_refactor.py b/tools/auto_refactor/test_auto_refactor.py
new file mode 100644
index 0000000..836e145
--- /dev/null
+++ b/tools/auto_refactor/test_auto_refactor.py
@@ -0,0 +1,666 @@
+#!/usr/bin/env python
+
+import pprint
+import re
+import os, sys
+import io
+import unittest as googletest
+
+sys.path[0:0] = ['.', '..']
+
+from pycparser import c_parser, parse_file
+from pycparser.c_ast import *
+from pycparser.c_parser import CParser, Coord, ParseError
+
+from auto_refactor import *
+
+
+def get_c_file_path(filename):
+  return os.path.join('c_files', filename)
+
+
+class TestStructInfo(googletest.TestCase):
+
+  def setUp(self):
+    filename = get_c_file_path('struct_code.c')
+    self.ast = parse_file(filename)
+
+  def test_build_struct_info(self):
+    struct_info = build_struct_info(self.ast)
+    typedef_name_dic = struct_info.typedef_name_dic
+    self.assertEqual('T1' in typedef_name_dic, True)
+    self.assertEqual('T4' in typedef_name_dic, True)
+    self.assertEqual('T5' in typedef_name_dic, True)
+
+    struct_name_dic = struct_info.struct_name_dic
+    struct_name = 'S1'
+    self.assertEqual(struct_name in struct_name_dic, True)
+    struct_item = struct_name_dic[struct_name]
+    self.assertEqual(struct_item.is_union, False)
+
+    struct_name = 'S3'
+    self.assertEqual(struct_name in struct_name_dic, True)
+    struct_item = struct_name_dic[struct_name]
+    self.assertEqual(struct_item.is_union, False)
+
+    struct_name = 'U5'
+    self.assertEqual(struct_name in struct_name_dic, True)
+    struct_item = struct_name_dic[struct_name]
+    self.assertEqual(struct_item.is_union, True)
+
+    self.assertEqual(len(struct_info.struct_item_list), 6)
+
+  def test_get_child_decl_status(self):
+    struct_info = build_struct_info(self.ast)
+    struct_item = struct_info.typedef_name_dic['T4']
+
+    decl_status = struct_item.child_decl_map['x']
+    self.assertEqual(decl_status.struct_item, None)
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = struct_item.child_decl_map['s3']
+    self.assertEqual(decl_status.struct_item.struct_name, 'S3')
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    struct_item = struct_info.typedef_name_dic['T6']
+    decl_status = struct_item.child_decl_map['x']
+    self.assertEqual(decl_status.struct_item, None)
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = struct_item.child_decl_map['y']
+    self.assertEqual(decl_status.struct_item, None)
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = struct_item.child_decl_map['z']
+    self.assertEqual(decl_status.struct_item, None)
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    struct_item = struct_info.typedef_name_dic['T7']
+    decl_status = struct_item.child_decl_map['y']
+    self.assertEqual('x' in decl_status.struct_item.child_decl_map, True)
+
+    struct_item = struct_info.typedef_name_dic['T7']
+    decl_status = struct_item.child_decl_map['z']
+    self.assertEqual('w' in decl_status.struct_item.child_decl_map, True)
+
+
+class TestParseLvalue(googletest.TestCase):
+
+  def setUp(self):
+    filename = get_c_file_path('parse_lvalue.c')
+    self.ast = parse_file(filename)
+    self.func_dictionary = build_func_dictionary(self.ast)
+
+  def test_parse_lvalue(self):
+    func_node = self.func_dictionary['func']
+    func_body_items = func_node.body.block_items
+    id_list = parse_lvalue(func_body_items[0].lvalue)
+    ref_id_list = ['cpi', 'rd', 'u']
+    self.assertEqual(id_list, ref_id_list)
+
+    id_list = parse_lvalue(func_body_items[2].lvalue)
+    ref_id_list = ['y']
+    self.assertEqual(id_list, ref_id_list)
+
+    id_list = parse_lvalue(func_body_items[3].lvalue)
+    ref_id_list = ['cpi', 'rd2', 'v']
+    self.assertEqual(id_list, ref_id_list)
+
+    id_list = parse_lvalue(func_body_items[4].lvalue)
+    ref_id_list = ['cpi', 'rd', 'arr']
+    self.assertEqual(id_list, ref_id_list)
+
+    id_list = parse_lvalue(func_body_items[5].lvalue)
+    ref_id_list = ['cpi', 'rd3', 'arr']
+    self.assertEqual(id_list, ref_id_list)
+
+  def test_parse_lvalue_2(self):
+    func_node = self.func_dictionary['parse_lvalue_2']
+    func_body_items = func_node.body.block_items
+    id_list = parse_lvalue(func_body_items[0].init)
+    ref_id_list = ['cpi', 'rd2']
+    self.assertEqual(id_list, ref_id_list)
+
+
+class TestIDStatusNode(googletest.TestCase):
+
+  def test_add_descendant(self):
+    root = IDStatusNode('root')
+    id_chain1 = ['cpi', 'rd', 'u']
+    id_chain2 = ['cpi', 'rd', 'v']
+    root.add_descendant(id_chain1)
+    root.add_descendant(id_chain2)
+
+    ref_children_list1 = ['cpi']
+    children_list1 = list(root.children.keys())
+    self.assertEqual(children_list1, ref_children_list1)
+
+    ref_children_list2 = ['rd']
+    children_list2 = list(root.children['cpi'].children.keys())
+    self.assertEqual(children_list2, ref_children_list2)
+
+    ref_children_list3 = ['u', 'v']
+    children_list3 = list(root.children['cpi'].children['rd'].children.keys())
+    self.assertEqual(children_list3, ref_children_list3)
+
+  def test_get_descendant(self):
+    root = IDStatusNode('root')
+    id_chain1 = ['cpi', 'rd', 'u']
+    id_chain2 = ['cpi', 'rd', 'v']
+    ref_descendant_1 = root.add_descendant(id_chain1)
+    ref_descendant_2 = root.add_descendant(id_chain2)
+
+    descendant_1 = root.get_descendant(id_chain1)
+    self.assertEqual(descendant_1 is ref_descendant_1, True)
+
+    descendant_2 = root.get_descendant(id_chain2)
+    self.assertEqual(descendant_2 is ref_descendant_2, True)
+
+    id_chain3 = ['cpi', 'rd', 'h']
+    descendant_3 = root.get_descendant(id_chain3)
+    self.assertEqual(descendant_3, None)
+
+
+class TestFuncInOut(googletest.TestCase):
+
+  def setUp(self):
+    c_filename = get_c_file_path('func_in_out.c')
+    self.ast = parse_file(c_filename)
+    self.func_dictionary = build_func_dictionary(self.ast)
+    self.struct_info = build_struct_info(self.ast)
+
+  def test_get_func_param_id_map(self):
+    func_def_node = self.func_dictionary['func']
+    param_id_map = get_func_param_id_map(func_def_node)
+    ref_param_id_map_keys = ['cpi', 'x']
+    self.assertEqual(list(param_id_map.keys()), ref_param_id_map_keys)
+
+  def test_assign_refer_status_1(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_1']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['rd']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+    ref_link_id_chain = ['cpi', 'rd']
+    self.assertEqual(ref_link_id_chain, descendant.get_link_id_chain())
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_2(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_2']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['rd2']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+
+    ref_link_id_chain = ['cpi', 'rd']
+    self.assertEqual(ref_link_id_chain, descendant.get_link_id_chain())
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_3(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_3']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['a']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_4(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_4']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['b']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_5(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_5']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['rd5']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi', 'rd2']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_6(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_6']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+    id_chain = ['cpi2', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_7(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_7']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'arr']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_assign_refer_status_8(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_8']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'arr']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_assign_refer_status_9(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_9']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_assign_refer_status_10(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_10']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+    id_chain = ['cpi', 'arr']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_assign_refer_status_11(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_11']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd2', 'v']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_assign_refer_status_12(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_12']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi2', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_assign_refer_status_13(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_13']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'z']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi', 'w']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_id_status_forrest_1(self):
+    func_def_node = self.func_dictionary['func']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack().top()
+    children_names = set(root.get_children().keys())
+    ref_children_names = set(['cpi', 'x'])
+    self.assertEqual(children_names, ref_children_names)
+
+    root = visitor.body_id_tree
+    children_names = set(root.get_children().keys())
+    ref_children_names = set(['a', 'ref_rd', 'ref_rd2', 'ref_rd3', 'b'])
+    self.assertEqual(children_names, ref_children_names)
+
+  def test_id_status_forrest_show(self):
+    func_def_node = self.func_dictionary['func_id_forrest_show']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    visitor.get_id_tree_stack().top().show()
+
+  def test_id_status_forrest_2(self):
+    func_def_node = self.func_dictionary['func_id_forrest_show']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack().top()
+    self.assertEqual(root, root.root)
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_descendant(id_chain)
+    self.assertEqual(root, descendant.root)
+
+    id_chain = ['b']
+    descendant = root.get_descendant(id_chain)
+    self.assertEqual(root, descendant.root)
+
+  def test_link_id_chain_1(self):
+    func_def_node = self.func_dictionary['func_link_id_chain_1']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+
+  def test_link_id_chain_2(self):
+    func_def_node = self.func_dictionary['func_link_id_chain_2']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'xd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+
+  def test_func_call_1(self):
+    func_def_node = self.func_dictionary['func_call_1']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_call_2(self):
+    func_def_node = self.func_dictionary['func_call_2']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_call_3(self):
+    func_def_node = self.func_dictionary['func_call_3']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_call_4(self):
+    func_def_node = self.func_dictionary['func_call_4']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi', 'rd', 'xd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_call_5(self):
+    func_def_node = self.func_dictionary['func_call_5']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_compound_1(self):
+    func_def_node = self.func_dictionary['func_compound_1']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_compound_2(self):
+    func_def_node = self.func_dictionary['func_compound_2']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_compound_3(self):
+    func_def_node = self.func_dictionary['func_compound_3']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_compound_4(self):
+    func_def_node = self.func_dictionary['func_compound_4']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_compound_5(self):
+    func_def_node = self.func_dictionary['func_compound_5']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_compound_6(self):
+    func_def_node = self.func_dictionary['func_compound_6']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+
+class TestDeclStatus(googletest.TestCase):
+
+  def setUp(self):
+    filename = get_c_file_path('decl_status_code.c')
+    self.ast = parse_file(filename)
+    self.func_dictionary = build_func_dictionary(self.ast)
+    self.struct_info = build_struct_info(self.ast)
+
+  def test_parse_decl_node(self):
+    func_def_node = self.func_dictionary['main']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 'a')
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = parse_decl_node(self.struct_info, decl_list[1])
+    self.assertEqual(decl_status.name, 't1')
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = parse_decl_node(self.struct_info, decl_list[2])
+    self.assertEqual(decl_status.name, 's1')
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = parse_decl_node(self.struct_info, decl_list[3])
+    self.assertEqual(decl_status.name, 't2')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+
+  def test_parse_decl_node_2(self):
+    func_def_node = self.func_dictionary['parse_decl_node_2']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 'arr')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item, None)
+
+  def test_parse_decl_node_3(self):
+    func_def_node = self.func_dictionary['parse_decl_node_3']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 'a')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item, None)
+
+  def test_parse_decl_node_4(self):
+    func_def_node = self.func_dictionary['parse_decl_node_4']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 't1')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+    self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+  def test_parse_decl_node_5(self):
+    func_def_node = self.func_dictionary['parse_decl_node_5']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 't2')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+    self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+  def test_parse_decl_node_6(self):
+    func_def_node = self.func_dictionary['parse_decl_node_6']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 't3')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+    self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+
+if __name__ == '__main__':
+  googletest.main()